In [1]:
import numpy as np
from sklearn.preprocessing import LabelEncoder
import scipy.io.wavfile

import keras
from keras.utils import np_utils

Using TensorFlow backend.


In [2]:
class VCBSequence(keras.utils.Sequence):
    """Keras ``Sequence`` yielding batches of randomly cropped wav clips.

    The split file is read line by line; each line is expected to look like
    ``"<partition-id> <name>/<rest-of-relative-path>"`` where the partition id
    is 1 (train), 2 (val) or 3 (test). The first path component is used as the
    class label. Labels are encoded over *all* partitions so that every
    partition shares the same one-hot layout.

    Each call to ``__getitem__`` draws fresh random crops, so the same index
    does not return the same audio twice.
    """

    def __init__(self, partition, crop_length=3, batch_size=512,
                 shuffle=True, random_seed=42,
                 sr=16000,
                 vcb_root="/home/ubuntu/data/voxceleb1_wav",
                 split_txt="/home/ubuntu/data/id-split.txt"):
        """
        partition: one of "train", "val", "test"
        crop_length: length to crop in seconds (int)
        batch_size: maximum number of clips per batch
        shuffle: shuffle the split-file lines once, at construction time
        random_seed: seed used for that shuffle
        sr: sampling rate every wav file must have
        vcb_root: directory the relative wav paths are resolved against
        split_txt: path of the split file described in the class docstring

        Raises KeyError if `partition` is not one of the three known names.
        """
        partition_map = {
            'train': 1,
            'val': 2,
            'test': 3
        }
        self.partition = partition_map[partition]

        self.sr = sr
        self.crop_length = crop_length
        self.batch_size = batch_size
        self.vcb_root = vcb_root
        self.split_txt = split_txt

        with open(self.split_txt) as f:
            # Skip blank lines (e.g. a trailing newline at EOF); they would
            # otherwise crash the int() conversion below.
            lines = [line.rstrip() for line in f if line.strip()]
        if shuffle:
            # NOTE: this reseeds numpy's *global* RNG, which also drives the
            # random crops. Kept as-is so existing runs stay reproducible.
            np.random.seed(random_seed)
            np.random.shuffle(lines)

        # (partition id, relative wav path) for every line of the split file.
        entries = []
        for line in lines:
            fields = line.split(" ")
            entries.append((int(fields[0]), fields[1]))

        # The label is the first path component.
        self.all_names = [rel_path.split("/")[0] for _, rel_path in entries]
        self.encoder = LabelEncoder()
        self.encoder.fit(self.all_names)
        labels = self.encoder.transform(self.all_names)

        # Indices of the entries that belong to the requested partition.
        keep = [i for i, (part, _) in enumerate(entries)
                if part == self.partition]

        # One-hot encode only the kept rows. Passing num_classes keeps the
        # column layout identical to encoding every row and slicing afterwards,
        # without materialising the full matrix for all partitions.
        self.classes = np_utils.to_categorical(
            labels[keep], num_classes=len(self.encoder.classes_))
        self.paths = [entries[i][1] for i in keep]

    def __len__(self):
        """Number of batches, counting a final partial batch if there is one."""
        n_items = len(self.paths)
        return (n_items + self.batch_size - 1) // self.batch_size

    def get_random_cropped_wavfile(self, path):
        """Read `path` (relative to `vcb_root`) and return a random crop.

        The crop is `crop_length * sr` consecutive samples starting at a
        uniformly drawn offset (numpy global RNG).

        Raises AssertionError if the file's sampling rate differs from `sr`,
        and ValueError if the file is shorter than the requested crop.
        """
        rate, data = scipy.io.wavfile.read(self.vcb_root + "/" + path)
        assert rate == self.sr, \
            "%s has sampling rate %d, expected %d" % (path, rate, self.sr)

        n_crop = self.crop_length * self.sr
        last_start = len(data) - n_crop
        if last_start < 0:
            raise ValueError(
                "%s has %d samples, fewer than the %d needed for a crop"
                % (path, len(data), n_crop))

        # randint's upper bound is exclusive; +1 makes the final valid offset
        # reachable and lets clips of exactly n_crop samples through.
        crop_start = np.random.randint(last_start + 1)
        return data[crop_start:crop_start + n_crop]

    def __getitem__(self, idx):
        """Return batch `idx` as `(x, y)`.

        x has shape (n, sr * crop_length) and y holds the matching one-hot
        rows, where n is `batch_size` except possibly for the last batch.
        """
        start = idx * self.batch_size
        stop = start + self.batch_size

        batch_paths = self.paths[start:stop]
        x = np.zeros((len(batch_paths), self.sr * self.crop_length))
        for row, rel_path in enumerate(batch_paths):
            x[row] = self.get_random_cropped_wavfile(rel_path)

        # Slicing clamps at the end of the array, so the last batch is safe.
        y = self.classes[start:stop]

        return x, y

In [3]:
import keras
from keras.models import Sequential
from keras.layers import LSTM, Dense
from keras.callbacks import ModelCheckpoint
from utils.custom_callbacks import TensorBoard

import kapre
from kapre.time_frequency import Melspectrogram


# ---- Hyper-parameters -------------------------------------------------------
SR = 16000 # sampling rate
CROP_LENGTH = 2  # seconds of audio per training example
BATCH_SIZE = 512
HIDDEN_DIM = 256  # LSTM units

# Mel-spectrogram front end
N_MELS = 128
N_FFT = 512
N_HOP = 128

# ---- Data -------------------------------------------------------------------
train_gen = VCBSequence('train', sr=SR, crop_length=CROP_LENGTH, batch_size=BATCH_SIZE)
# A single oversized batch: validation is passed to Keras as one (x, y) tuple.
val_data = VCBSequence('val', sr=SR, crop_length=CROP_LENGTH, batch_size=10000)[0]
test_gen = VCBSequence('test', sr=SR, crop_length=CROP_LENGTH, batch_size=BATCH_SIZE)

# The label encoder is fitted on every partition, so any generator knows the
# full class list; reuse test_gen instead of parsing the split file again.
NUM_PEOPLE = len(test_gen.encoder.classes_)

# ---- Model ------------------------------------------------------------------
model = Sequential()

# Raw waveform -> (channel, samples), the layout the Melspectrogram layer is fed.
model.add(keras.layers.Reshape((1, CROP_LENGTH * SR), input_shape=(CROP_LENGTH * SR,)))

model.add(Melspectrogram(sr=SR, n_mels=N_MELS, 
      n_dft=N_FFT, n_hop=N_HOP, 
      return_decibel_melgram=True, power_melgram=2.0,
      trainable_kernel=False, name='melgram'))

model.add(kapre.utils.Normalization2D(int_axis=0))

# Drop the trailing singleton channel axis. -1 lets Keras infer the number of
# time frames (250 for 2 s at hop 128) so the model still builds when
# CROP_LENGTH, SR or N_HOP change.
model.add(keras.layers.Reshape((N_MELS, -1)))
# (mels, frames) -> (frames, mels): the LSTM iterates over time.
model.add(keras.layers.Permute((2, 1)))
model.add(LSTM(HIDDEN_DIM, return_sequences=True))
# Average the per-frame LSTM outputs into one vector per clip.
model.add(keras.layers.GlobalAveragePooling1D())

model.add(Dense(NUM_PEOPLE, activation='softmax'))

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
reshape_1 (Reshape)          (None, 1, 32000)          0         
_________________________________________________________________
melgram (Melspectrogram)     (None, 128, 250, 1)       296064    
_________________________________________________________________
normalization2d_1 (Normaliza (None, 128, 250, 1)       0         
_________________________________________________________________
reshape_2 (Reshape)          (None, 128, 250)          0         
_________________________________________________________________
permute_1 (Permute)          (None, 250, 128)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 250, 256)          394240    
_________________________________________________________________
global_average_pooling1d_1 ( (None, 256)               0         
__________

In [4]:
# Sanity check: shapes of the validation inputs and of their one-hot labels.
print(*(part.shape for part in val_data))

(6904, 32000) (6904, 1251)


In [5]:
LEARNING_RATE = 1e-3
# The optimizer is Adam; the variable was previously (misleadingly) named `rmsprop`.
adam = keras.optimizers.Adam(lr=LEARNING_RATE)

model.compile(optimizer=adam,
              loss='categorical_crossentropy', # loss function to be optimized
              metrics=['accuracy']) # doesn't affect optimization

In [6]:
# Write a checkpoint whenever validation accuracy reaches a new maximum; the
# epoch number and the accuracy are substituted into the file name.
checkpoint_filepath = "/home/ubuntu/models/2second-crop-{epoch:02d}-{val_acc:.2f}.hdf5"
checkpoint = ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor='val_acc',
    mode='max',
    save_best_only=True,
    verbose=1,
)

# Project-local TensorBoard callback (utils.custom_callbacks).
tensorboard = TensorBoard(
    log_dir="/home/ubuntu/tflogs/cur",
    batch_size=BATCH_SIZE,
    histogram_freq=1,
    write_grads=True,
    write_batch_performance=True,
)

In [7]:
# Train for 10 epochs from the generator, validating on the fixed (x, y) tuple.
model.fit_generator(
    generator=train_gen,
    epochs=10,
    validation_data=val_data,
    callbacks=[checkpoint, tensorboard],
    workers=3,
)

INFO:tensorflow:Summary name melgram/real_kernels:0 is illegal; using melgram/real_kernels_0 instead.
INFO:tensorflow:Summary name melgram/real_kernels:0_grad is illegal; using melgram/real_kernels_0_grad instead.
INFO:tensorflow:Summary name melgram/imag_kernels:0 is illegal; using melgram/imag_kernels_0 instead.
INFO:tensorflow:Summary name melgram/imag_kernels:0_grad is illegal; using melgram/imag_kernels_0_grad instead.
INFO:tensorflow:Summary name melgram/Variable:0 is illegal; using melgram/Variable_0 instead.
INFO:tensorflow:Summary name melgram/Variable:0_grad is illegal; using melgram/Variable_0_grad instead.
INFO:tensorflow:Summary name lstm_1/kernel:0 is illegal; using lstm_1/kernel_0 instead.
INFO:tensorflow:Summary name lstm_1/kernel:0_grad is illegal; using lstm_1/kernel_0_grad instead.
INFO:tensorflow:Summary name lstm_1/recurrent_kernel:0 is illegal; using lstm_1/recurrent_kernel_0 instead.
INFO:tensorflow:Summary name lstm_1/recurrent_kernel:0_grad is illegal; using ls

KeyboardInterrupt: 

In [None]:
# Loss and accuracy on the training partition (crops are re-drawn at random).
model.evaluate_generator(generator=train_gen)

In [None]:
# Loss and accuracy on the validation partition. No `val_gen` is ever created
# in this notebook; the validation split is held in memory as the (x, y) tuple
# `val_data`, so evaluate on those arrays directly.
model.evaluate(val_data[0], val_data[1], batch_size=BATCH_SIZE)

In [None]:
# Loss and accuracy on the held-out test partition.
model.evaluate_generator(generator=test_gen)

In [None]:
# Scratch check: indexing a 2-D array with a list of row numbers selects those rows.
np.arange(10).reshape(5, 2)[[0, 2, 4], :]