In [2]:
import numpy as np
from sklearn.preprocessing import LabelEncoder
import scipy.io.wavfile

import keras
from keras.utils import np_utils

In [3]:
class VCBSequence(keras.utils.Sequence):
    """Keras ``Sequence`` producing batches of randomly cropped waveforms
    together with one-hot speaker labels.

    The split file is read line by line; each line is split on single spaces,
    the first field being an integer partition id (1 = train, 2 = val,
    3 = test) and the second a wav path relative to ``vcb_root``. The first
    path component is used as the speaker name. The label encoder is fit on
    the speakers of *all* partitions, so class indices are consistent across
    the train/val/test sequences.
    """

    def __init__(self, partition, crop_length=3, batch_size=512,
                 shuffle=True, random_seed=42,
                 sr=16000,
                 vcb_root="/home/ubuntu/data/voxceleb1_wav",
                 split_txt="/home/ubuntu/data/id-split.txt"):
        """
        partition: one of "train", "val", "test"
        crop_length: length to crop in seconds
        batch_size: number of utterances per batch
        shuffle: if True, shuffle the split-file lines once at construction
        random_seed: seed used for that shuffle (only applied when shuffle=True)
        sr: sampling rate every wav file is required to have
        vcb_root: directory the paths in the split file are relative to
        split_txt: path of the split file

        Raises KeyError if `partition` is not one of the three known names.
        """
        partition_map = {
            'train': 1,
            'val': 2,
            'test': 3
        }
        self.partition = partition_map[partition]

        self.sr = sr
        self.crop_length = crop_length
        self.batch_size = batch_size
        self.vcb_root = vcb_root
        self.split_txt = split_txt

        with open(self.split_txt) as f:
            # Skip blank lines (e.g. a trailing newline at end of file), which
            # would otherwise fail in the int() conversion below.
            lines = [line.rstrip() for line in f if line.strip()]
        if shuffle:
            # NOTE: this reseeds NumPy's *global* RNG, so the random crops
            # drawn after construction are affected as well.
            np.random.seed(random_seed)
            np.random.shuffle(lines)

        entries = []
        for line in lines:
            fields = line.split(" ")
            entries.append((int(fields[0]), fields[1]))

        self.all_names = [path.split("/")[0] for _, path in entries]
        self.encoder = LabelEncoder()
        self.encoder.fit(self.all_names)

        # Positions of the entries that belong to the requested partition.
        keep = [i for i, (part, _) in enumerate(entries) if part == self.partition]
        self.paths = [entries[i][1] for i in keep]

        # One-hot encode only the kept rows; passing num_classes keeps the
        # label width equal to the total number of speakers even when this
        # partition does not contain the highest class index.
        labels = self.encoder.transform(self.all_names)[keep]
        self.classes = np_utils.to_categorical(
            labels, num_classes=len(self.encoder.classes_))

    def _crop_samples(self):
        """Number of samples in one crop (crop_length seconds at self.sr)."""
        return int(round(self.crop_length * self.sr))

    def __len__(self):
        """Number of batches, counting a final partial batch."""
        n_items = len(self.paths)
        # Ceiling division: a remainder of n_items / batch_size still needs a batch.
        return (n_items + self.batch_size - 1) // self.batch_size

    def get_random_cropped_wavfile(self, path):
        """Read `path` (relative to vcb_root) and return a random crop of it.

        The crop is `crop_length` seconds long and its start is drawn
        uniformly from every position at which a full crop fits.

        Raises ValueError if the file is shorter than one crop.
        """
        rate, data = scipy.io.wavfile.read(self.vcb_root + "/" + path)
        assert rate == self.sr, \
            "expected sampling rate %s but %s has %s" % (self.sr, path, rate)
        n_samples = self._crop_samples()
        last_start = len(data) - n_samples
        if last_start < 0:
            raise ValueError(
                "%s has %d samples, fewer than the %d needed for a crop"
                % (path, len(data), n_samples))
        # randint's upper bound is exclusive; +1 makes the last valid start
        # reachable and handles files that are exactly one crop long.
        crop_start = np.random.randint(last_start + 1)
        return data[crop_start:crop_start + n_samples]

    def __getitem__(self, idx):
        """Return batch `idx` as (x, y).

        x has shape (batch, crop samples) and y is the matching slice of the
        one-hot label matrix. The last batch may hold fewer than batch_size rows.
        """
        start = idx * self.batch_size
        stop = start + self.batch_size
        batch_paths = self.paths[start:stop]

        x = np.zeros((len(batch_paths), self._crop_samples()))
        for row, wav_path in enumerate(batch_paths):
            x[row] = self.get_random_cropped_wavfile(wav_path)

        # Slicing clamps at the end of the array, so no explicit min() is needed.
        y = self.classes[start:stop]

        return x, y

In [31]:
import keras
from keras.models import Sequential
from keras.layers import LSTM, Dense, Conv2D, MaxPooling2D, AveragePooling2D

import kapre
from kapre.time_frequency import Melspectrogram


# --- configuration ---------------------------------------------------------
HIDDEN_DIM = 256  # not referenced in this cell; kept in case other cells use it

SR = 16000 # sampling rate
CROP_LENGTH = 2  # crop length in seconds fed to the network
BATCH_SIZE = 512

# Mel-spectrogram front-end parameters (passed to the kapre layer below).
N_MELS = 128
N_FFT = 512
N_HOP = 128

# --- data ------------------------------------------------------------------
# These were commented out, but the cells further down (shape print, fit,
# evaluate) need train_gen / val_data / test_gen, so a fresh kernel failed with
# NameError. The validation set is materialised once as a single large batch.
train_gen = VCBSequence('train', sr=SR, crop_length=CROP_LENGTH, batch_size=BATCH_SIZE)
val_data = VCBSequence('val', sr=SR, crop_length=CROP_LENGTH, batch_size=10000)[0]
test_gen = VCBSequence('test', sr=SR, crop_length=CROP_LENGTH, batch_size=BATCH_SIZE)

# The label encoder is fit on every partition, so any sequence reports the
# full speaker count; reuse test_gen instead of parsing the split file again.
NUM_PEOPLE = len(test_gen.encoder.classes_)

# --- model -----------------------------------------------------------------
model = Sequential()

# Raw waveform (samples,) -> (1, samples), the layout the melgram layer is fed.
model.add(keras.layers.Reshape((1, CROP_LENGTH * SR), input_shape=(CROP_LENGTH * SR,)))

model.add(Melspectrogram(sr=SR, n_mels=N_MELS, 
      n_dft=N_FFT, n_hop=N_HOP, 
      return_decibel_melgram=True, power_melgram=2.0,
      trainable_kernel=False, name='melgram'))
model.add(kapre.utils.Normalization2D(int_axis=2))

# Convolutional trunk.
model.add(Conv2D(96, 7, strides=(2,2), activation='relu'))
model.add(MaxPooling2D(pool_size=(3,3), strides=(2,2)))
model.add(Conv2D(256, 5, strides=(2,2), activation='relu'))
model.add(MaxPooling2D(pool_size=(3,3), strides=(2,2)))
model.add(Conv2D(256, 3, activation='relu', padding='same'))
model.add(Conv2D(256, 3, activation='relu', padding='same'))
model.add(Conv2D(256, 3, activation='relu', padding='same'))
model.add(MaxPooling2D(pool_size=(3,3), strides=(1,1)))

# Collapse the remaining spatial dimensions down to 1x1 so the Reshape below
# is valid. NOTE(review): the (4,1) kernel and the pool width of 11 are tied to
# CROP_LENGTH = 2 (feature map expected to be 4 x 11 here); changing the crop
# length requires adjusting them.
model.add(Conv2D(4096, (4,1)))
model.add(AveragePooling2D(pool_size=(1,11)))
model.add(keras.layers.Reshape((4096,)))

# Classifier head: one softmax output per speaker.
model.add(Dense(1024, activation='relu'))
model.add(Dense(NUM_PEOPLE, activation='softmax'))

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
reshape_29 (Reshape)         (None, 1, 32000)          0         
_________________________________________________________________
melgram (Melspectrogram)     (None, 128, 250, 1)       296064    
_________________________________________________________________
normalization2d_23 (Normaliz (None, 128, 250, 1)       0         
_________________________________________________________________
conv2d_98 (Conv2D)           (None, 61, 122, 96)       4800      
_________________________________________________________________
max_pooling2d_53 (MaxPooling (None, 30, 60, 96)        0         
_________________________________________________________________
conv2d_99 (Conv2D)           (None, 13, 28, 256)       614656    
_________________________________________________________________
max_pooling2d_54 (MaxPooling (None, 6, 13, 256)        0         
__________

In [4]:
# Sanity check: shapes of the validation inputs and of their one-hot labels.
print(*(val_data[i].shape for i in (0, 1)))

(6904, 32000) (6904, 1251)


In [6]:
# Restore a saved checkpoint. The kapre layers are not built into Keras, so
# they are passed via custom_objects for deserialisation.
model = keras.models.load_model(
    "/home/ubuntu/models/save/2second-crop-05-0.28.hdf5",
    custom_objects={
        "Melspectrogram": Melspectrogram,
        "Normalization2D": kapre.utils.Normalization2D,
    },
)

In [None]:
LEARNING_RATE = 1e-3

# The optimizer is Adam; it used to be bound to a variable called `rmsprop`,
# which misdescribed it.
optimizer = keras.optimizers.Adam(lr=LEARNING_RATE)
rmsprop = optimizer  # backward-compatible alias for cells that still use the old name

model.compile(optimizer=optimizer,
              loss='categorical_crossentropy', # loss function to be optimized
              metrics=['accuracy']) # doesn't affect optimization

In [7]:
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from utils.custom_callbacks import TensorBoard

# Save a checkpoint whenever validation accuracy reaches a new maximum; the
# epoch number and the accuracy are formatted into the file name.
checkpoint_filepath = "/home/ubuntu/models/2second-crop-{epoch:02d}-{val_acc:.2f}.hdf5"
checkpoint = ModelCheckpoint(
    checkpoint_filepath,
    monitor='val_acc',
    mode='max',
    save_best_only=True,
    verbose=1,
)

# Project-local TensorBoard callback (imported from utils.custom_callbacks).
tensorboard = TensorBoard(
    log_dir="/home/ubuntu/tflogs/cur",
    histogram_freq=1,
    batch_size=BATCH_SIZE,
    write_graph=False,
    write_grads=True,
    write_batch_performance=True,
)

# Multiply the learning rate by 0.2 after 5 epochs without improvement in
# validation loss, never going below 1e-5.
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.2,
    patience=5,
    min_lr=1e-5,
)

In [None]:
# Train from the batch generator, validating against the in-memory validation
# arrays and running the checkpoint / TensorBoard / LR-schedule callbacks.
model.fit_generator(
    train_gen,
    epochs=30,
    validation_data=val_data,
    workers=3,
    callbacks=[checkpoint, tensorboard, reduce_lr],
)

INFO:tensorflow:Summary name melgram_1/real_kernels:0 is illegal; using melgram_1/real_kernels_0 instead.
INFO:tensorflow:Summary name melgram_1/real_kernels:0_grad is illegal; using melgram_1/real_kernels_0_grad instead.
INFO:tensorflow:Summary name melgram_1/imag_kernels:0 is illegal; using melgram_1/imag_kernels_0 instead.
INFO:tensorflow:Summary name melgram_1/imag_kernels:0_grad is illegal; using melgram_1/imag_kernels_0_grad instead.
INFO:tensorflow:Summary name melgram_1/Variable:0 is illegal; using melgram_1/Variable_0 instead.
INFO:tensorflow:Summary name melgram_1/Variable:0_grad is illegal; using melgram_1/Variable_0_grad instead.
INFO:tensorflow:Summary name lstm_1_1/kernel:0 is illegal; using lstm_1_1/kernel_0 instead.
INFO:tensorflow:Summary name lstm_1_1/kernel:0_grad is illegal; using lstm_1_1/kernel_0_grad instead.
INFO:tensorflow:Summary name lstm_1_1/recurrent_kernel:0 is illegal; using lstm_1_1/recurrent_kernel_0 instead.
INFO:tensorflow:Summary name lstm_1_1/recurr

In [None]:
# Metrics on the training partition (each item is re-cropped at random).
model.evaluate_generator(generator=train_gen)

In [None]:
# `val_gen` was never defined anywhere in the notebook, so this cell raised
# NameError. Build the validation generator here from the same settings the
# other partitions use.
val_gen = VCBSequence('val', sr=SR, crop_length=CROP_LENGTH, batch_size=BATCH_SIZE)
model.evaluate_generator(val_gen)

In [None]:
# Metrics on the held-out test partition.
model.evaluate_generator(generator=test_gen)

In [None]:
# Scratch check: indexing a 2-D array with a list of row indices selects
# exactly those rows (here rows 0, 2 and 4 of a 5x2 array).
np.arange(10).reshape((5, 2))[[0, 2, 4]]