In [1]:
import numpy as np
from sklearn.preprocessing import LabelEncoder
import scipy.io.wavfile

from keras.utils import np_utils

Using TensorFlow backend.


In [2]:
def get_label_encoder(vcb_root="/home/ubuntu/data/voxceleb1_wav",
                      split_txt="/home/ubuntu/data/id-split.txt"):
    """Fit a LabelEncoder on the speaker names listed in the split file.

    Every non-blank line of ``split_txt`` is split on single spaces; the
    second field is treated as a ``<speaker>/<rest>`` relative path and the
    part before the first "/" is used as the speaker name.

    Args:
        vcb_root: Unused by this function; kept so existing callers that
            pass it keep working.
        split_txt: Path to the split file.

    Returns:
        A LabelEncoder fitted on the speaker names (duplicates included,
        in file order).

    Raises:
        IndexError: if a non-blank line has no second space-separated field.
    """
    with open(split_txt) as f:
        # Skip blank lines (e.g. a trailing empty line) instead of failing
        # on the missing second field.
        names = [line.rstrip().split(" ")[1].split("/")[0]
                 for line in f
                 if line.strip()]

    encoder = LabelEncoder()
    encoder.fit(names)
    return encoder

# Shared encoder built once from the default split file; get_data() uses it to
# encode speaker names and the model cell reads ENCODER.classes_ for the output size.
ENCODER = get_label_encoder()

In [3]:
def get_random_cropped_wavfile(path, crop_length=3):
    """Read a WAV file and return a randomly positioned crop of fixed duration.

    Args:
        path: Path to the WAV file.
        crop_length: Crop duration in seconds. The sample count is
            ``int(crop_length * rate)`` where ``rate`` is read from the file.

    Returns:
        A slice of the samples returned by ``scipy.io.wavfile.read`` that is
        ``int(crop_length * rate)`` samples long along the first axis.

    Raises:
        ValueError: if the file holds fewer samples than the requested crop.
    """
    rate, data = scipy.io.wavfile.read(path)
    crop_samples = int(crop_length * rate)
    last_start = len(data) - crop_samples
    if last_start < 0:
        raise ValueError(
            "%s has %d samples, fewer than the %d needed for a %s second crop"
            % (path, len(data), crop_samples, crop_length))
    # randint's upper bound is exclusive: +1 makes the final window reachable
    # and lets a file that is exactly crop-sized be returned whole.
    crop_start = np.random.randint(last_start + 1)
    return data[crop_start : crop_start + crop_samples]

def get_data(partition,
             chunk_size=70000,
             vcb_root="/home/ubuntu/data/voxceleb1_wav",
             split_txt="/home/ubuntu/data/id-split.txt",
             prefix=None,
             crop_length=3,
             sample_rate=16000):
    """Load every clip of one partition into memory with one-hot speaker labels.

    The split file has one ``<partition id> <speaker>/<rest of path>`` entry
    per line, where the partition id is 1 (train), 2 (val) or 3 (test).

    Args:
        partition: One of 'train', 'val' or 'test'.
        chunk_size: Unused; kept so existing callers that pass it keep working.
        vcb_root: Directory that the relative paths in the split file are
            joined onto.
        split_txt: Path to the split file.
        prefix: If not None, only the first ``prefix`` non-blank lines of the
            split file are considered (before filtering by partition).
        crop_length: Crop duration in seconds, forwarded to
            get_random_cropped_wavfile.
        sample_rate: Sample rate used to size the output rows; each row has
            ``int(crop_length * sample_rate)`` columns.

    Returns:
        (wavs, classes): ``wavs`` is a 2-D array with one random crop per
        selected file; ``classes`` is the matching one-hot label matrix with
        ``len(ENCODER.classes_)`` columns.

    Raises:
        KeyError: if ``partition`` is not a known partition name.
        ValueError: if a partition id in the split file is not an integer.
    """
    labels = {
        'train': 1,
        'val': 2,
        'test': 3
    }
    label = labels[partition]

    with open(split_txt) as f:
        # Blank lines carry no entry; skip them rather than fail on int("").
        rows = [line.rstrip().split(" ") for line in f if line.strip()]
    if prefix is not None:
        rows = rows[:prefix]

    paths = [row[1] for row in rows if int(row[0]) == label]
    names = [p.split("/")[0] for p in paths]
    # Fix the one-hot width to the full speaker set. Without num_classes the
    # width is inferred from the largest id present, so a partition (or a
    # prefix) that lacks the last speakers would yield a narrower matrix.
    classes = np_utils.to_categorical(ENCODER.transform(names),
                                      num_classes=len(ENCODER.classes_))

    wavs = np.zeros((len(paths), int(crop_length * sample_rate)))
    for i, p in enumerate(paths):
        wavs[i] = get_random_cropped_wavfile(vcb_root + "/" + p, crop_length)
        if i % 5000 == 4999:
            print(partition, i + 1)

    return wavs, classes

In [4]:
# Load each partition fully into memory as (waveform matrix, one-hot labels).
# get_data() prints a progress line every 5000 files.
x_train, y_train = get_data('train')
x_val, y_val = get_data('val')
x_test, y_test = get_data('test')

train 5000
train 10000
train 15000
train 20000
train 25000
train 30000
train 35000
train 40000
train 45000
train 50000
train 55000
train 60000
train 65000
train 70000
train 75000
train 80000
train 85000
train 90000
train 95000
train 100000
train 105000
train 110000
train 115000
train 120000
train 125000
train 130000
train 135000
val 5000
test 5000


In [5]:
# Sanity check: print (waveforms, labels) shapes for train, val and test in turn.
for features, targets in ((x_train, y_train), (x_val, y_val), (x_test, y_test)):
    print(features.shape, targets.shape)

(138327, 48000) (138327, 1251)
(6904, 48000) (6904, 1251)
(8251, 48000) (8251, 1251)


In [28]:
import keras
from keras.models import Sequential
from keras.layers import LSTM, Dense
from keras.callbacks import ModelCheckpoint

import kapre
from kapre.time_frequency import Melspectrogram


HIDDEN_DIM = 256
LEARNING_RATE = 0.002
CLIP_SECONDS = 3  # duration of each input crop, in seconds

model = Sequential()

sr = 16000 # sampling rate
n_mels = 128
n_fft = 512
n_hop = 128
clip_samples = CLIP_SECONDS * sr  # raw samples per input row

# Raw waveform -> (channel, samples), the layout fed to the Melspectrogram layer.
model.add(keras.layers.Reshape((1, clip_samples), input_shape=(clip_samples,)))
# Use the `sr` variable rather than a repeated literal so the mel filterbank
# always matches the sampling rate used to size the input.
model.add(Melspectrogram(sr=sr, n_mels=n_mels,
      n_dft=n_fft, n_hop=n_hop,
      return_decibel_melgram=True, power_melgram=2.0,
      trainable_kernel=False, name='melgram'))


NUM_PEOPLE = len(ENCODER.classes_)
model.add(kapre.utils.Normalization2D(int_axis=0))
# Drop the trailing channel axis; -1 lets Keras infer the frame count
# (375 for the settings above) instead of hard-coding it.
model.add(keras.layers.Reshape((n_mels, -1)))
# (mel, time) -> (time, mel) so the LSTM iterates over time frames.
model.add(keras.layers.Permute((2, 1)))
model.add(LSTM(HIDDEN_DIM, return_sequences=True))
# Average the per-frame LSTM outputs into one vector per clip.
model.add(keras.layers.GlobalAveragePooling1D())
model.add(Dense(NUM_PEOPLE, activation='softmax'))

rmsprop = keras.optimizers.RMSprop(lr=LEARNING_RATE, rho=0.9, epsilon=1e-08, decay=0.0)

model.compile(optimizer=rmsprop,
              loss='categorical_crossentropy', # loss function to be optimized
              metrics=['accuracy']) # doesn't affect optimization

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
reshape_17 (Reshape)         (None, 1, 48000)          0         
_________________________________________________________________
melgram (Melspectrogram)     (None, 128, 375, 1)       296064    
_________________________________________________________________
normalization2d_12 (Normaliz (None, 128, 375, 1)       0         
_________________________________________________________________
reshape_18 (Reshape)         (None, 128, 375)          0         
_________________________________________________________________
permute_9 (Permute)          (None, 375, 128)          0         
_________________________________________________________________
lstm_9 (LSTM)                (None, 375, 256)          394240    
_________________________________________________________________
global_average_pooling1d_9 ( (None, 256)               0         
__________

In [29]:
# Checkpoint file names embed the epoch number and the validation accuracy.
filepath = "/home/ubuntu/models/weights-improvement-{epoch:02d}-{val_acc:.2f}.hdf5"

# Write a checkpoint only when validation accuracy improves on the best so far.
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_acc',
    mode='max',
    save_best_only=True,
    verbose=1,
)

model.fit(
    x_train,
    y_train,
    batch_size=512,
    epochs=5,
    shuffle=True,
    validation_data=(x_val, y_val),
    callbacks=[checkpoint],
)

Train on 138327 samples, validate on 6904 samples
Epoch 1/5


KeyboardInterrupt: 

In [None]:
def evaluate(audio, speaker):
    """Return the fraction of rows whose predicted class matches the label.

    `audio` is passed to the global `model` for prediction (batch size 256);
    `speaker` holds the per-row label scores whose argmax is the true class.
    """
    probabilities = model.predict(audio, batch_size=256)
    predicted = np.argmax(probabilities, axis=1)
    expected = np.argmax(speaker, axis=1)
    n_correct = np.sum(predicted == expected)
    return 1.0 * n_correct / probabilities.shape[0]

In [None]:
# Accuracy on the training partition.
print("Train: {}".format(evaluate(x_train, y_train)))

In [None]:
# Accuracy on the validation partition.
print("Val: {}".format(evaluate(x_val, y_val)))

In [None]:
# Accuracy on the held-out test partition.
print("Test: {}".format(evaluate(x_test, y_test)))