In [1]:
import numpy as np
from sklearn.preprocessing import LabelEncoder
import scipy.io.wavfile

import keras
from keras.utils import np_utils

Using TensorFlow backend.


In [2]:
class VCBSequence(keras.utils.Sequence):
    """Keras ``Sequence`` that serves batches of fixed-length waveform crops.

    The split file is read as one ``"<partition-id> <relative/path.wav>"`` entry
    per line; the first path component is used as the speaker name.  Labels are
    one-hot vectors over *all* speakers in the split file (not only the ones in
    the selected partition), so every partition shares the same class indices.
    """

    def __init__(self, partition, crop_length=2*16000, batch_size=512,
                 shuffle=True, random_seed=42,
                 sr=16000,
                 vcb_root="/home/ubuntu/data/voxceleb1_wav",
                 split_txt="/home/ubuntu/data/id-split.txt"):
        """
        partition: one of "train", "val", "test"
        crop_length: length in samples, -1 for no cropping
        batch_size: number of clips per batch
        shuffle: shuffle the split-file lines once, at construction time
        random_seed: seed passed to ``np.random.seed`` before shuffling
        sr: sampling rate every wav file is required to have
        vcb_root: directory the relative paths in the split file are joined to
        split_txt: path of the split file
        """
        partition_map = {
            'train': 1,
            'val': 2,
            'test': 3
        }
        self.partition = partition_map[partition]

        self.sr = sr
        self.crop_length = crop_length
        self.batch_size = batch_size
        self.vcb_root = vcb_root
        self.split_txt = split_txt

        with open(self.split_txt) as f:
            # Blank lines (e.g. a trailing newline) would otherwise crash int('').
            lines = [line for line in (raw.rstrip() for raw in f) if line]
        if shuffle:
            # Seeds the *global* numpy RNG, exactly like the original code did.
            np.random.seed(random_seed)
            np.random.shuffle(lines)

        entries = [line.split(" ") for line in lines]
        partition_ids = [int(entry[0]) for entry in entries]
        rel_paths = [entry[1] for entry in entries]

        # Fit the encoder on every speaker so class indices agree across partitions.
        self.all_names = [p.split("/")[0] for p in rel_paths]
        self.encoder = LabelEncoder()
        self.encoder.fit(self.all_names)

        keep = [i for i, pid in enumerate(partition_ids) if pid == self.partition]
        # One-hot encode only the selected rows; building the matrix for every
        # partition first and slicing afterwards wastes memory for no benefit.
        labels = self.encoder.transform(self.all_names)[keep]
        self.classes = np_utils.to_categorical(
            labels, num_classes=len(self.encoder.classes_))
        self.paths = [rel_paths[i] for i in keep]

    def __len__(self):
        """Number of batches, counting a trailing partial batch."""
        # Ceiling division: the original `len % len` test was always 0 and
        # silently dropped the last partial batch.
        return (len(self.paths) + self.batch_size - 1) // self.batch_size

    def get_random_cropped_wavfile(self, path):
        """Read ``path`` (relative to ``vcb_root``) and return a random crop.

        Returns the whole signal when ``crop_length`` is -1.  Otherwise returns
        exactly ``crop_length`` samples: a uniformly chosen window, or the
        zero-padded signal when the file is shorter than ``crop_length``.
        Raises AssertionError if the file's sampling rate differs from ``sr``.
        """
        rate, data = scipy.io.wavfile.read(self.vcb_root + "/" + path)
        assert rate == self.sr, \
            "%s has sampling rate %d, expected %d" % (path, rate, self.sr)
        if self.crop_length == -1:
            return data
        slack = len(data) - self.crop_length
        if slack < 0:
            # Too short to crop: pad the time axis with trailing zeros.
            pad = [(0, -slack)] + [(0, 0)] * (data.ndim - 1)
            return np.pad(data, pad, mode='constant')
        # +1 so that a clip of exactly crop_length samples works (randint(0)
        # raises) and the last valid start offset can be drawn.
        crop_start = np.random.randint(slack + 1)
        return data[crop_start:crop_start + self.crop_length]

    def __getitem__(self, idx):
        """Return ``(x, y)`` for batch ``idx``.

        x: float array of shape (batch, samples); y: matching one-hot labels.
        Raises ValueError when ``crop_length`` is -1 and the clips in the batch
        do not all have the same length (they cannot be stacked).
        """
        start = idx * self.batch_size
        stop = start + self.batch_size
        waves = [self.get_random_cropped_wavfile(p) for p in self.paths[start:stop]]

        if self.crop_length == -1:
            lengths = set(len(w) for w in waves)
            if len(lengths) > 1:
                raise ValueError(
                    "crop_length=-1 yields clips of different lengths; "
                    "they cannot be stacked into one batch")
            width = lengths.pop() if lengths else 0
        else:
            width = self.crop_length

        x = np.zeros((len(waves), width))
        for row, wave in enumerate(waves):
            x[row] = wave

        # Slicing already clips at the end of the array.
        y = self.classes[start:stop]

        return x, y

In [3]:
import keras
from keras.models import Model
from keras.layers import Dense, Input
from utils.recurrent import GRU

import kapre
from kapre.time_frequency import Melspectrogram
from utils.normalization2d import Normalization2D

# --- Data -------------------------------------------------------------------
SR = 16000 # sampling rate
# 128 * 256 = 32768 samples per crop, i.e. 2.048 s at 16 kHz and an exact
# multiple of N_HOP below.
CROP_LENGTH = 128 * 256
BATCH_SIZE = 512
train_gen = VCBSequence('train', sr=SR, crop_length=CROP_LENGTH, batch_size=BATCH_SIZE)
# Materialise validation data once as a fixed (x, y) tuple: batch 0 of a sequence
# whose batch size (10000) is meant to exceed the validation set size, so the
# random crops are drawn a single time and reused every epoch.
val_data = VCBSequence('val', sr=SR, crop_length=CROP_LENGTH, batch_size=10000)[0]
#test_gen = VCBSequence('test', sr=SR, crop_length=CROP_LENGTH, batch_size=BATCH_SIZE)

# --- Spectrogram front-end ----------------------------------------------------
N_MELS = 512
N_FFT = 1024
N_HOP = 128
# Variable-length raw waveform input.
INP = Input(shape=(None,))
# Insert a channel axis in front of the samples: (batch, 1, samples).
CHAN_INP = keras.layers.Reshape((1, -1))(INP)
# Fixed (non-trainable) mel-spectrogram layer from kapre, output in decibels.
SPECT = Melspectrogram(sr=SR, n_mels=N_MELS, 
                          n_dft=N_FFT, n_hop=N_HOP, 
                          return_decibel_melgram=True, power_melgram=1.0,
                          trainable_kernel=False, name='melgram')(CHAN_INP)
# Normalization2D is project-local (utils.normalization2d).
# NOTE(review): presumably normalises along axis 2 — confirm against that module.
SPECT = Normalization2D(int_axis=2)(SPECT)
# Collapse the trailing channel axis, then swap to (batch, time, mel) so the
# recurrent layers iterate over time frames.
SPECT = keras.layers.Reshape((N_MELS, -1))(SPECT)
SPECT = keras.layers.Permute((2, 1))(SPECT)

# Residual Stacked GRU
# GRU here is the project-local utils.recurrent.GRU, not keras.layers.GRU.
# NOTE(review): "use_ln" presumably enables layer normalisation — confirm.
RSGRU_CONFIG = {
    "return_sequences": True,
    "use_bias": False,
    "use_ln": True,
    "implementation": 2,
    #"dropout": 0.5,
    #"recurrent_dropout": 0.5
}
RSGRU_1 = GRU(256, **RSGRU_CONFIG)(SPECT)
RSGRU_2 = GRU(256, **RSGRU_CONFIG)(RSGRU_1)
# Skip connection around the second GRU.
RSGRU_2p = keras.layers.add([RSGRU_2, RSGRU_1])
RSGRU_3 = GRU(256, **RSGRU_CONFIG)(RSGRU_2p)
# Skip connection around the third GRU.
RSGRU = keras.layers.add([RSGRU_3, RSGRU_2p])
# Average over time to obtain one fixed-size vector per clip.
POOLED = keras.layers.GlobalAveragePooling1D()(RSGRU)

# Number of output classes = number of distinct speakers in the split file.
# NOTE(review): this builds a whole extra VCBSequence (re-reading the split file
# and re-seeding numpy) only to read its encoder; the encoder is fitted on all
# speakers regardless of partition, so train_gen.encoder holds the same classes.
NUM_PEOPLE = len(VCBSequence('test').encoder.classes_)
OUT = Dense(NUM_PEOPLE, activation='softmax')(POOLED)

model = Model(INP, OUT)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
reshape_1 (Reshape)             (None, 1, None)      0           input_1[0][0]                    
__________________________________________________________________________________________________
melgram (Melspectrogram)        (None, 512, None, 1) 1313280     reshape_1[0][0]                  
__________________________________________________________________________________________________
normalization2d_1 (Normalizatio (None, 512, None, 1) 0           melgram[0][0]                    
__________________________________________________________________________________________________
reshape_2 

In [4]:
# Sanity check: val_data is the (x, y) tuple built above, so this prints the
# waveform-batch shape followed by the one-hot label shape.
print(val_data[0].shape, val_data[1].shape)

(6904, 32768) (6904, 1251)


In [5]:
# Disabled alternative: restore a complete saved model instead of only weights.
# NOTE(review): the snippet maps "Normalization2D" to kapre.utils.Normalization2D,
# while the model in this notebook is built with utils.normalization2d.Normalization2D
# — confirm which one the checkpoint needs before re-enabling.
# model = keras.models.load_model("/home/ubuntu/models/rsgruln-02-0.12.hdf5",
#                                 custom_objects={
#                                     "Melspectrogram": Melspectrogram,
#                                     "Normalization2D": kapre.utils.Normalization2D
#                                 })
# Load saved weights into the architecture defined above (layer structure must match).
model.load_weights("/home/ubuntu/models/save/rsgruln-96-0.45.hdf5")

In [None]:
# Initial learning rate handed to the optimizer.
LEARNING_RATE = 1e-3
# NOTE(review): despite its name, this variable holds an Adam optimizer, not RMSprop.
rmsprop = keras.optimizers.Adam(lr=LEARNING_RATE)

def top_k(y_true, y_pred, k=5):
    """Top-k categorical accuracy, as computed by Keras.

    y_true: one-hot ground-truth labels.
    y_pred: predicted class scores.
    k: how many of the highest-scoring classes count as a hit.  Defaults to 5
       (the Keras default) so the function can be passed straight to
       ``model.compile(metrics=[...])``, which calls metrics with two arguments.

    Returns whatever ``keras.metrics.top_k_categorical_accuracy`` returns.
    """
    # The original passed the misspelt name `y_tru`, raising NameError on every call.
    return keras.metrics.top_k_categorical_accuracy(y_true, y_pred, k=k)

# Compile for single-label multi-class classification.  Only plain accuracy is
# tracked; the top_k helper defined in this cell is not registered as a metric.
model.compile(optimizer=rmsprop,
              loss='categorical_crossentropy', # loss function to be optimized
              metrics=['accuracy']) # doesn't affect optimization
model.summary()

In [None]:
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from utils.custom_callbacks import TensorBoard

# Checkpoint file name pattern; Keras fills in the epoch number and validation accuracy.
checkpoint_filepath="/home/ubuntu/models/rsgruln-{epoch:02d}-{val_acc:.2f}.hdf5"
# Checked every epoch; a file is written only when val_acc beats the best value so far.
checkpoint = ModelCheckpoint(checkpoint_filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max',
                             period=1)

# TensorBoard here is the project-local utils.custom_callbacks.TensorBoard.
# NOTE(review): write_batch_performance presumably logs metrics per batch — confirm.
tensorboard = TensorBoard(log_dir="/home/ubuntu/tflogs/cur",
                          write_batch_performance=True,
                          write_graph=False,
                          batch_size=BATCH_SIZE)

# Multiply the learning rate by 0.2 after 5 epochs without val_loss improvement,
# never going below 1e-5.  Note this monitors val_loss while the checkpoint
# above monitors val_acc.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2,
                              patience=5, min_lr=1e-5)

In [None]:
# Train from the batch generator for up to 500 epochs, validating on the fixed
# val_data tuple after each epoch.  workers=5 lets Keras prepare batches with
# five workers in parallel.
model.fit_generator(train_gen, validation_data=val_data, epochs=500,
                    callbacks=[checkpoint, tensorboard, reduce_lr],
                    workers=5)

In [6]:
# Default dataset root used by get_wavfile when no explicit root is given.
vcb_root="/home/ubuntu/data/voxceleb1_wav"
def get_wavfile(path, root=None, sr=16000):
    """Read a wav file and return its samples.

    path: file path relative to the dataset root.
    root: directory to resolve ``path`` against; defaults to the module-level
          ``vcb_root``.
    sr: sampling rate the file is required to have (default 16000).

    Returns the sample array produced by ``scipy.io.wavfile.read``.
    Raises AssertionError when the file's sampling rate differs from ``sr``.
    """
    base = vcb_root if root is None else root
    rate, data = scipy.io.wavfile.read(base + "/" + path)
    assert rate == sr, "%s has sampling rate %d, expected %d" % (path, rate, sr)
    return data

class VCBSequenceTest(VCBSequence):
    """Evaluation sequence that batches whole clips by duration.

    Every clip of the partition is loaded into memory, sorted by length and
    truncated to a whole number of seconds.  All clips sharing the same
    truncated duration form one batch, so batch ``idx`` is a dense
    ``(n_clips, seconds * sr)`` array without padding.  ``batch_size`` and
    ``crop_length`` are stored but do not influence batching here.

    Note: clips shorter than one second are truncated to zero samples.
    """

    def __init__(self, partition, crop_length=2*16000, batch_size=512,
                 shuffle=True, random_seed=42,
                 sr=16000,
                 vcb_root="/home/ubuntu/data/voxceleb1_wav",
                 split_txt="/home/ubuntu/data/id-split.txt"):
        """
        partition: one of "train", "val", "test"
        crop_length: length in samples, -1 for no cropping
        """
        # The parent constructor does all split-file parsing and label encoding;
        # the original duplicated that code line for line.
        super(VCBSequenceTest, self).__init__(
            partition, crop_length=crop_length, batch_size=batch_size,
            shuffle=shuffle, random_seed=random_seed, sr=sr,
            vcb_root=vcb_root, split_txt=split_txt)

        print(self.classes.shape)
        # Dump "<path> <class index>" per clip (order as produced by the parent).
        with open('/home/ubuntu/asdfasdf_'+partition+'.txt', 'w') as f:
            for vec, path in zip(self.classes, self.paths):
                f.write(path + ' ' + str(np.argmax(vec)) + '\n')

        # Use the instance's own root and rate rather than module-level globals.
        loaded = [np.array(self._read_wav(p)) for p in self.paths]

        # Stable sort by clip length; labels and paths are permuted identically.
        order = sorted(range(len(loaded)), key=lambda i: len(loaded[i]))
        self.lens = [len(loaded[i]) for i in order]
        self.classes = self.classes[order]
        self.paths = [self.paths[i] for i in order]

        # Whole seconds contained in each clip (non-decreasing, since sorted).
        seconds = [n // self.sr for n in self.lens]
        self.wavs = [loaded[i][:s * self.sr] for i, s in zip(order, seconds)]
        self.clips = self.wavs

        # batch_sizes[s] = number of clips whose truncated duration is s seconds.
        self.batch_sizes = [0] * ((seconds[-1] if seconds else 0) + 1)
        for s in seconds:
            self.batch_sizes[s] += 1

        # ind holds the cumulative start offsets of the non-empty duration
        # groups; crop holds the sample length of each such group.
        self.ind = [0]
        self.crop = []
        for s, count in enumerate(self.batch_sizes):
            if count > 0:
                self.ind.append(self.ind[-1] + count)
                self.crop.append(s * self.sr)

    def _read_wav(self, path):
        """Read ``path`` relative to ``self.vcb_root``; require rate ``self.sr``."""
        rate, data = scipy.io.wavfile.read(self.vcb_root + "/" + path)
        assert rate == self.sr, \
            "%s has sampling rate %d, expected %d" % (path, rate, self.sr)
        return data

    def __len__(self):
        """Number of distinct truncated durations, i.e. number of batches."""
        return len(self.ind) - 1

    def __getitem__(self, idx):
        """Return ``(x, y)`` for every clip of the ``idx``-th duration group."""
        start, stop = self.ind[idx], self.ind[idx + 1]
        batch_wavs = self.wavs[start:stop]

        x = np.zeros((len(batch_wavs), self.crop[idx]))
        for row, wav in enumerate(batch_wavs):
            x[row] = wav

        # A 2-D array of one-hot rows (the original returned a tuple of rows).
        y = self.classes[start:stop]

        return x, y

In [7]:
# Build the evaluation generators over full-length clips (crop_length=-1 means
# "no cropping").  Construction reads every wav file of the partition, so these
# two lines are slow and memory-hungry.
val_gen = VCBSequenceTest('val', sr=SR, crop_length=-1)
test_gen = VCBSequenceTest('test', sr=SR, crop_length=-1)

(6904, 1251)
(8251, 1251)


In [14]:
def evaluateHelper(scores, speaker):
    """Top-1 accuracy: share of rows whose arg-max score matches the label.

    scores: array of shape (n_samples, n_classes) with per-class scores.
    speaker: one-hot labels with the same layout (any 2-D array-like).
    """
    predicted = np.argmax(scores, axis=1)
    actual = np.argmax(speaker, axis=1)
    hits = np.sum(predicted == actual)
    return 1.0 * hits / scores.shape[0]
def evaluate_top_k_helper(scores, speaker, k=5, output_path=None, paths=None):
    """Top-k accuracy of ``scores`` against one-hot ``speaker`` labels.

    A sample counts as correct when fewer than ``k`` classes score strictly
    higher than its true class.

    scores: (n_samples, n_classes) array-like of class scores.
    speaker: (n_samples, n_classes) array-like of one-hot labels.
    k: number of top-ranked classes that count as a hit.
    output_path: optional file to write a per-sample report to.  The original
        unconditionally truncated a hard-coded file to a header line and never
        wrote any rows; the report is now opt-in and complete.
    paths: optional per-sample identifiers for the report's first column;
        the sample index is used when omitted.

    Returns the accuracy as a float, or 0.0 when there are no samples
    (in which case no report is written either).
    """
    scores = np.asarray(scores)
    speaker = np.asarray(speaker)
    total = scores.shape[0]
    if total == 0:
        return 0.0

    truth = np.argmax(speaker, axis=1)
    true_scores = scores[np.arange(total), truth]
    # Per sample: how many classes beat the true class outright.
    n_better = np.sum(scores > true_scores[:, np.newaxis], axis=1)
    correct = int(np.count_nonzero(n_better < k))

    if output_path is not None:
        predicted = np.argmax(scores, axis=1)
        # Rank of the true class, ties counted against it (1 = best).
        rank = np.sum(scores >= true_scores[:, np.newaxis], axis=1)
        with open(output_path, 'w') as f:
            f.write('path, output class, actual class, rank\n')
            for i in range(total):
                ident = i if paths is None else paths[i]
                f.write('{} {} {} {}\n'.format(ident, predicted[i], truth[i], rank[i]))

    return 1.0 * correct / total
        

In [9]:
# Score every batch of the evaluation generators.
# NOTE(review): the evaluation below pairs these predictions with labels by
# re-iterating the generators in index order, so it relies on predict_generator
# returning results in batch order even with workers=5 — confirm for the
# installed Keras version.
val_pred = model.predict_generator(val_gen, verbose=1, workers=5)
test_pred = model.predict_generator(test_gen, verbose=1, workers=5)



In [10]:
def evaluate_top_k(data_gen, y_pred, k):
    """Print top-1 and then top-k accuracy of ``y_pred`` for ``data_gen``.

    The ground-truth labels are gathered by walking ``data_gen`` batch by batch
    in index order, so ``y_pred`` must be ordered the same way.
    """
    labels = []
    for batch_idx in range(len(data_gen)):
        _, batch_labels = data_gen[batch_idx]
        labels.extend(batch_labels)
    top1 = evaluateHelper(y_pred, labels)
    print(top1)
    topk = evaluate_top_k_helper(y_pred, labels, k=k)
    print(topk)

In [15]:
# Each call prints two numbers: top-1 accuracy, then top-5 accuracy
# (validation set first, test set second).
evaluate_top_k(val_gen, val_pred, k=5)
evaluate_top_k(test_gen, test_pred, k=5)

0.597769409038
0.7708574739281576
0.581020482366
0.7593019027996607
