# Neural Network Models for Speaker Classification

Last Update: 12/01/2019 by Chen Liang

This notebook includes the following deep learning models: 1) a CNN on MFCC features and 2) a CNN on chroma features.

In [68]:
import librosa
import numpy as np
import shutil
import os
from joblib import Parallel, delayed
import multiprocessing 
import matplotlib.pyplot as plt
import scipy
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split

In [42]:
# Recording directories (presumably one per preprocessing stage). Paths are
# relative to the working directory and keep their trailing slash because file
# names are appended to them by plain string concatenation.
raw_dir = "recordings/selected_recordings/"
silence_removed_dir = "recordings/silence_removed/"
unified_dir = "recordings/unified/"

## Preparation: Data Understanding

For this dataset, each speaker is required to read the following paragraph:
>*Please call Stella.  Ask her to bring these things with her from the store:  Six spoons of fresh snow peas, five thick slabs of blue cheese, and maybe a snack for her brother Bob.  We also need a small plastic snake and a big toy frog for the kids.  She can scoop these things into three red bags, and we will go meet her Wednesday at the train station.*

This paragraph contains the following phonemes: **TODO** — the phoneme list is missing here.

## Convolutional Neural Network on MFCC

Key idea: Use filters to extract key features from a series of adjacent MFCC slices

In [233]:
# Quick look at the size of the chroma feature matrix for a single recording.
y, sr = librosa.load(unified_dir + 'spanish6.mp3.wav')
librosa.feature.chroma_stft(sr=sr, y=y).shape

(12, 861)

In [41]:
# NOTE(review): `y_stretched` is not defined in any cell visible in this
# notebook; this cell presumably relied on a variable from a since-removed cell
# and would raise NameError on a fresh kernel — confirm, then restore or delete.
y_stretched[:sr]

array([0.02146304, 0.01559818, 0.0085238 , ..., 0.05236012, 0.05111545,
       0.04814873], dtype=float32)

In [31]:
# Load one recording, report its sample rate and duration, and compute MFCCs.
y, sr = librosa.load(unified_dir + 'spanish12.mp3.wav')
print(sr)
print(librosa.get_duration(y=y, sr=sr))
result1 = librosa.feature.mfcc(
    y=y,
    n_fft=int(sr / 40),         # 1/40 s  = 25 ms analysis window, in samples
    hop_length=int(sr / 100),   # 1/100 s = 10 ms step between frames, in samples
)

22050
24.938231292517006


In [32]:
# Same probe for a second recording, to compare frame counts between files.
y, sr = librosa.load(unified_dir + 'spanish6.mp3.wav')
print(librosa.get_duration(y=y, sr=sr))
result2 = librosa.feature.mfcc(
    y=y,
    n_fft=int(sr / 40),         # 1/40 s  = 25 ms analysis window, in samples
    hop_length=int(sr / 100),   # 1/100 s = 10 ms step between frames, in samples
)

21.687437641723356


In [39]:
# Shape of the MFCC matrix computed for spanish12.
result1.shape

(20, 2055)

In [34]:
# Shape of the MFCC matrix computed for spanish6.
result2.shape

(20, 2174)

In [43]:
# Reload spanish12 and recompute its MFCCs (10 ms hop, 25 ms FFT window).
# The MFCC call used to be written twice in a row; once is enough. The sample
# rate is passed explicitly so the features do not depend on librosa's default.
y, sr = librosa.load(unified_dir + 'spanish12.mp3.wav')
print(sr)
print(librosa.get_duration(y=y, sr=sr))
result1 = librosa.feature.mfcc(y=y, sr=sr, hop_length=int(sr / 100), n_fft=int(sr / 40))

22050
19.989977324263037


### Extract MFCC

In [46]:
# Shape of the most recently computed MFCC matrix for spanish12.
result1.shape

(20, 2004)

In [61]:
# MFCC extraction parameters. The two sizes are expressed in seconds:
# extract_mfcc multiplies each by the sample rate to obtain a sample count.
mfcc_window_size = 0.025
mfcc_stride_size = 0.01
# Number of coefficients requested from librosa (passed as n_mfcc).
mfcc_num_of_features = 14

In [62]:
def extract_mfcc(fname):
    """Load one recording from ``unified_dir`` and return its MFCCs and class label.

    Parameters
    ----------
    fname : str
        File name (not a full path) inside ``unified_dir``.

    Returns
    -------
    list
        ``[mfcc, label]``. ``mfcc`` is the MFCC matrix after
        ``librosa.util.normalize``. ``label`` is 0 for 'mandarin', 1 for
        'english', 2 for 'spanish' (first substring match on ``fname``, tested
        in that order), or -1 when none of them occurs in the name.
    """
    y, sr = librosa.load(unified_dir + fname)
    # NOTE(review): hop_length is derived from mfcc_window_size and n_fft from
    # mfcc_stride_size, which looks swapped (the hop ends up larger than the FFT
    # window, so consecutive frames do not overlap). It is deliberately left
    # unchanged here: the number of frames returned depends on hop_length, and
    # the CNN cells hard-code input_shape=(14, 800, 1). Fix both together.
    result = librosa.feature.mfcc(
        y=y,
        sr=sr,  # pass the real sample rate instead of relying on librosa's default
        n_mfcc=mfcc_num_of_features,
        hop_length=int(mfcc_window_size * sr),
        n_fft=int(mfcc_stride_size * sr),
    )
    result = librosa.util.normalize(result)

    # Derive the label from the file name; -1 marks "no known language found".
    label = -1
    for language, class_id in (('mandarin', 0), ('english', 1), ('spanish', 2)):
        if language in fname:
            label = class_id
            break
    return [result, label]

In [63]:
# Extract features for every file in unified_dir using 12 parallel workers.
# os.listdir returns entries in arbitrary order, so the listing is sorted to
# keep `result` (and everything derived from it) in a stable order across runs.
result = Parallel(n_jobs=12)(
    delayed(extract_mfcc)(fname) for fname in sorted(os.listdir(unified_dir))
)

In [177]:
# Split the [mfcc, label] pairs returned by extract_mfcc into two parallel lists.
# Note that `y` held an audio signal in the exploration cells; from here on it
# is rebound to the list of labels.
X = [pair[0] for pair in result]
y = [pair[1] for pair in result]

### Set up CNN Model

<b>Model 1:</b> Fit directly on the extracted MFCC features. Each input sample has shape (14, 800), i.e. 14 MFCC coefficients × 800 frames.

In [166]:
# Shape of a single CNN input sample: 14 matches mfcc_num_of_features, and the
# trailing 1 is the channel axis added with np.expand_dims before training.
input_shape = (14, 800, 1)
print(input_shape)

(14, 800, 1)


In [178]:
# Hold out 10% of the recordings. A fixed random_state makes the split, and
# therefore the validation numbers reported below, reproducible across kernel
# restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)
# Append a trailing channel axis so every sample matches input_shape for Conv2D.
X_train = np.expand_dims(X_train, axis=3)
X_test = np.expand_dims(X_test, axis=3)

In [179]:
def change_input_format(X):
    """Stack the items of ``X`` along a new leading axis and return one ndarray."""
    return np.stack(X, axis=0)

In [180]:
# Turn the split features and labels into ndarrays before handing them to the model.
X_train, X_test = change_input_format(X_train), change_input_format(X_test)
y_train, y_test = np.array(y_train), np.array(y_test)

In [182]:
# Sanity check: (samples, MFCC coefficients, frames, channels) of the held-out set.
X_test.shape

(60, 14, 800, 1)

In [185]:
# Sanity check: one label per training sample.
y_train.shape

(540,)

In [199]:
def create_model1():
    """Build the (uncompiled) first CNN.

    Two Conv2D(3x3) + MaxPooling2D(2x2) stages with 32 and 64 filters, followed
    by Flatten, a 64-unit ReLU layer and a 3-unit softmax output. The first
    layer takes its input shape from the notebook-level ``input_shape``.
    """
    return models.Sequential([
        layers.Conv2D(32, kernel_size=(3, 3), strides=(1, 1), activation='relu',
                      input_shape=input_shape),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu'),
        layers.MaxPooling2D((2, 2)),
        layers.Flatten(),
        layers.Dense(64, activation='relu'),
        layers.Dense(3, activation='softmax'),
    ])

In [200]:
# Build, compile and train model 1 for 50 epochs. The held-out split is passed
# as validation data, so per-epoch validation metrics come from X_test / y_test.
# Labels are integer class ids, hence the sparse categorical cross-entropy loss.
model1 = create_model1()
model1.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
history = model1.fit(
    X_train,
    y_train,
    epochs=50,
    validation_data=(X_test, y_test),
)

Train on 540 samples, validate on 60 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [225]:
def create_model2():
    """Build the (uncompiled) second CNN.

    Same layer stack as ``create_model1`` except that the first convolution
    uses a 5x5 kernel: Conv2D(32, 5x5) -> MaxPooling2D(2x2) -> Conv2D(64, 3x3)
    -> MaxPooling2D(2x2) -> Flatten -> Dense(64, relu) -> Dense(3, softmax).
    The first layer takes its input shape from the notebook-level ``input_shape``.
    """
    return models.Sequential([
        layers.Conv2D(32, kernel_size=(5, 5), strides=(1, 1), activation='relu',
                      input_shape=input_shape),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu'),
        layers.MaxPooling2D((2, 2)),
        layers.Flatten(),
        layers.Dense(64, activation='relu'),
        layers.Dense(3, activation='softmax'),
    ])

In [229]:
# Show the held-out labels to eyeball the class balance of the validation set.
print(y_test)

[0 1 2 2 0 1 1 1 1 1 1 2 1 1 1 2 1 2 2 1 1 1 2 2 2 1 1 1 2 2 1 2 2 1 1 2 2
 0 2 0 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 0 1 1 2 1]


In [226]:
# Build, compile and train model 2 with the same settings used for model 1.
# `history` is rebound here, replacing whatever an earlier training cell stored.
model2 = create_model2()
model2.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
history = model2.fit(
    X_train,
    y_train,
    epochs=50,
    validation_data=(X_test, y_test),
)

Train on 540 samples, validate on 60 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [227]:
def create_model3():
    """Build the (uncompiled) third CNN.

    A single Conv2D with 32 filters and a (14, 5) kernel, followed by
    MaxPooling2D((1, 2)), Flatten, a 32-unit ReLU layer and a 3-unit softmax
    output. The first layer takes its input shape from the notebook-level
    ``input_shape``.
    """
    return models.Sequential([
        layers.Conv2D(32, kernel_size=(14, 5), strides=(1, 1), activation='relu',
                      input_shape=input_shape),
        layers.MaxPooling2D((1, 2)),
        layers.Flatten(),
        layers.Dense(32, activation='relu'),
        layers.Dense(3, activation='softmax'),
    ])

In [228]:
# Build, compile and train model 3 with the same settings used for the other models.
# `history` is rebound here, replacing whatever an earlier training cell stored.
model3 = create_model3()
model3.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
history = model3.fit(
    X_train,
    y_train,
    epochs=50,
    validation_data=(X_test, y_test),
)

Train on 540 samples, validate on 60 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


## Convolutional Neural Network on Chroma

### Extract Chroma

In [None]:
def get_chroma(fname):
    """Return the shape of the chroma (STFT) feature matrix of one recording.

    ``fname`` is a file name inside ``unified_dir``.

    NOTE(review): only ``.shape`` is returned, not the chroma features
    themselves — confirm whether that is intended for an extraction helper.
    """
    signal, rate = librosa.load(unified_dir + fname)
    chroma = librosa.feature.chroma_stft(y=signal, sr=rate)
    return chroma.shape