Mount google drive and copy and unzip the meld dataset.

Also install the python_speech_features library used for extracting mfcc features

In [0]:
from google.colab import drive
drive.mount('/content/drive')

In [0]:
!cp /content/drive/My\ Drive/meld/emotion.zip .
!unzip emotion.zip
!pip install python_speech_features

Imports

In [0]:
from python_speech_features import mfcc
import os
from scipy.io import wavfile
import pickle
import torch.optim as optim
import torch.nn as nn
import torch
import torch.nn.functional as F
import pandas as pd
from sklearn.utils import shuffle
from tqdm import tqdm, tqdm_notebook, tnrange
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import math
import numpy as np

These 2 large files are removed and not used.

In [0]:
!rm meld/train/disgust/MEL_dia220_utt0_negative_DIS.wav
!rm meld/val/happy/MEL_dia38_utt4_negative_HAP.wav

set pytorch device to gpu if available otherwise to cpu


In [0]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
torch.cuda.set_device(device)

get class(emotions) labels

In [0]:
train_folder = './meld/train/'
valid_folder = './meld/val/'
classes = [x[0].split('/')[-1] for x in os.walk(train_folder) if x[0].split('/')[-1] != '']
print(classes)

Used the default values of the mfcc function to get numcep(13) no of features from a 25ms(winlen) long sound windows sampled at a step of 10ms(winstep). 

The number of mfcc features used are 26 and they are reduced to 13 features by discrete cosine reduction to remove corelation.

The sample rate is reduced to 16000 (this will capture sounds upto 8000 Hz).

```
def mfcc(signal, samplerate=16000, winlen=0.025, winstep=0.01,numcep=13, nfilt=26, nfft=512)
```



In [0]:
def get_mfcc_features(file):
    rate, signal = wavfile.read(file)
    return mfcc(signal)

mfcc features corresponding to each audio file in train and valid set are stored train_feature_dict and valid_feature_dict respectively

In [0]:
train_feature_dict = {}

for c in classes:
    files = [x for x in os.listdir(train_folder + c)]
  for file in files:
    train_feature_dict[file] = get_mfcc_features(train_folder + c + "/" + file)

valid_feature_dict = {}
for c in classes:
  files = [x for x in os.listdir(valid_folder + c)]
  for file in files:
    valid_feature_dict[file] = get_mfcc_features(valid_folder + c + "/" + file)

In [0]:
print(len(train_feature_dict))
print(len(valid_feature_dict))

7353
829


# Normalise

Minimum and Maximum of the 13 features are calculated for the entire dataset. Mean and Standard Deviation is also calculated

In [0]:
feature_sum = [0.0 for _ in range(13)]
total = 0.0

feature_max_value = [-100000.0 for _ in range(13)]
feature_min_value = [100000.0 for _ in range(13)]

for key, val in train_feature_dict.items():
  total += val.shape[0]
  feature_sum += val.sum(axis = 0)
  feature_max_value = np.maximum(val.max(axis=0), feature_max_value)
  feature_min_value = np.minimum(val.min(axis=0), feature_min_value)

mean = feature_sum / total
print(total)
print(mean)
print(feature_max_value)
print(feature_min_value)
variance = [0.0 for _ in range(13)]
square_diff_sum = 0.0
for key, val in train_feature_dict.items():
  square_diff_sum += ((val - mean)**2).sum(axis=0)

standard_deviation = (square_diff_sum / total)**0.5
print(standard_deviation)

4662482.0
[ 15.82453726   1.36319049  -8.39165081  -9.98267385   5.67570697
  -9.23731186  -6.76188475  -0.0252859  -12.45081199  -0.9668685
 -11.45884017  -1.75013094  -4.48973996]
[23.94766958 35.47760648 32.407666   82.12595251 85.63384494 62.8985791
 61.30809941 79.46652656 59.85923447 70.58265384 43.68436301 58.37647754
 54.0187414 ]
[-36.04365339 -58.63459727 -57.64907143 -80.63907682 -81.08503093
 -85.38878632 -75.72724483 -76.89883101 -92.74224276 -90.46006047
 -75.38062188 -70.24483321 -79.46963393]
[ 2.71048281 10.95529956  9.06456857 17.6678814  14.01453595 14.85012859
 11.58754407 13.94876534 13.54238226 13.30045671 10.39980429 10.72943595
 10.62327105]


Normalising the mfcc features and converting them to numbers between 0 to 1

train_feature_dict_normalised stores the training features
valid_feature_dict_normalised stores the valid features

In [0]:
train_feature_dict_normalised = {}
for key, val in train_feature_dict.items():
  train_feature_dict_normalised[key] = (val - feature_min_value)/(feature_max_value-feature_min_value)

valid_feature_dict_normalised = {}
for key, val in valid_feature_dict.items():
  valid_feature_dict_normalised[key] = (val - feature_min_value)/(feature_max_value-feature_min_value)

# Saving and loading the mfcc features

In [0]:
with open('train_mfcc_features.pickle', 'wb') as handle:
    pickle.dump(train_feature_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('valid_mfcc_features.pickle', 'wb') as handle:
    pickle.dump(valid_feature_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [0]:
with open('train_mfcc_features_normalised.pickle', 'wb') as handle:
    pickle.dump(train_feature_dict_normalised, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('valid_mfcc_features_normalised.pickle', 'wb') as handle:
    pickle.dump(valid_feature_dict_normalised, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [0]:
import pickle
train_feature_dict = {}
with open('train_mfcc_features.pickle', 'rb') as handle:
    train_feature_dict = pickle.load(handle)

valid_feature_dict = {}
with open('valid_mfcc_features_normalised.pickle', 'rb') as handle:
    valid_feature_dict = pickle.load(handle)

# One hot encodding the labels

In [0]:
train_file_dict = {}
for c in classes:
  train_file_dict.update({x : classes.index(c) for x in os.listdir(train_folder + c)})
train_file_df = pd.DataFrame(list(train_file_dict.items()), columns=['FName', 'Label'])
train_file_df = shuffle(train_file_df)
train_file_df.head()

Unnamed: 0,FName,Label
556,MEL_dia548_utt0_positive_HAP.wav,0
574,MEL_dia499_utt0_positive_HAP.wav,0
6618,MEL_dia796_utt5_negative_FEA.wav,3
5300,MEL_dia575_utt1_neutral_NEU.wav,1
5163,MEL_dia1035_utt4_neutral_NEU.wav,1


In [0]:
valid_file_dict = {}
for c in classes:
  valid_file_dict.update({x : classes.index(c) for x in os.listdir(valid_folder + c)})
valid_file_df = pd.DataFrame(list(valid_file_dict.items()), columns=['FName', 'Label'])
valid_file_df = shuffle(valid_file_df)
valid_file_df.head()

Unnamed: 0,FName,Label
494,MEL_dia13_utt1_negative_NEU.wav,1
725,MEL_dia37_utt3_negative_FEA.wav,3
751,MEL_dia84_utt5_negative_SAD.wav,4
160,MEL_dia55_utt13_positive_HAP.wav,0
486,MEL_dia62_utt11_neutral_NEU.wav,1


# Creating PyTorch Dataset and DataLoader

As the audio files are of different sizes the number of windows sampled for mfcc features are different. To make them of same size for making bacthes, the train and valid features are padded with zeros at the end.

The maximum sequence length was little less than 10000.


```
torch.cat([torch.tensor(self.feature_dict[file], dtype=torch.float64), torch.zeros((max-self.feature_dict[file].shape[0], 13), dtype=torch.float64)]
```
This line append zeros to the end of the features


In [0]:
max = 10000
class SpeechDataset(Dataset):
    def __init__(self, df, feature_dict):
        self.file_df = df
        self.feature_dict = feature_dict

    def __len__(self):
        return len(self.file_df)

    def __getitem__(self, index):
        file, label = self.file_df.iloc[index,:]
        features = torch.cat([torch.tensor(self.feature_dict[file], dtype=torch.float64), torch.zeros((max-self.feature_dict[file].shape[0], 13), dtype=torch.float64)], dim=0).float()
        return features, torch.tensor(label)


Batch size is taken as 32

The train and valid dataloader are created

In [0]:
batch_size = 32
train_ds = SpeechDataset(train_file_df, train_feature_dict_normalised)
valid_ds = SpeechDataset(valid_file_df, valid_feature_dict_normalised)
train_loader = DataLoader(train_ds, batch_size=batch_size, drop_last=True, shuffle=True)
valid_loader = DataLoader(valid_ds, batch_size=batch_size, drop_last=True, shuffle=True)

# Model

The input features are passed through stacked LSTM of 8 layers.
The first LSTM layer is 13x64, the rest 7 are 64x64.

Then at the end of the sequence all the hidden layers of LSTM (8 * 64) are fed into the fully connected Linear layers.

In [0]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.lstm = nn.LSTM(13, 64, 8, batch_first=True)
        self.relu = nn.ReLU()
        self.linear_layers = nn.Sequential(
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Linear(16, 5),
        )
        self.hidden = (torch.zeros(8, batch_size, 64).float().cuda(), torch.zeros(8, batch_size, 64).float().cuda())

    def forward(self, inputs):
        _, hidden = self.lstm(inputs, self.hidden)
        hid = (hidden[0].clone().detach().permute(1, 0, 2)).flatten(start_dim=1)
        output = self.linear_layers(hid)
        return output

model = Net()
model.cuda()
# model

Net(
  (lstm): LSTM(13, 64, num_layers=8, batch_first=True)
  (relu): ReLU()
  (linear_layers): Sequential(
    (0): Linear(in_features=512, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=128, bias=True)
    (3): ReLU()
    (4): Linear(in_features=128, out_features=64, bias=True)
    (5): ReLU()
    (6): Linear(in_features=64, out_features=32, bias=True)
    (7): ReLU()
    (8): Linear(in_features=32, out_features=16, bias=True)
    (9): ReLU()
    (10): Linear(in_features=16, out_features=5, bias=True)
  )
)

# Training

The cross entroy loss is chosen as the loss function

Stochastic Gradient Descent optimiser with learning rate 0.001 and momentum 0.8 is chosen

In [0]:
num_epochs = 5
loss_fn = nn.CrossEntropyLoss()
optimiser = optim.SGD(model.parameters(), lr = 0.001, momentum=0.8)

In [0]:
for epoch in range(num_epochs):
    print("*" * 100)
    print("Epoch ", epoch+1)

    train_loss = 0.0
    train_total = 0
    train_correct = 0
    model.train()
    with tqdm_notebook(total=len(train_loader)) as progress_bar:
      for features, label in iter(train_loader):
          features, label = features.cuda(), label.cuda()
          optimiser.zero_grad()
          output = model(features)
          train_correct += (output.argmax(1) == label).float().sum().item()
          train_total += features.shape[0]
          loss = loss_fn(output, label)
          loss.backward()
          train_loss += loss.item()*batch_size
          optimiser.step()
          progress_bar.update(1)
          print('\rloss : ', train_loss/train_total, " accuracy : ", train_correct/train_total*100, end = "")

    print("\rtraining accuracy : ", train_correct/train_total*100, "%")
    print("training loss : ", train_loss/train_total)
    print('\n\n')

    valid_loss = 0.0
    valid_total = 0
    valid_correct = 0
    model.eval()
    with tqdm_notebook(total=len(valid_loader)) as progress_bar:
      for features, label in iter(valid_loader):
          features, label = features.cuda(), label.cuda()
          output = model(features)
          valid_correct += (output.argmax(1) == label).float().sum().item()
          valid_total += features.shape[0]
          loss = loss_fn(output, label)
          valid_loss += loss.item()*batch_size
          progress_bar.update(1)

    print("\rvalidation accuracy : ", valid_correct/valid_total*100, "%")
    print("validation loss : ", valid_loss/valid_total)
    print('\n\n')

****************************************************************************************************
Epoch  1


HBox(children=(IntProgress(value=0, max=229), HTML(value='')))

loss :  1.3670663130855978  accuracy :  56.18176855895196
training accuracy :  56.18176855895196 %
training loss :  1.3670663130855978





HBox(children=(IntProgress(value=0, max=25), HTML(value='')))


validation accuracy :  62.125 %
validation loss :  1.2597495317459106



****************************************************************************************************
Epoch  2


HBox(children=(IntProgress(value=0, max=229), HTML(value='')))

loss :  1.188852322674214  accuracy :  62.45906113537117
training accuracy :  62.45906113537117 %
training loss :  1.188852322674214





HBox(children=(IntProgress(value=0, max=25), HTML(value='')))


validation accuracy :  62.25000000000001 %
validation loss :  1.1427534413337708



****************************************************************************************************
Epoch  3


HBox(children=(IntProgress(value=0, max=229), HTML(value='')))

loss :  1.112812109909724  accuracy :  62.472707423580786
training accuracy :  62.472707423580786 %
training loss :  1.112812109909724





HBox(children=(IntProgress(value=0, max=25), HTML(value='')))


validation accuracy :  62.625 %
validation loss :  1.0981243395805358



****************************************************************************************************
Epoch  4


HBox(children=(IntProgress(value=0, max=229), HTML(value='')))

loss :  1.0870932533230844  accuracy :  62.41812227074236
training accuracy :  62.41812227074236 %
training loss :  1.0870932533230844





HBox(children=(IntProgress(value=0, max=25), HTML(value='')))


validation accuracy :  62.5 %
validation loss :  1.0825830602645874



****************************************************************************************************
Epoch  5


HBox(children=(IntProgress(value=0, max=229), HTML(value='')))

loss :  1.076166417661192  accuracy :  62.472707423580786
training accuracy :  62.472707423580786 %
training loss :  1.076166417661192





HBox(children=(IntProgress(value=0, max=25), HTML(value='')))


validation accuracy :  63.0 %
validation loss :  1.0668337988853454





Saving the model

In [0]:
torch.save(model.state_dict(), 'mffcc-features-normalised-momentum.pth')