In [1]:
from cv2 import transform
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import torch.backends.cudnn as cudnn
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
import os
import copy
from einops import rearrange
import torch.nn.functional as F
from torchinfo import summary
from tqdm.auto import tqdm
from einops import rearrange

## LFLB (N Filter)

Speech emotion recognition using deep 1d & 2d cnn lstm networks. Biomedical Signal Processing and Control 47, 312–323 (2019)

In [2]:
class LFLB(nn.Module):
    """Some Information about LFLB"""
    def __init__(self, in_chan, out_chan, kernel=[2, 2], stride=[1, 1], pad=1):
        super(LFLB, self).__init__()

        self.cnn = nn.Conv2d(in_channels=in_chan, out_channels=out_chan, kernel_size=kernel, stride=stride, padding=pad)
        self.bn = nn.BatchNorm2d(out_chan)
        self.relu = nn.ReLU()
        self.max_pool =  nn.MaxPool2d(kernel_size=kernel, stride=[2, 2])

    def forward(self, x):
        cnn_embd = self.cnn(x)
        cnn_embd = self.bn(cnn_embd)
        cnn_embd = self.relu(cnn_embd)
        output = self.max_pool(cnn_embd)

        return output

## NAC Unit

[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun, Identity Mappings in Deep Residual Networks. arXiv:1603.05027
https://github.com/kuangliu/pytorch-cifar/blob/master/models/preact_resnet.py

In [3]:
class NACUnit(nn.Module):
    '''Pre-activation version of the BasicBlock.'''
    expansion = 1

    def __init__(self, in_planes, planes, kernel, stride=1, pad=0):
        super(NACUnit, self).__init__()
        
        self.bn1 = nn.BatchNorm2d(in_planes)
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=kernel, stride=stride, padding=pad, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=kernel, stride=1, padding=1, bias=False)

    def forward(self, x):
        out = F.relu(self.bn1(x))
        shortcut = self.shortcut(out) if hasattr(self, 'shortcut') else x
        out = self.conv1(out)
        out = self.conv2(F.relu(self.bn2(out)))
        return out

In [4]:
class ResLFLB(nn.Module):
    def __init__(self, in_chan, n_filters, kernel):
        super(ResLFLB, self).__init__()
        
        self.lflb = LFLB(in_chan=in_chan, out_chan=n_filters)
        
        self.nac_unit1 = NACUnit(in_planes=n_filters, planes=n_filters, kernel=1, stride=1, pad=1)
        self.nac_unit2 = NACUnit(in_planes=n_filters, planes=n_filters, kernel=kernel, stride=1)
        self.nac_unit3 = NACUnit(in_planes=n_filters, planes=n_filters, kernel=kernel, stride=1)
        self.nac_unit4 = NACUnit(in_planes=n_filters, planes=n_filters, kernel=kernel, stride=1)
        self.nac_unit5 = NACUnit(in_planes=n_filters, planes=n_filters, kernel=1, stride=1)

    def forward(self, x):
        lflb_embd = self.lflb(x)
        
        nac_embd = self.nac_unit1(lflb_embd)
        nac_embd = self.nac_unit2(nac_embd)
        nac_embd = self.nac_unit3(nac_embd)
        nac_embd = self.nac_unit4(nac_embd)
        nac_embd = self.nac_unit5(nac_embd)
        
        output = lflb_embd + nac_embd

        return output

In [5]:
class Model(nn.Module):
    def __init__(self, num_emotions):
        super(Model, self).__init__()

        self.MFL = LFLB(3, 32)
        
        self.res_lflb1 = ResLFLB(32, 32, [3, 3])
        self.res_lflb2 = ResLFLB(32, 32, [3, 3])
        self.res_lflb3 = ResLFLB(32, 32, [3, 3])
        self.res_lflb4 = ResLFLB(32, 64, [3, 3])
        self.res_lflb5 = ResLFLB(64, 128, [3, 3])
        
        self.relu = nn.ReLU()
        self.max_pool = nn.MaxPool2d(kernel_size=[2, 2], stride=[2, 2])
        self.dropout = nn.Dropout(0.25)
        self.linear = nn.Linear(512, num_emotions)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        MFL_embd = self.MFL(x)
        
        res_lflb_embd = self.res_lflb1(MFL_embd)
        res_lflb_embd = self.res_lflb2(res_lflb_embd)
        res_lflb_embd = self.res_lflb3(res_lflb_embd)
        res_lflb_embd = self.res_lflb4(res_lflb_embd)
        res_lflb_embd = self.res_lflb5(res_lflb_embd)
        
        mlp = self.relu(res_lflb_embd)
        mlp = self.max_pool(mlp)
        mlp = self.dropout(mlp)
        mlp = torch.flatten(mlp, start_dim=1)
        output_logits = self.linear(mlp)
        output_softmax = self.softmax(output_logits)
        
        return output_logits, output_softmax

In [6]:
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


emotions_dict = {
    '0': 'surprised',
    '1': 'neutral',
    '2': 'calm',
    '3': 'happy',
    '4': 'sad',
    '5': 'angry',
    '6': 'fearful',
    '7': 'disgust'
}


sub_name = 'ADCRNN'
data_name = 'RAVDESS'
EARLY_CNT = 500

minibatch = 32
num_epochs = 500

In [7]:
def make_train_step(model, criterion, optimizer):

    # define the training step of the training phase
    def train_step(X, Y):

        # forward pass
        output_logits, output_softmax = model(X)
        predictions = torch.argmax(output_softmax, dim=1)
        accuracy = torch.sum(Y == predictions)/float(len(Y))

        # compute loss on logits because nn.CrossEntropyLoss implements log softmax
        loss = criterion(output_logits, Y)

        # compute gradients for the optimizer to use
        loss.backward()

        # update network parameters based on gradient stored (by calling loss.backward())
        optimizer.step()

        # zero out gradients for next pass
        # pytorch accumulates gradients from backwards passes (convenient for RNNs)
        optimizer.zero_grad()

        return loss.item(), accuracy*100
    return train_step

In [8]:
def make_validate_fnc(model, criterion):
    def validate(X, Y):

        # don't want to update any network parameters on validation passes: don't need gradient
        # wrap in torch.no_grad to save memory and compute in validation phase:
        with torch.no_grad():

            # set model to validation phase i.e. turn off dropout and batchnorm layers
            model.eval()
            
            # get the model's predictions on the validation set
            output_logits, output_softmax = model(X)
            predictions = torch.argmax(output_softmax, dim=1)

            # calculate the mean accuracy over the entire validation set
            accuracy = torch.sum(Y == predictions)/float(len(Y))

            # compute error from logits (nn.crossentropy implements softmax)
            loss = criterion(output_logits, Y)

        return loss.item(), accuracy*100, predictions
    return validate

In [9]:
def make_save_checkpoint():
    def save_checkpoint(optimizer, model, epoch, filename):
        checkpoint_dict = {
            'optimizer': optimizer.state_dict(),
            'model': model.state_dict(),
            'epoch': epoch
        }
        torch.save(checkpoint_dict, filename)
    return save_checkpoint


def load_checkpoint(optimizer, model, filename):
    checkpoint_dict = torch.load(filename)
    epoch = checkpoint_dict['epoch']
    model.load_state_dict(checkpoint_dict['model'])
    if optimizer is not None:
        optimizer.load_state_dict(checkpoint_dict['optimizer'])
    return epoch

In [10]:
def criterion(predictions, targets):
    return nn.CrossEntropyLoss()(input=predictions, target=targets)

In [11]:
from einops import rearrange

def train(model, num_epochs, X_train, Y_train, X_valid, Y_valid, pkl_name, fold, optimizer):
    
    # instantiate lists to hold scalar performance metrics to plot later
    train_losses = []
    valid_losses = []

    # get training set size to calculate # iterations and minibatch indices
    train_size = X_train.shape[0]

    # instantiate the checkpoint save function
    save_checkpoint = make_save_checkpoint()

    # instantiate the training step function
    train_step = make_train_step(model, criterion, optimizer=optimizer)
    # instantiate the validation loop function
    validate = make_validate_fnc(model, criterion)

    cur_valid_loss = 0x3f3f3f
    early_stop_cnt = 0
    
    for epoch in tqdm(range(num_epochs), desc=f'{fold}'):

        if early_stop_cnt >= EARLY_CNT:
            break
        
        # set model to train phase
        model.train()

        # shuffle entire training set in each epoch to randomize minibatch order
        train_indices = np.random.permutation(train_size)

        # shuffle the training set for each epoch:
        X_train = X_train[train_indices, :, :, :]
        Y_train = Y_train[train_indices]

        # instantiate scalar values to keep track of progress after each epoch so we can stop training when appropriate
        epoch_acc = 0
        epoch_loss = 0
        num_iterations = int(train_size / minibatch)

        # create a loop for each minibatch of 32 samples:
        for i in range(num_iterations):

            # we have to track and update minibatch position for the current minibatch
            # if we take a random batch position from a set, we almost certainly will skip some of the data in that set
            # track minibatch position based on iteration number:
            batch_start = i * minibatch
            # ensure we don't go out of the bounds of our training set:
            batch_end = min(batch_start + minibatch, train_size)
            # ensure we don't have an index error
            actual_batch_size = batch_end-batch_start

            # get training minibatch with all channnels and 2D feature dims
            X = X_train[batch_start:batch_end, :, :, :]
            # get training minibatch labels
            Y = Y_train[batch_start:batch_end]
            
            # instantiate training tensors
            X_tensor = torch.tensor(X, device=device).float()
            Y_tensor = torch.tensor(Y, dtype=torch.long, device=device)
            
            # Pass input tensors thru 1 training step (fwd+backwards pass)
            loss, acc = train_step(X_tensor, Y_tensor)

            # aggregate batch accuracy to measure progress of entire epoch
            epoch_acc += acc * actual_batch_size / train_size
            epoch_loss += loss * actual_batch_size / train_size

            # keep track of the iteration to see if the model's too slow
            print('\r'+f'Epoch {epoch}: iteration {i}/{num_iterations}', end='')

        X_valid_tensor = torch.tensor(X_valid, device=device).float()
        Y_valid_tensor = torch.tensor(Y_valid, dtype=torch.long, device=device)
        
        # calculate validation metrics to keep track of progress; don't need predictions now
        valid_loss, valid_acc, _ = validate(X_valid_tensor, Y_valid_tensor)
        
        # accumulate scalar performance metrics at each epoch to track and plot later
        train_losses.append(epoch_loss)
        valid_losses.append(valid_loss)
        
        if cur_valid_loss > valid_loss:
            save_checkpoint(optimizer, model, epoch, pkl_name)
            cur_valid_loss = valid_loss
        else:
            early_stop_cnt += 1

        # keep track of each epoch's progress
        print(f'\nEpoch {epoch} --- loss:{epoch_loss:.3f}, Epoch accuracy:{epoch_acc:.2f}%, Validation loss:{valid_loss:.3f}, Validation accuracy:{valid_acc:.2f}%')

        
    print(f"\n\n[*] done !")
    print(f'[*] Best training loss - {min(train_losses)}')
    print(f'[*] Best validation loss - {min(valid_losses)}')
    
    return train_losses, valid_losses

In [12]:
import csv
import numpy as np
from PIL import Image
from skimage.io import imread
from skimage.transform import resize

def get_fold_dataset(npy_name):
    with open(npy_name, 'rb') as f:
        X_train = np.load(f)
        y_train = np.load(f)
        X_test = np.load(f)
        y_test = np.load(f)
    
    return X_train, y_train, X_test, y_test

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import seaborn as sn
import pandas as pd

train_losses, valid_losses = [], []

print(f'[*] model training - {sub_name}')

fold_path = '5-CV'

for fold in range(5):
    cur_fold_path = os.path.join('5-CV', 'fold'+str(fold)+'.npy')
    X_train, y_train, X_test, y_test = get_fold_dataset(cur_fold_path)
    
    pkl_name = f'./model/{data_name}-{sub_name}-{fold}.pkl'
    model = Model(num_emotions=len(emotions_dict)).to('cuda')
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-6, betas=(0.9, 0.999), eps=1e-08, weight_decay=0., amsgrad=True)
    
    train_losses, valid_losses = train(model, num_epochs, X_train, y_train, X_test, y_test, pkl_name, fold, optimizer)
    
    plt.title('Loss Curve')
    plt.ylabel('Loss', fontsize=16)
    plt.xlabel('Epoch', fontsize=16)
    plt.plot(train_losses[:],'b')
    plt.plot(valid_losses[:],'r')
    plt.legend(['Training loss','Validation loss'])
    plt.show()

[*] model training - ADCRNN


0:   0%|          | 0/500 [00:00<?, ?it/s]

Epoch 0: iteration 34/35
Epoch 0 --- loss:2.349, Epoch accuracy:14.12%, Validation loss:2.105, Validation accuracy:13.00%
Epoch 1: iteration 34/35
Epoch 1 --- loss:2.360, Epoch accuracy:11.67%, Validation loss:2.223, Validation accuracy:14.00%
Epoch 2: iteration 34/35
Epoch 2 --- loss:2.324, Epoch accuracy:12.89%, Validation loss:2.207, Validation accuracy:13.33%
Epoch 3: iteration 34/35
Epoch 3 --- loss:2.294, Epoch accuracy:13.33%, Validation loss:2.194, Validation accuracy:13.67%
Epoch 4: iteration 34/35
Epoch 4 --- loss:2.310, Epoch accuracy:11.14%, Validation loss:2.191, Validation accuracy:14.00%
Epoch 5: iteration 34/35
Epoch 5 --- loss:2.345, Epoch accuracy:13.16%, Validation loss:2.185, Validation accuracy:13.33%
Epoch 6: iteration 34/35
Epoch 6 --- loss:2.282, Epoch accuracy:14.56%, Validation loss:2.182, Validation accuracy:14.00%
Epoch 7: iteration 34/35
Epoch 7 --- loss:2.266, Epoch accuracy:14.30%, Validation loss:2.179, Validation accuracy:15.67%
Epoch 8: iteration 34/35

Epoch 67: iteration 34/35
Epoch 67 --- loss:2.128, Epoch accuracy:17.63%, Validation loss:2.060, Validation accuracy:18.00%
Epoch 68: iteration 34/35
Epoch 68 --- loss:2.148, Epoch accuracy:15.88%, Validation loss:2.063, Validation accuracy:18.33%
Epoch 69: iteration 34/35
Epoch 69 --- loss:2.122, Epoch accuracy:17.11%, Validation loss:2.065, Validation accuracy:20.00%
Epoch 70: iteration 34/35
Epoch 70 --- loss:2.122, Epoch accuracy:16.75%, Validation loss:2.063, Validation accuracy:16.33%
Epoch 71: iteration 34/35
Epoch 71 --- loss:2.142, Epoch accuracy:16.14%, Validation loss:2.053, Validation accuracy:18.67%
Epoch 72: iteration 34/35
Epoch 72 --- loss:2.106, Epoch accuracy:18.16%, Validation loss:2.060, Validation accuracy:16.67%
Epoch 73: iteration 34/35
Epoch 73 --- loss:2.158, Epoch accuracy:15.00%, Validation loss:2.068, Validation accuracy:15.67%
Epoch 74: iteration 34/35
Epoch 74 --- loss:2.143, Epoch accuracy:16.05%, Validation loss:2.064, Validation accuracy:15.00%
Epoch 75

Epoch 133: iteration 34/35
Epoch 133 --- loss:2.086, Epoch accuracy:17.54%, Validation loss:2.027, Validation accuracy:18.67%
Epoch 134: iteration 34/35
Epoch 134 --- loss:2.062, Epoch accuracy:19.56%, Validation loss:2.009, Validation accuracy:22.67%
Epoch 135: iteration 34/35
Epoch 135 --- loss:2.088, Epoch accuracy:17.89%, Validation loss:2.008, Validation accuracy:21.67%
Epoch 136: iteration 34/35
Epoch 136 --- loss:2.048, Epoch accuracy:20.26%, Validation loss:2.029, Validation accuracy:21.33%
Epoch 137: iteration 34/35
Epoch 137 --- loss:2.044, Epoch accuracy:19.91%, Validation loss:2.015, Validation accuracy:20.67%
Epoch 138: iteration 34/35
Epoch 138 --- loss:2.073, Epoch accuracy:19.47%, Validation loss:2.017, Validation accuracy:21.33%
Epoch 139: iteration 34/35
Epoch 139 --- loss:2.049, Epoch accuracy:18.86%, Validation loss:2.011, Validation accuracy:19.67%
Epoch 140: iteration 34/35
Epoch 140 --- loss:2.075, Epoch accuracy:19.82%, Validation loss:2.011, Validation accuracy

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import seaborn as sn
import pandas as pd

device = 'cuda'
model = Model(num_emotions=len(emotions_dict)).to('cuda')

test_accs = []

for fold in range(5):
    pkl_name = f'./model/{data_name}-{sub_name}-{fold}.pkl'
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-6, betas=(0.9, 0.999), eps=1e-08, weight_decay=0., amsgrad=True)
    
    load_checkpoint(optimizer, model, pkl_name)
    
    validate = make_validate_fnc(model, criterion)
    
    cur_fold_path = os.path.join('5-CV', 'fold'+str(fold)+'.npy')
    X_train, y_train, X_test, y_test = get_fold_dataset(cur_fold_path)
    
    X_test_tensor = torch.tensor(X_test,device=device).float()
    X_test_tensor = X_test_tensor.permute(0, 3, 1, 2)
    y_test_tensor = torch.tensor(y_test,dtype=torch.long,device=device)
    
    test_loss, test_acc, predicted_emotions = validate(X_test_tensor, y_test_tensor)
    print(f'[*] {fold} Test accuracy is {test_acc:.2f}%')
    test_accs.append(test_acc)
    
    predicted_emotions = predicted_emotions.cpu().numpy()
    # use labels from test set
    emotions_groundtruth = y_test_tensor.cpu().numpy()
    
    # build confusion matrix and normalized confusion matrix
    conf_matrix = confusion_matrix(emotions_groundtruth, predicted_emotions)
    conf_matrix_norm = confusion_matrix(emotions_groundtruth, predicted_emotions,normalize='true')

    # set labels for matrix axes from emotions
    emotion_names = [emotion for emotion in emotions_dict.values()]

    # make a confusion matrix with labels using a DataFrame
    confmatrix_df = pd.DataFrame(conf_matrix, index=emotion_names, columns=emotion_names)
    confmatrix_df_norm = pd.DataFrame(conf_matrix_norm, index=emotion_names, columns=emotion_names)

    # plot confusion matrices
    plt.figure(figsize=(16,6))
    sn.set(font_scale=1.8) # emotion label and title size
    plt.subplot(1,2,1)
    plt.title('Confusion Matrix')
    sn.heatmap(confmatrix_df, annot=True, annot_kws={"size": 18}) #annot_kws is value font
    plt.subplot(1,2,2)
    plt.title('Normalized Confusion Matrix')
    sn.heatmap(confmatrix_df_norm, annot=True, annot_kws={"size": 13}) #annot_kws is value font

    plt.show()

In [None]:
def Average(lst):
    return sum(lst) / len(lst)

print('[*] K-Fold training done !')
print(f'[*] Best fold acc : {max(test_accs):.2f}%')
print(f'[*] Worst fold acc : {min(test_accs):.2f}%')
print(f'[*] Average fold acc : {Average(test_accs):.2f}%')

In [None]:
from sklearn.metrics import classification_report

device = torch.device('cpu')
model = model.to(device)

test_accs = []

for fold in range(5):
    pkl_name = f'./model/{data_name}-{sub_name}-{fold}.pkl'
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-6, betas=(0.9, 0.999), eps=1e-08, weight_decay=0., amsgrad=True)
    
    load_checkpoint(optimizer, model, pkl_name)
    
    validate = make_validate_fnc(model, criterion)
    
    cur_fold_path = os.path.join('5-CV', 'fold'+str(fold)+'.npy')
    X_train, y_train, X_test, y_test = get_fold_dataset(cur_fold_path)
    
    X_test_tensor = torch.tensor(X_test).float()
    X_test_tensor = X_test_tensor.permute(0, 3, 1, 2)
    y_test_tensor = torch.tensor(y_test, dtype=torch.long)
    
    test_loss, test_acc, predicted_emotions = validate(X_test_tensor, y_test_tensor)
    print(f'[*] {fold} Test accuracy is {test_acc:.2f}%')
    test_accs.append(test_acc)
    
    predicted_emotions = predicted_emotions.cpu().numpy()
    # use labels from test set
    emotions_groundtruth = y_test_tensor

    print(classification_report(emotions_groundtruth, predicted_emotions))