<a href="https://colab.research.google.com/github/dshoe17/Deep-SER/blob/master/SER_CNN_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
## Imports
import os, sys, glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import cm

import torch
import torchvision
from torchvision import transforms, datasets
import torch.nn as nn
import torch.nn.functional as F

In [0]:
## Mount personal Google Drive
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
## Unzip resampled dataset (assuming zip file is located in the specified directory in Google Drive)
!unzip -q '/content/gdrive/My Drive/new_resamples.zip'

In [0]:
# Function to plot 3-dimensional torch tensors as images
def torch_img(tensor):
  to_pil = torchvision.transforms.ToPILImage()
  img = to_pil(tensor)
  plt.imshow(img)
  plt.show()

In [0]:
# Function to load custom dataset from image folder hierarchy

root = '/content/new_resamples/'

def load_dataset():
  train_dataset = torchvision.datasets.ImageFolder(
      root = root, 
      transform = torchvision.transforms.ToTensor()
  )
  train_loader = torch.utils.data.DataLoader(
      train_dataset,
      batch_size=64,
      num_workers=0,
      shuffle=True
  )
  return train_loader.dataset

In [0]:
# Function to split custom dataset into train and test sets
def get_split(loader, batch=30, prop=0.8):
  
  default_load = lambda data, batch=batch: torch.utils.data.DataLoader(
      data,
      batch_size=batch,
      num_workers=0,
      shuffle=False
  )

  train_ix = int(prop * len(loader))
  test_ix = len(loader) - train_ix
  train, test = torch.utils.data.random_split(loader, [train_ix, test_ix])
  return map(default_load, (train, test))

In [0]:
train_data, test_data = get_split(load_dataset(), batch=30)

In [0]:
## Net class for Log-Mel Spectrograms
class Net(nn.Module):
  def __init__(self):
    super(Net, self).__init__()
    # LFLB Block 1
    self.conv1 = nn.Conv2d(3, 64, 3, padding=1)
    self.bn1 = nn.BatchNorm2d(64)
    self.pool1 = nn.MaxPool2d(2,2)
    
    # LFLB Block 2
    self.conv2 = nn.Conv2d(64, 64, 3, padding=1)
    self.bn2 = nn.BatchNorm2d(64)
    self.pool2 = nn.MaxPool2d(4,4)

    # LFLB Block 3
    self.conv3 = nn.Conv2d(64, 128, 3, padding=1)
    self.bn3 = nn.BatchNorm2d(128)
    self.pool3 = nn.MaxPool2d(4,4)

    # LFLB Block 4
    self.conv4 = nn.Conv2d(128, 128, 3, padding=1)
    self.bn4 = nn.BatchNorm2d(128)
    self.pool4 = nn.MaxPool2d(4,4)

    # LSTM
    self.lstm = nn.LSTM(input_size=128, hidden_size=256)

    # FC
    self.fc = nn.Linear(256, 7)

  def forward(self,x):
    # LFLB Block 1
    x = self.bn1(self.conv1(x))
    x = self.pool1(F.elu(x))

    # LFLB Block 2
    x = self.bn2(self.conv2(x))
    x = self.pool2(F.elu(x))

    # LFLB Block 3
    x = self.bn3(self.conv3(x))
    x = self.pool3(F.elu(x))

    # LFLB Block 4
    x = self.bn4(self.conv4(x))
    x = self.pool4(F.elu(x))

    # LSTM
    x = x.view(x.size()[0], 128, -1)
    x = x.transpose(1,2)
    x = x.transpose(0,1).contiguous()
    x, _ = self.lstm(x)
    x = x.squeeze()

    # FC
    x = self.fc(x)

    return F.log_softmax(x, dim=1)

net = Net()
print(net)

Net(
  (conv1): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool2): MaxPool2d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (conv3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn3): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool3): MaxPool2d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (conv4): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn4): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool4): MaxPool2d(kernel_size=4, stride=4, padding=0, dilation=1, 

In [0]:
# Enabling CUDA / GPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

net.to(device)

# Resetting CUDA / GPU
# !/opt/bin/nvidia-smi
# !ps -aux|grep python
# !kill -9 #124

Net(
  (conv1): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool2): MaxPool2d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (conv3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn3): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool3): MaxPool2d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (conv4): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn4): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool4): MaxPool2d(kernel_size=4, stride=4, padding=0, dilation=1, 

In [0]:
## Training block
import torch.optim as optim
optimizer = optim.Adam(net.parameters(), lr=0.01)

EPOCHS = 30  # Number of epochs to train for
n_batch = len(train_data)

## Ensures step size that decays by factor of 0.1 every 10 epochs
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=n_batch*10)

import time
start = time.time()

for epoch in range(EPOCHS):
  print('EPOCH:', epoch)
  for i, data in enumerate(train_data):
    print(i)
    X,y = data
    X,y = X.to(device), y.to(device)
    net.zero_grad()
    output = net(X)
    loss = F.nll_loss(output, y)
    loss.backward()
    optimizer.step()
    scheduler.step()
  delta = time.time() - start
  print(delta)
  print(loss)

EPOCH: 0
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
7.9401092529296875
tensor(1.2228, device='cuda:0', grad_fn=<NllLossBackward>)
EPOCH: 1
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
15.911137580871582
tensor(1.0931, device='cuda:0', grad_fn=<NllLossBackward>)
EPOCH: 2
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
2

In [0]:
## Training Accuracy
correct = 0
total = 0

with torch.no_grad():
  for data in train_data:
    X,y = data
    X,y = X.to(device), y.to(device)  # CUDA / GPU transformation
    output = net(X)
    for idx, i in enumerate(output):
      if torch.argmax(i) == y[idx]:
        correct += 1
      total += 1
round(correct/total,3)

0.998

In [0]:
## Testing Accuracy
correct = 0
total = 0

with torch.no_grad():
  for data in test_data:
    X,y = data
    X,y = X.to(device), y.to(device)  # CUDA / GPU transformation
    output = net(X)
    for idx, i in enumerate(output):
      if torch.argmax(i) == y[idx]:
        correct += 1
      total += 1
round(correct/total,3)

0.874

In [0]:
## Train Data
# with torch.no_grad():
#   train_pairs = [(torch.argmax(i).item(), y[idx].item()) for X,y in train_data for idx,i in enumerate(net(X))]
#   preds,targets = zip(*train_pairs)

# CUDA / GPU version
with torch.no_grad():
  train_pairs = [(torch.argmax(i).item(), y.to(device)[idx].item()) for X,y in train_data for idx,i in enumerate(net(X.to(device)))]
  preds,targets = zip(*train_pairs)

In [0]:
## Test Data
# with torch.no_grad():
#   train_pairs = [(torch.argmax(i).item(), y[idx].item()) for X,y in test_data for idx,i in enumerate(net(X))]
#   preds,targets = zip(*train_pairs)

# CUDA / GPU version
with torch.no_grad():
  train_pairs = [(torch.argmax(i).item(), y.to(device)[idx].item()) for X,y in test_data for idx,i in enumerate(net(X.to(device)))]
  preds,targets = zip(*train_pairs)

In [0]:
## Confusion matrix for results
from sklearn.metrics import confusion_matrix

confusion_matrix(preds, targets)

array([[ 83,   0,   1,   6,   7,   0,   0],
       [  0,  77,   1,   0,   0,  18,   5],
       [  0,   6, 101,   0,   1,   1,   0],
       [  5,   1,   1,  85,   2,   0,   1],
       [  2,   0,   5,   7,  84,   1,   0],
       [  0,  10,   1,   1,   3,  86,   0],
       [  0,   1,   0,   1,   0,   1, 100]])