In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils as util
from sklearn.model_selection import KFold
import numpy as np
import os
import csv

# Data Preprocessing

In [2]:
data_labels, lexical_data, acoustic_data, visual_data = [], [], [], []
reader = csv.reader(open("dataset.csv"), delimiter=',')
next(reader)

for row in reader:
    visual_data.append(np.load(row[2][1:]))
    acoustic_data.append(np.load(row[3][1:]))
    lexical_data.append(np.load(row[4][1:]))
    data_labels.append(int(row[5]))

# Labels
data_labels = torch.from_numpy(np.asarray(data_labels))

 
# Lexical Data
lexical_data = torch.from_numpy(np.asarray(lexical_data))
lexical_dataset = util.data.TensorDataset(lexical_data, data_labels)

# Acoustic Data - first pad, then convert into tensor
acoustic_max = max([len(tens) for tens in acoustic_data])
for i, tens in enumerate(acoustic_data):
    dist = acoustic_max - len(tens)
    tens = np.pad(tens, pad_width=[(0,dist), (0,0)], mode='constant')
    acoustic_data[i] = tens
    
acoustic_data = torch.from_numpy(np.asarray(acoustic_data))
acoustic_dataset = util.data.TensorDataset(acoustic_data, data_labels)


# Visual Data - first pad, then convert into tensor
visual_max = max([len(tens) for tens in visual_data])
for i, tens in enumerate(visual_data):
    dist = visual_max - len(tens)
    tens = np.pad(tens, pad_width=[(0,dist), (0,0)], mode='constant')
    visual_data[i] = tens
    
visual_data = torch.from_numpy(np.asarray(visual_data)).type('torch.DoubleTensor') # need to explicitly cast as double
visual_dataset = util.data.TensorDataset(visual_data, data_labels)



In [52]:
print('Visual Data Shape: ' + str(visual_data.shape))
print('Acoustic Data Shape: ' +  str(acoustic_data.shape))
print('Lexical Data Shape: ' +  str(lexical_data.shape))
print('Labels Shape: ' +  str(data_labels.shape))

Visual Data Shape: torch.Size([1336, 716, 2048])
Acoustic Data Shape: torch.Size([1336, 24, 128])
Lexical Data Shape: torch.Size([1336, 768])
Labels Shape: torch.Size([1336])


# Lexical Model

## Model Definition

In [11]:
class LexicalModel(nn.Module):
    def __init__(self):
        super(LexicalModel, self).__init__()
        self.fc1 = nn.Linear(768, 230)
        self.batch1 = nn.BatchNorm1d(230)
        self.fc2 = nn.Linear(230, 50)
        self.batch2 = nn.BatchNorm1d(50)
        self.fc3 = nn.Linear(50, 4)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.5)
      
    # batchnorm wrecks this for some reason
    def forward(self, x):
        x = self.fc1(x)
#         x = self.batch1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
#         x = self.batch2(x)
        x = self.relu(x)
        x = self.fc3(x)
        return x

## Training and Testing Lexical Model

In [12]:
lex = LexicalModel()

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(lex.parameters(), lr=1e-4)

k_folds = 10
num_epochs = 15

k_fold_results = {}

kfold = KFold(n_splits=k_folds, shuffle=True)


for fold, (train_ids, test_ids) in enumerate(kfold.split(lexical_dataset)):
    print(f"Fold {fold}")
    print('----------------------------')
    
    train_subsampler = torch.utils.data.SubsetRandomSampler(train_ids)
    test_subsampler = torch.utils.data.SubsetRandomSampler(test_ids)
    lexical_trainloader = torch.utils.data.DataLoader(lexical_dataset, batch_size=10,
                                          sampler=train_subsampler)
    lexical_testloader = torch.utils.data.DataLoader(lexical_dataset, batch_size=10,
                                          sampler=test_subsampler)
    
    for epoch in range(num_epochs):  # loop over the dataset multiple times


        running_loss = 0.0
        for i, data in enumerate(lexical_trainloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data
            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = lex(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            if i % 100 == 99:    # print every 200 mini-batches
                print('[%d, %5d] loss: %.3f' %
                      (epoch + 1, i + 1, running_loss / 100))
                running_loss = 0.0

    print('Finished Training - Testing Begins')
    
    correct, total = 0,0
    
    with torch.no_grad():
        # Iterate over the test data and generate predictions
        for i, data in enumerate(lexical_testloader, 0):

            # Get inputs
            inputs, targets = data

            # Generate outputs
            outputs = lex(inputs)

            # Set total and correct
            _, predicted = torch.max(outputs.data, 1)
            total += targets.size(0)
            correct += (predicted == targets).sum().item()
        # Print accuracy
        print('Accuracy for fold %d: %d %%' % (fold, 100.0 * correct / total))
        print('--------------------------------')
        k_fold_results[fold] = 100.0 * (correct / total)
    
    # Print fold results
    print(f'K-FOLD CROSS VALIDATION RESULTS FOR {k_folds} FOLDS')
    print('--------------------------------')
    sum = 0.0
    for key, value in k_fold_results.items():
        print(f'Fold {key}: {value} %')
        sum += value
    print(f'Average: {sum/len(k_fold_results.items())} %')

Fold 0
----------------------------
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
[1,   100] loss: 1.360
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
[2,   100] loss: 1.165
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59


KeyboardInterrupt: 

# Acoustic Model

In [24]:
class AcousticModel(nn.Module):
    def __init__(self):
        super(AcousticModel, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=24, kernel_size=1, out_channels=16)
        self.conv2 = nn.Conv1d(in_channels=16, kernel_size=1, out_channels=8)
#         self.pool = nn.MaxPool2d(2, 2)
#         self.conv2 = nn.Conv2d(6, 16, 5)
#         self.fc1 = nn.Linear(16 * 5 * 5, 120)
#         self.fc2 = nn.Linear(120, 84)
#         self.fc3 = nn.Linear(84, 10)
        self.flatten = nn.Flatten()
        
        self.fc1 = nn.Linear(1024, 128)
        self.batch1 = nn.BatchNorm1d(16)
        self.fc2 = nn.Linear(128, 32)
        self.batch2 = nn.BatchNorm1d(8)
        self.fc3 = nn.Linear(32, 4)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.5)
      
    # batchnorm wrecks this for some reason
    def forward(self, x):
        x = self.conv1(x)
        x = self.batch1(x)
        x = self.relu(x)
        
        x = self.conv2(x)
        x = self.batch2(x)
        x = self.relu(x)
        
        x = self.flatten(x)
#         print(x.shape)
        x = self.fc1(x)
#         print(x.shape)
#         x = self.batch1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
#         x = self.batch2(x)
        x = self.relu(x)
        x = self.fc3(x)
#         print(x.shape)
        return x

## Training Acoustic Model

In [74]:
acoust = AcousticModel()

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(acoust.parameters(), lr=1e-4)

k_folds = 10
num_epochs = 15

k_fold_results = {}

kfold = KFold(n_splits=k_folds, shuffle=True)


for fold, (train_ids, test_ids) in enumerate(kfold.split(acoustic_dataset)):
    print(f"Fold {fold}")
    print('----------------------------')
    
    train_subsampler = torch.utils.data.SubsetRandomSampler(train_ids)
    test_subsampler = torch.utils.data.SubsetRandomSampler(test_ids)
    acoustic_trainloader = torch.utils.data.DataLoader(acoustic_dataset, batch_size=10,
                                          sampler=train_subsampler)
    acoustic_testloader = torch.utils.data.DataLoader(acoustic_dataset, batch_size=10,
                                          sampler=test_subsampler)
    
    for epoch in range(num_epochs):  # loop over the dataset multiple times


        running_loss = 0.0
        for i, data in enumerate(acoustic_trainloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data
            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = acoust(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            if i % 100 == 99:    # print every 200 mini-batches
                print('[%d, %5d] loss: %.3f' %
                      (epoch + 1, i + 1, running_loss / 100))
                running_loss = 0.0

    print('Finished Training - Testing Begins')
    
    correct, total = 0,0
    
    with torch.no_grad():
        # Iterate over the test data and generate predictions
        for i, data in enumerate(acoustic_testloader, 0):

            # Get inputs
            inputs, targets = data

            # Generate outputs
            outputs = acoust(inputs)

            # Set total and correct
            _, predicted = torch.max(outputs.data, 1)
            total += targets.size(0)
            correct += (predicted == targets).sum().item()
        # Print accuracy
        print('Accuracy for fold %d: %d %%' % (fold, 100.0 * correct / total))
        print('--------------------------------')
        k_fold_results[fold] = 100.0 * (correct / total)
    
    # Print fold results
    print(f'K-FOLD CROSS VALIDATION RESULTS FOR {k_folds} FOLDS')
    print('--------------------------------')
    sum = 0.0
    for key, value in k_fold_results.items():
        print(f'Fold {key}: {value} %')
        sum += value
    print(f'Average: {sum/len(k_fold_results.items())} %')

Fold 0
----------------------------
[1,   100] loss: 1.355
[2,   100] loss: 1.276
[3,   100] loss: 1.206
[4,   100] loss: 1.152
[5,   100] loss: 1.102
[6,   100] loss: 1.066
[7,   100] loss: 1.032


KeyboardInterrupt: 

# Visual Model

In [22]:
class VisualModel(nn.Module):
    def __init__(self):
        super(VisualModel, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=716, kernel_size=1, out_channels=8).double()
        self.batch1 = nn.BatchNorm1d(8)
        self.conv2 = nn.Conv1d(in_channels=8, kernel_size=1, out_channels=1).double()
        self.batch2 = nn.BatchNorm1d(1)

        self.flatten = nn.Flatten()
        
        self.fc1 = nn.Linear(2048, 128)

        self.fc2 = nn.Linear(128, 16)
        
        self.fc3 = nn.Linear(16, 4)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.5)
      
    # batchnorm wrecks this for some reason
    def forward(self, x):
        x = self.conv1(x)
        x = self.batch1(x)
        x = self.relu(x)
        
        x = self.conv2(x)
        x = self.batch2(x)
        x = self.relu(x)
        
        x = self.flatten(x)
        
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        
        x = self.fc2(x)
        x = self.relu(x)
        
        x = self.fc3(x)
        return x

In [23]:
visual = VisualModel().double()

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(visual.parameters(), lr=1e-4)

k_folds = 10
num_epochs = 15

k_fold_results = {}

kfold = KFold(n_splits=k_folds, shuffle=True)


for fold, (train_ids, test_ids) in enumerate(kfold.split(visual_dataset)):
    print(f"Fold {fold}")
    print('----------------------------')
    
    train_subsampler = torch.utils.data.SubsetRandomSampler(train_ids)
    test_subsampler = torch.utils.data.SubsetRandomSampler(test_ids)
    visual_trainloader = torch.utils.data.DataLoader(visual_dataset, batch_size=10,
                                          sampler=train_subsampler)
    visual_testloader = torch.utils.data.DataLoader(visual_dataset, batch_size=10,
                                          sampler=test_subsampler)
    
    for epoch in range(num_epochs):  # loop over the dataset multiple times


        running_loss = 0.0
        for i, data in enumerate(visual_trainloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data
            
            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = visual(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
#             print(i)
            
            if i % 100 == 99:    # print every 200 mini-batches
                print('[%d, %5d] loss: %.3f' %
                      (epoch + 1, i + 1, running_loss / 100))
                running_loss = 0.0

    print('Finished Training - Testing Begins')
    
    correct, total = 0,0
    
    with torch.no_grad():
        # Iterate over the test data and generate predictions
        for i, data in enumerate(visual_testloader, 0):

            # Get inputs
            inputs, targets = data

            # Generate outputs
            outputs = visual(inputs)

            # Set total and correct
            _, predicted = torch.max(outputs.data, 1)
            total += targets.size(0)
            correct += (predicted == targets).sum().item()
        # Print accuracy
        print('Accuracy for fold %d: %d %%' % (fold, 100.0 * correct / total))
        print('--------------------------------')
        k_fold_results[fold] = 100.0 * (correct / total)
    
    # Print fold results
    print(f'K-FOLD CROSS VALIDATION RESULTS FOR {k_folds} FOLDS')
    print('--------------------------------')
    sum = 0.0
    for key, value in k_fold_results.items():
        print(f'Fold {key}: {value} %')
        sum += value
    print(f'Average: {sum/len(k_fold_results.items())} %')

Fold 0
----------------------------
[1,   100] loss: 1.374
[2,   100] loss: 1.324
[3,   100] loss: 1.295
[4,   100] loss: 1.268
[5,   100] loss: 1.217
[6,   100] loss: 1.147
[7,   100] loss: 1.103
[8,   100] loss: 0.981
[9,   100] loss: 0.873
[10,   100] loss: 0.768
[11,   100] loss: 0.670
[12,   100] loss: 0.569
[13,   100] loss: 0.472
[14,   100] loss: 0.396
[15,   100] loss: 0.324
Finished Training - Testing Begins
Accuracy for fold 0: 27 %
--------------------------------
K-FOLD CROSS VALIDATION RESULTS FOR 10 FOLDS
--------------------------------
Fold 0: 27.611940298507463 %
Average: 27.611940298507463 %
Fold 1
----------------------------
[1,   100] loss: 0.470
[2,   100] loss: 0.366
[3,   100] loss: 0.272
[4,   100] loss: 0.251
[5,   100] loss: 0.192
[6,   100] loss: 0.175
[7,   100] loss: 0.126
[8,   100] loss: 0.114
[9,   100] loss: 0.098
[10,   100] loss: 0.092
[11,   100] loss: 0.076
[12,   100] loss: 0.061
[13,   100] loss: 0.062
[14,   100] loss: 0.053
[15,   100] loss: 0

KeyboardInterrupt: 