In [19]:
import pickle
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable
import torch.nn as nn
import torch.optim as optim
import torch
import numpy as np
from model import SubNet, TextSubNet, TFN
#from datasets import MultimodalDataset

In [20]:
def get_cmu_mosi_dataset(path='../../dataset/cmu-mosi/mosi_20_seq_data.pkl'):
    '''
    Load the pickled CMU-MOSI data and wrap every split in a MultimodalDataset.

    All arrays are converted to float32 tensors. Text is converted as-is,
    audio and vision are averaged over dim 1, and dim 1 of the labels is
    squeezed away.

    args:
        path: location of the pickle file. The unpickled object must provide
            the keys 'train', 'valid' and 'test', each holding the keys
            'text', 'audio', 'vision' and 'labels'.

    Returns:
        (train_set, valid_set, test_set): one MultimodalDataset per split
    '''
    # NOTE: pickle.load can execute arbitrary code while unpickling, so only
    # point `path` at a file from a trusted source.
    # The context manager closes the file even if unpickling raises.
    with open(path, 'rb') as file:
        data = pickle.load(file)

    def _build_split(name):
        # Convert one split; the three splits share exactly the same recipe.
        # features: (batch_size, seq_length, input_size)
        # audio and vision features are averaged across time to (batch_size, input_size)
        # labels: (batch_size, 1)
        split = data[name]
        text = torch.tensor(split['text'], dtype=torch.float32)
        audio = torch.tensor(split['audio'], dtype=torch.float32).mean(dim=1)
        vision = torch.tensor(split['vision'], dtype=torch.float32).mean(dim=1)
        labels = torch.tensor(split['labels'], dtype=torch.float32).squeeze(1)
        return MultimodalDataset(text, audio, vision, labels)

    train_set = _build_split('train')
    valid_set = _build_split('valid')
    test_set = _build_split('test')

    return train_set, valid_set, test_set

In [21]:
class MultimodalDataset(Dataset):
    '''
    Map-style dataset for CMU-MOSI that keeps the text, audio and vision
    features of every sample next to its label
    '''
    def __init__(self, text, audio, vision, labels):
        '''
        Stores the four containers unchanged; sample i is entry i of each.

        args:
            text: text modality features, N entries of shape
                (seq. length, text_input_size)
            audio: audio modality features, N entries; get_cmu_mosi_dataset
                passes them already averaged over time, i.e. with shape
                (N, audio_input_size)
            vision: vision modality features, N entries; get_cmu_mosi_dataset
                passes them already averaged over time, i.e. with shape
                (N, vision_input_size)
            labels: N labels ranging from -3 to 3
        '''
        self.text, self.audio, self.vision, self.labels = text, audio, vision, labels

    def __len__(self):
        '''
        Number of samples, taken from the labels container
        '''
        return len(self.labels)

    def __getitem__(self, idx):
        '''
        Looks up entry idx in every container.

        Returns:
            (text, audio, vision, label) of the requested sample, in that order
        '''
        return self.text[idx], self.audio[idx], self.vision[idx], self.labels[idx]

In [22]:
# Build the train / validation / test datasets from the default pickle path
train_set, valid_set, test_set = get_cmu_mosi_dataset()

In [23]:
# Training configuration
batch_size = 32
epochs = 50
# settings from https://github.com/Justin1904/TensorFusionNetworks/blob/master/train.py
# NOTE(review): the tuples look ordered (audio, vision, text), matching the
# model(audio, vision, text) call used for training — confirm against model.py
input_dims = (5, 20, 300)
hidden_dims = (4, 16, 128)
text_out = 64
dropouts = (0.3, 0.3, 0.3, 0.3)
post_fusion_dim = 32

# Seed PyTorch's global RNG before the model is built, so that weight
# initialisation, dropout masks and DataLoader shuffling repeat on every
# Restart & Run All (for the same PyTorch build on CPU).
SEED = 0
torch.manual_seed(SEED)

model = TFN(input_dims=input_dims, hidden_dims=hidden_dims, text_out=text_out,
            dropouts=dropouts, post_fusion_dim=post_fusion_dim)

In [24]:
# Training samples are reshuffled every epoch and served in mini-batches.
# Validation and test are each served as a single batch holding the whole
# split, in stored order.
train_dataloader = DataLoader(dataset=train_set, batch_size=batch_size, shuffle=True)
valid_dataloader = DataLoader(dataset=valid_set, batch_size=len(valid_set), shuffle=False)
test_dataloader = DataLoader(dataset=test_set, batch_size=len(test_set), shuffle=False)

In [25]:
# Mean absolute error between the model output and the label
criterion = nn.L1Loss()
# The first two entries of model.parameters() are kept out of the optimizer.
# NOTE(review): presumably these are TFN's fixed output range/shift parameters,
# as in the train.py this notebook takes its settings from — confirm in model.py
optimizer = optim.Adam(list(model.parameters())[2:])
# Lower the learning rate once the validation loss has stopped improving
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=2, verbose=True)
for e in range(epochs):
    print('Epoch {}'.format(e))
    train_loss = 0.0
    model.train()
    for text, audio, vision, label in train_dataloader:
        model.zero_grad()
        output = model(audio, vision, text)
        loss = criterion(output, label)
        loss.backward()
        train_loss += loss.item()
        optimizer.step()
    # train_loss is the SUM of the per-batch losses, so it is not on the same
    # scale as the validation loss printed below
    print(train_loss)

    model.eval()
    # No gradients are needed for evaluation; disabling autograd avoids building
    # a graph for the batch that holds the whole validation split.
    with torch.no_grad():
        # valid_dataloader serves the whole split as one batch, so the value
        # left after this loop is the loss over the full validation set
        for text, audio, vision, label in valid_dataloader:
            output = model(audio, vision, text)
            valid_loss = criterion(output, label).item()
    scheduler.step(valid_loss)
    print(valid_loss)

Epoch 0
52.85571324825287
1.2533843517303467
Epoch 1
42.93867760896683
1.1115158796310425
Epoch 2
37.203013837337494
1.0741556882858276
Epoch 3
35.8412299156189
1.0383533239364624
Epoch 4
32.361928820610046
1.1015052795410156
Epoch 5
34.682821810245514
1.039508581161499
Epoch 6
27.491903245449066
0.9781687259674072
Epoch 7
26.280299723148346
0.9494391679763794
Epoch 8
25.401515871286392
0.9347696900367737
Epoch 9
24.185295909643173
0.9859669208526611
Epoch 10
23.77444839477539
0.9533861875534058
Epoch 11
22.39775425195694
Epoch    12: reducing learning rate of group 0 to 1.0000e-04.
0.9566575884819031
Epoch 12
19.511213064193726
0.9454945921897888
Epoch 13
18.544819474220276
0.9480131268501282
Epoch 14
19.425233066082
Epoch    15: reducing learning rate of group 0 to 1.0000e-05.
0.9447203874588013
Epoch 15
18.77895399928093
0.9439171552658081
Epoch 16
18.106388211250305
0.9436770677566528
Epoch 17
18.676059514284134
Epoch    18: reducing learning rate of group 0 to 1.0000e-06.
0.943499

In [17]:
def train_cmu_mosi(batch_size=32, epochs=100, lr=.001, max_rank=5, rank_adaptive=False,
                   warmup_epochs=50, kl_multiplier=1e-4, no_kl_epochs=5, accelerated=True):
    '''
    Train a TFN on CMU-MOSI with an L1 loss and print the losses of every epoch.

    The data comes from get_cmu_mosi_dataset() with its default path, so each
    batch is the (text, audio, vision, label) tuple produced by
    MultimodalDataset.

    args:
        batch_size: mini-batch size of the training loader
        epochs: number of passes over the training split
        lr: learning rate handed to Adam
        max_rank, rank_adaptive, warmup_epochs, kl_multiplier, no_kl_epochs,
        accelerated: not used by this function; kept so existing calls keep
            working (they appear to be left over from a
            CP_Tensor_Fusion_Network variant of this routine)

    Returns:
        None. Per epoch it prints the summed training loss and the validation
        mean absolute error.
    '''
    # prepare the datasets and data loaders (the test split is not needed here)
    train_set, valid_set, _ = get_cmu_mosi_dataset()

    train_dataloader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
    # the whole validation split is served as one batch
    valid_dataloader = DataLoader(valid_set, batch_size=len(valid_set))

    # set up model: read the feature sizes off the first training sample
    # (text is (seq. length, text_size); audio and vision are 1-D vectors)
    sample_text, sample_audio, sample_vision, _ = train_set[0]
    input_sizes = (sample_audio.shape[0], sample_vision.shape[0], sample_text.shape[1])
    hidden_sizes = (32, 32, 128)

    # NOTE(review): post_fusion_dim=1 differs a lot from the 32 used by the
    # stand-alone training cell of this notebook — confirm it is intentional
    model = TFN(input_dims=input_sizes, hidden_dims=hidden_sizes, text_out=128,
                dropouts=(0.3, 0.3, 0.3, 0.3), post_fusion_dim=1)

    # set up training
    # The first two entries of model.parameters() are kept out of the optimizer.
    # NOTE(review): presumably these are TFN's fixed output range/shift
    # parameters — confirm in model.py
    optimizer = optim.Adam(list(model.parameters())[2:], lr=lr)
    criterion = nn.L1Loss()

    # train and validate
    for e in range(1, epochs + 1):
        # train
        model.train()
        train_loss = 0.0
        for text, audio, vision, label in train_dataloader:
            model.zero_grad()

            # one label per row, so the target is a (batch, 1) column
            y = label.reshape(-1, 1)

            output = model(audio, vision, text)
            loss = criterion(output, y)

            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        # sum of the per-batch losses of this epoch
        print('Train Loss {:.3f}'.format(train_loss))

        # validate
        model.eval()
        # evaluation needs no gradients; skip building the autograd graph
        with torch.no_grad():
            for text, audio, vision, label in valid_dataloader:
                y = label.reshape(-1, 1)
                output = model(audio, vision, text)
                # single batch, so this is the loss over the whole split
                valid_mae = criterion(output, y).item()

        # the metric is an L1 loss, i.e. a mean absolute error (not an MSE)
        print("Valid MAE {:.3f}".format(valid_mae))

In [18]:
# Run the training routine with its default hyperparameters
train_cmu_mosi()

Train Loss 56.048
Valid MSE 1.432
Train Loss 55.095
Valid MSE 1.426
Train Loss 54.541
Valid MSE 1.421
Train Loss 54.881
Valid MSE 1.419
Train Loss 54.105
Valid MSE 1.417
Train Loss 54.201
Valid MSE 1.416
Train Loss 54.426
Valid MSE 1.413
Train Loss 54.250
Valid MSE 1.413
Train Loss 53.920
Valid MSE 1.413
Train Loss 54.576
Valid MSE 1.413


KeyboardInterrupt: 