In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

from torch.autograd import Variable
from torch.utils.data import DataLoader

import numpy as np

import pickle

from datasets import Multimodal_Binary_Dataset
from fusion_model import CP_Tensor_Fusion_Network

import time

In [2]:
def train_cmu_mosi(batch_size=32, epochs=100, lr=.001, max_rank=20, rank_adaptive=True,
                   warmup_epochs=50, kl_multiplier=1e-4, no_kl_epochs=5, accelerated=True):
    """Train a ``CP_Tensor_Fusion_Network`` on the pickled CMU-MOSI data.

    Loads the dataset pickle, builds the train/valid datasets and loaders,
    and optimizes the model with Adam against ``nn.BCEWithLogitsLoss``.
    After every epoch the training loss *summed* over that epoch's
    mini-batches (not averaged) is printed.

    Args:
        batch_size: Mini-batch size of the training loader.
        epochs: Number of passes over the training set.
        lr: Learning rate passed to Adam.
        max_rank: Forwarded to ``CP_Tensor_Fusion_Network``.
        rank_adaptive: Forwarded to ``CP_Tensor_Fusion_Network``.
        warmup_epochs: Accepted for interface compatibility; not used here.
        kl_multiplier: Accepted for interface compatibility; not used here.
        no_kl_epochs: Accepted for interface compatibility; not used here.
        accelerated: Accepted for interface compatibility; not used here.

    Returns:
        None. Progress is reported through ``print`` only.
    """

    # load dataset file
    # The context manager closes the handle even if unpickling fails.
    # NOTE: pickle.load can execute arbitrary code; only point this at a
    # trusted dataset file.
    with open('../../dataset/cmu-mosi/mosi_20_seq_data.pkl', 'rb') as file:
        data = pickle.load(file)

    # prepare the datasets and data loaders
    train_set = Multimodal_Binary_Dataset(data['train']['text'], data['train']['audio'],
                                  data['train']['vision'], data['train']['labels'])
    valid_set = Multimodal_Binary_Dataset(data['valid']['text'], data['valid']['audio'],
                                  data['valid']['vision'], data['valid']['labels'])

    train_dataloader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
    # NOTE(review): the validation loader is prepared but never consumed below;
    # no validation pass is run yet.
    valid_dataloader = DataLoader(valid_set, batch_size=len(valid_set))

    # set up model
    # Read the per-modality input sizes from the first training sample
    # (fetched once instead of once per modality).
    sample_features = train_set[0][0]
    input_sizes = (sample_features['audio'].shape[0], sample_features['vision'].shape[0],
                   sample_features['text'].shape[1])
    hidden_sizes = (32, 32, 128)
    output_size = 1

    model = CP_Tensor_Fusion_Network(input_sizes, hidden_sizes, output_size, max_rank,
                                     rank_adaptive)
    # set up training
    DTYPE = torch.FloatTensor
    optimizer = optim.Adam(list(model.parameters()), lr=lr)
    criterion = nn.BCEWithLogitsLoss()

    # train
    for e in range(1, epochs + 1):
        model.train()
        train_loss = 0.0
        for batch in train_dataloader:
            model.zero_grad()

            features, label = batch

            # Plain tensors are used directly: the torch.autograd.Variable
            # wrapper is deprecated and tensors already default to
            # requires_grad=False.
            x_a = features['audio'].float().type(DTYPE)
            x_v = features['vision'].float().type(DTYPE)
            x_t = features['text'].float().type(DTYPE)
            # Flatten labels to (batch, 1) to match the output_size=1 model output.
            y = label.view(-1, 1).float().type(DTYPE)

            output = model([x_a, x_v, x_t])
            nll_loss = criterion(output, y)
            nll_loss.backward()
            optimizer.step()
            train_loss += nll_loss.item()

        # Summed (not mean) loss over this epoch's mini-batches.
        print(train_loss)

In [3]:
# Run training with the default hyperparameters. Each printed line is one
# epoch's training loss summed over its mini-batches.
train_cmu_mosi()



27.88590008020401
23.828711956739426
20.843992859125137
18.409786969423294
16.602273672819138
14.10087388753891
10.716878190636635
9.64777647703886
8.159154407680035
6.987912703305483
8.234772942960262
6.850535240024328
6.31910882377997
2.016575005836785
1.274231556802988
1.8493509874679148
5.899216145160608
1.9969846843741834
2.1268945937918033
0.620433097705245
1.121006367407972
1.554528527369257
0.5599256321511348
0.2029685300312849
0.2411186918270687
0.3446410106380142
0.3302184418619163
0.3213599407026777
1.0710114018711465
1.5202221790932526
0.5968834619416157
0.17277744274178986
0.30268214596799226
3.540458898336965
1.5768700855987845
0.29061798652401194
0.0926775376808564
0.12466045725341246
0.08646023813594184
0.018888536638769438
0.01930521933951468
0.01894114067385999
0.013641328444492729
0.008472226526265558
0.00522177584191752
0.002679475468319481
0.49273183114985386
2.386311451700749
1.0420152026927099
0.30027840349066537
0.06478347786559624
0.04379572102709517
0.37129397

In [48]:
# NOTE(review): in the visible cells `y` is only assigned inside
# train_cmu_mosi, so this shape check presumably relied on leftover kernel
# state from a debugging session — confirm it survives Restart & Run All.
y.shape

torch.Size([32, 1])

In [45]:
# NOTE(review): in the visible cells `label` is only assigned inside
# train_cmu_mosi, so this shape check presumably relied on leftover kernel
# state from a debugging session — confirm it survives Restart & Run All.
label.shape

torch.Size([32, 1, 1])