In [11]:
import torch
import torch.nn as nn
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable

from sklearn.metrics import mean_squared_error

from model import SubNet, TextSubNet, TFN

import pickle

import numpy as np

import time

In [2]:
class CP_Linear_Function(torch.autograd.Function):
    
    @staticmethod
    def forward(ctx, x_1, x_2, x_3, W_1, W_2, W_3, W_y):
        
        A_1 = x_1 @ W_1
        A_2 = x_2 @ W_2
        A_3 = x_3 @ W_3
        
        A_f = A_1 * A_2 * A_3
        
        y_hat = A_f @ W_y.T
        
        ctx.save_for_backward(x_1, x_2, x_3, W_1, W_2, W_3, W_y, A_1, A_2, A_3, A_f)
        
        return y_hat
    
    @staticmethod
    def backward(ctx, grad_y_hat):
        
        x_1, x_2, x_3, W_1, W_2, W_3, W_y, A_1, A_2, A_3, A_f = ctx.saved_tensors
        
        grad_x_1 = grad_x_2 = grad_x_3 = grad_W_1 = grad_W_2 = grad_W_3 = grad_W_y = None
        
        grad_W_y = grad_y_hat.T @ A_f
        
        grad_A_f = grad_y_hat @ W_y
        
        grad_A_1 = grad_A_f * A_2 * A_3
        grad_A_2 = grad_A_f * A_1 * A_3
        grad_A_3 = grad_A_f * A_1 * A_2
        
        grad_W_1 = x_1.T @ grad_A_1
        grad_W_2 = x_2.T @ grad_A_2
        grad_W_3 = x_3.T @ grad_A_3
        
        grad_x_1 = grad_A_1 @ W_1.T
        grad_x_2 = grad_A_2 @ W_2.T
        grad_x_3 = grad_A_3 @ W_3.T
        
        return grad_x_1, grad_x_2, grad_x_3, grad_W_1, grad_W_2, grad_W_3, grad_W_y

In [3]:
class CP_Linear(nn.Module):
    
    def __init__(self, input_sizes, output_size, rank):
        
        super(CP_Linear, self).__init__()
        
        self.W_1 = nn.Parameter(torch.rand((input_sizes[0], rank)))
        self.W_2 = nn.Parameter(torch.rand((input_sizes[1], rank)))
        self.W_3 = nn.Parameter(torch.rand((input_sizes[2], rank)))
        self.W_y = nn.Parameter(torch.rand((output_size, rank)))
        
    def forward(self, inputs):
        
        return CP_Linear_Function.apply(inputs[0], inputs[1], inputs[2], 
                                        self.W_1, self.W_2, self.W_3, self.W_y)

In [4]:
class CP_Fixed_Rank_Tensor_Fusion_Network(nn.Module):
    
    def __init__(self, input_sizes, hidden_sizes, output_size, rank):
        
        super(CP_Fixed_Rank_Tensor_Fusion_Network, self).__init__()
        
        self.input_sizes = input_sizes
        self.hidden_sizes = hidden_sizes
        self.output_size = output_size
        self.rank = rank
        
        self.audio_subnet = SubNet(input_sizes[0], hidden_sizes[0], dropout=0.3)
        self.video_subnet = SubNet(input_sizes[1], hidden_sizes[1], dropout=0.3)
        self.text_subnet = TextSubNet(input_sizes[2], hidden_sizes[2], hidden_sizes[2], 
                                      dropout=0.3)
        
        fusion_input_shape = (hidden_sizes[0]+1, hidden_sizes[1]+1, hidden_sizes[2]+1)
        self.tensor_fusion_layer = CP_Linear(fusion_input_shape, output_size, rank)
        
    def forward(self, inputs):
        
        z_audio = self.audio_subnet(inputs[0])
        z_video = self.video_subnet(inputs[1])
        z_text = self.text_subnet(inputs[2])
        
        batch_size = z_audio.data.shape[0]
        
        if z_audio.is_cuda:
            DTYPE = torch.cuda.FloatTensor
        else:
            DTYPE = torch.FloatTensor
        
        # 1 in concatenated to each subnet outputs
        z_audio = torch.cat((Variable(torch.ones(batch_size, 1).type(DTYPE), requires_grad=False), z_audio), dim=1)
        z_video = torch.cat((Variable(torch.ones(batch_size, 1).type(DTYPE), requires_grad=False), z_video), dim=1)
        z_text = torch.cat((Variable(torch.ones(batch_size, 1).type(DTYPE), requires_grad=False), z_text), dim=1)
        
        output = self.tensor_fusion_layer([z_audio, z_video, z_text])
        
        return output

In [5]:
class LMF(nn.Module):
    
    def __init__(self, input_sizes, hidden_sizes, output_size, rank):
        
        super(LMF, self).__init__()
        
        self.input_sizes = input_sizes
        self.hidden_sizes = hidden_sizes
        self.output_size = output_size
        self.rank = rank
        
        self.audio_subnet = SubNet(input_sizes[0], hidden_sizes[0], dropout=0.3)
        self.video_subnet = SubNet(input_sizes[1], hidden_sizes[1], dropout=0.3)
        self.text_subnet = TextSubNet(input_sizes[2], hidden_sizes[2], hidden_sizes[2], 
                                      dropout=0.3)
        
        self.W_1 = nn.Parameter(torch.rand((hidden_sizes[0]+1, rank)))
        self.W_2 = nn.Parameter(torch.rand((hidden_sizes[1]+1, rank)))
        self.W_3 = nn.Parameter(torch.rand((hidden_sizes[2]+1, rank)))
        self.W_y = nn.Parameter(torch.rand((output_size, rank)))
        
    def forward(self, inputs):
                
        z_audio = self.audio_subnet(inputs[0])
        z_video = self.video_subnet(inputs[1])
        z_text = self.text_subnet(inputs[2])
        
        batch_size = z_audio.data.shape[0]
        
        if z_audio.is_cuda:
            DTYPE = torch.cuda.FloatTensor
        else:
            DTYPE = torch.FloatTensor
        
        # 1 in concatenated to each subnet outputs
        z_audio = torch.cat((Variable(torch.ones(batch_size, 1).type(DTYPE), requires_grad=False), z_audio), dim=1)
        z_video = torch.cat((Variable(torch.ones(batch_size, 1).type(DTYPE), requires_grad=False), z_video), dim=1)
        z_text = torch.cat((Variable(torch.ones(batch_size, 1).type(DTYPE), requires_grad=False), z_text), dim=1)
        
        A_1 = z_audio @ self.W_1
        A_2 = z_video @ self.W_2
        A_3 = z_text @ self.W_3
        
        A_f = A_1 * A_2 * A_3
        
        output = A_f @ self.W_y.T
        
        return output

In [6]:
class MultimodalDataset(Dataset):
    '''
    Dataset for CMU-MOSI
    '''
    def __init__(self, text, audio, vision, labels):
        '''
        args:
            text: text modality feature of shape (N, seq. length, text_input_size)
            audio: audio modality feature of shape (N, seq. length, audio_input_size)
            vision: vision modality feature of shape (N, seq. length, vision_input_size)
            labels: labels of shape (N, 1) and ranges from -3 to 3
        '''
        self.text = text
        self.audio = audio
        self.vision = vision
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        '''
        Returns an individual data composed of (features, label)
        where features is a dictionary {'text': , 'audio':, 'vision':}

        Returns:
            features['text']: text modality feature of shape (seq. length, text_input_size)
            features['audio']: audio modality feature of shape (audio_input_size)
            features['vision']: vision modality feature of shape (vision_input_size)

            label: a scalar label that ranges from -3 to 3
        '''
        features = dict()
        features['text'] = self.text[idx]
        # audio and vision features are averaged across time
        features['audio'] = np.mean(self.audio[idx], axis=0)
        features['vision'] = np.mean(self.vision[idx], axis=0)
        label = self.labels[idx]

        return features, label

In [15]:
def fixed_rank_train_CMU_mosi(model_type, batch_size=32, epochs=100, max_rank=20, lr=.001):

    # load dataset file
    file = open('mosi_20_seq_data.pkl', 'rb')
    data = pickle.load(file)
    file.close()

    # prepare the datasets and data loaders
    train_set = MultimodalDataset(data['train']['text'], data['train']['audio'],
                                  data['train']['vision'], data['train']['labels'])
    valid_set = MultimodalDataset(data['valid']['text'], data['valid']['audio'],
                                  data['valid']['vision'], data['valid']['labels'])

    train_dataloader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
    valid_dataloader = DataLoader(valid_set, batch_size=len(valid_set))

    # set up model
    input_sizes = (train_set[0][0]['audio'].shape[0], train_set[0][0]['vision'].shape[0],
                   train_set[0][0]['text'].shape[1])
    hidden_sizes = (32, 32, 128)
    output_size = 1
    
    if model_type == 'custom':
        model = CP_Fixed_Rank_Tensor_Fusion_Network(input_sizes, hidden_sizes, output_size, max_rank)
    elif model_type == 'pytorch':
        model = LMF(input_sizes, hidden_sizes, output_size, max_rank)
    elif model_type == 'full':
        model = TFN(input_sizes, hidden_sizes, 64, (0.3, 0.3, 0.3, 0.0), 32)
    
    # set up training
    DTYPE = torch.FloatTensor
    optimizer = optim.Adam(list(model.parameters()), lr=lr)
    criterion = nn.MSELoss(size_average=False)
    
    train_losses = []
    train_times = []
    valid_errors = []
    
    # train and validate
    for e in range(1, epochs + 1):
        # train
        model.train()
        train_loss = 0.0
        
        start = time.time()
        for batch in train_dataloader:
            model.zero_grad()

            features, label = batch
            x_a = Variable(features['audio'].float().type(DTYPE), requires_grad=False)
            x_v = Variable(features['vision'].float().type(DTYPE), requires_grad=False)
            x_t = Variable(features['text'].float().type(DTYPE), requires_grad=False)
            y = Variable(label.view(-1, 1).float().type(DTYPE), requires_grad=False)
            
            if model_type == 'full':
                output = model(x_a, x_v, x_t)
            else:
                output = model([x_a, x_v, x_t])

            loss = criterion(output, y)

            loss.backward()
            train_loss += loss.item() / len(train_set)

            optimizer.step()
        
        train_time = time.time() - start
        
        print("Epoch {}".format(e))
        print("Training Loss {:.2f}".format(train_loss))
        print("Training Time {:.2f}".format(train_time))
        
        train_losses.append(train_loss)
        train_times.append(train_time)
        
        # validate
        model.eval()
        for batch in valid_dataloader:
            features, label = batch
            x_a = Variable(features['audio'].float().type(DTYPE), requires_grad=False)
            x_v = Variable(features['vision'].float().type(DTYPE), requires_grad=False)
            x_t = Variable(features['text'].float().type(DTYPE), requires_grad=False)
            y = Variable(label.view(-1, 1).float().type(DTYPE), requires_grad=False)

            output = model([x_a, x_v, x_t])

        output_valid = output.detach().numpy().reshape(-1)
        y = y.numpy().reshape(-1)

        # validation mean squared error
        valid_mse = mean_squared_error(output_valid, y)
        print("Validation MSE {:.2f}".format(valid_mse))
        
        valid_errors.append(valid_mse)
        
    np.savetxt('train_losses.csv', train_losses, delimiter=',')
    np.savetxt('train_times.csv', train_times, delimiter=',')
    np.savetxt('valid_errors.csv', valid_errors, delimiter=',')

In [16]:
fixed_rank_train_CMU_mosi(model_type='full')



Epoch 1
Training Loss 2.03
Training Time 1.31


TypeError: forward() missing 2 required positional arguments: 'video_x' and 'text_x'

In [10]:
fixed_rank_train_CMU_mosi(model_type='pytorch')



Epoch 1
Training Loss 79.94
Training Time 0.39
Validation MSE 4.20
Epoch 2
Training Loss 3.98
Training Time 0.38
Validation MSE 2.88
Epoch 3
Training Loss 2.88
Training Time 0.37
Validation MSE 2.64
Epoch 4
Training Loss 2.30
Training Time 0.37
Validation MSE 2.48
Epoch 5
Training Loss 2.12
Training Time 0.37
Validation MSE 2.40
Epoch 6
Training Loss 2.03
Training Time 0.41
Validation MSE 2.35
Epoch 7
Training Loss 1.91
Training Time 0.40
Validation MSE 2.36
Epoch 8
Training Loss 1.87
Training Time 0.38
Validation MSE 2.28
Epoch 9
Training Loss 1.81
Training Time 0.38
Validation MSE 2.25
Epoch 10
Training Loss 1.74
Training Time 0.38
Validation MSE 2.23
Epoch 11
Training Loss 1.77
Training Time 0.38
Validation MSE 2.20
Epoch 12
Training Loss 1.66
Training Time 0.37
Validation MSE 2.18
Epoch 13
Training Loss 1.63
Training Time 0.37
Validation MSE 2.17
Epoch 14
Training Loss 1.58
Training Time 0.38
Validation MSE 2.13
Epoch 15
Training Loss 1.58
Training Time 0.39
Validation MSE 2.10
Epo

In [8]:
fixed_rank_train_CMU_mosi(model_type='custom')



Epoch 1
Training Loss 36.29
Training Time 0.41
Validation MSE 3.31
Epoch 2
Training Loss 3.10
Training Time 0.40
Validation MSE 2.64
Epoch 3
Training Loss 2.54
Training Time 0.39
Validation MSE 2.47
Epoch 4
Training Loss 2.27
Training Time 0.38
Validation MSE 2.42
Epoch 5
Training Loss 2.04
Training Time 0.38
Validation MSE 2.28
Epoch 6
Training Loss 1.92
Training Time 0.38
Validation MSE 2.23
Epoch 7
Training Loss 1.84
Training Time 0.44
Validation MSE 2.30
Epoch 8
Training Loss 1.80
Training Time 0.41
Validation MSE 2.13
Epoch 9
Training Loss 1.69
Training Time 0.41
Validation MSE 2.09
Epoch 10
Training Loss 1.62
Training Time 0.45
Validation MSE 2.05
Epoch 11
Training Loss 1.53
Training Time 0.38
Validation MSE 2.06
Epoch 12
Training Loss 1.50
Training Time 0.38
Validation MSE 1.94
Epoch 13
Training Loss 1.41
Training Time 0.38
Validation MSE 1.89
Epoch 14
Training Loss 1.35
Training Time 0.38
Validation MSE 1.90
Epoch 15
Training Loss 1.29
Training Time 0.38
Validation MSE 1.86
Epo

In [9]:
rank = 10
input_sizes = (10, 20, 30)
output_size = 2
batch_size = 64
X_1 = torch.rand((batch_size, input_sizes[0]))
X_2 = torch.rand((batch_size, input_sizes[1]))
X_3 = torch.rand((batch_size, input_sizes[2]))

In [10]:
layer = CP_Linear(input_sizes, output_size, rank)

In [11]:
y_hat = layer([X_1, X_2, X_3])

In [12]:
y = torch.ones((batch_size, output_size))
loss = torch.mean((y - y_hat)**2)
loss.backward()

In [14]:
W_1 = torch.tensor(layer.W_1.clone().detach(), requires_grad=True)
W_2 = torch.tensor(layer.W_2.clone().detach(), requires_grad=True)
W_3 = torch.tensor(layer.W_3.clone().detach(), requires_grad=True)
W_y = torch.tensor(layer.W_y.clone().detach(), requires_grad=True)

A_1 = X_1 @ W_1
A_2 = X_2 @ W_2
A_3 = X_3 @ W_3
A_1.retain_grad()
A_2.retain_grad()
A_3.retain_grad()

A_f = A_1 * A_2 * A_3
A_f.retain_grad()

y_hat = A_f @ W_y.T
y_hat.retain_grad()

  W_1 = torch.tensor(layer.W_1.clone().detach(), requires_grad=True)
  W_2 = torch.tensor(layer.W_2.clone().detach(), requires_grad=True)
  W_3 = torch.tensor(layer.W_3.clone().detach(), requires_grad=True)
  W_y = torch.tensor(layer.W_y.clone().detach(), requires_grad=True)


In [15]:
loss = torch.mean((y - y_hat)**2)
loss.backward()

In [16]:
layer.W_1.grad - W_1.grad

tensor([[ 0.0005,  0.0000,  0.0000,  0.0000,  0.0002,  0.0000,  0.0000, -0.0005,
          0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0010,  0.0000,  0.0000,  0.0000,  0.0000,  0.0005,
         -0.0005,  0.0000],
        [ 0.0000, -0.0010, -0.0010, -0.0001, -0.0002,  0.0005,  0.0000,  0.0000,
          0.0005,  0.0002],
        [ 0.0000,  0.0000, -0.0010,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0005],
        [ 0.0000,  0.0000,  0.0000, -0.0001, -0.0002,  0.0000,  0.0000,  0.0000,
         -0.0005,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000, -0.0002, -0.0005,  0.0000,  0.0000,
          0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0020,  0.0000, -0.0002,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000, -0.0005,  0.0000,  0.0000,
          0.0000,  0.0000],
        [ 0.0000,  0.0010,  0.0000,  0.0000,  0.0002,  0.0000, -0.0010,  0.0005,
         -0.0005,  0.0005],
        [ 0.0000,  

In [17]:
layer.W_2.grad - W_2.grad

tensor([[ 0.0000e+00,  0.0000e+00,  4.8828e-04,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  2.4414e-04,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00, -4.8828e-04, -6.1035e-05,  1.2207e-04,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  2.4414e-04],
        [ 0.0000e+00,  0.0000e+00, -4.8828e-04,  0.0000e+00,  1.2207e-04,
          2.4414e-04,  0.0000e+00, -2.4414e-04,  2.4414e-04,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  1.2207e-04,  0.0000e+00,
          0.0000e+00,  2.4414e-04,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00, -2.4414e-04,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  4.8828e-04,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 2.4414e-04,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+0

In [18]:
layer.W_3.grad - W_3.grad

tensor([[ 2.4414e-04,  0.0000e+00,  0.0000e+00, -3.0518e-05,  6.1035e-05,
         -1.2207e-04,  1.2207e-04,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00, -4.8828e-04,  0.0000e+00, -6.1035e-05,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  2.4414e-04,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  3.0518e-05,  0.0000e+00,
          0.0000e+00,  2.4414e-04,  0.0000e+00,  2.4414e-04, -1.2207e-04],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00, -1.2207e-04,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00, -4.8828e-04,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  1.2207e-04,  0.0000e+00,  2.4414e-04,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  6.1035e-05,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00, -6.1035e-05, -6.1035e-05,
          0.0000e+00, -1.2207e-0

In [19]:
layer.W_y.grad - W_y.grad

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])