In [62]:
import torch
import torch.nn as nn
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable

import numpy as np

import pickle

from tensor_layers import low_rank_tensors, TensorizedLinear

from model import SubNet, TextSubNet

import time

# Layers and Models

## CP Linear Layer with Accelerated Forward and Backward Propagation


$y = \left(\bigodot_{m=1}^{M} x_m W_m\right) W_{M+1}^{\top}$, where $\bigodot$ denotes the element-wise (Hadamard) product over the rank dimension\
\
For $m=1,2,...,M$ where $M$ is the total number of input modalities,\
$x_m$ is an input from modality $m$ with shape (batch_size, input_size[$m$]),\
$W_m$ is a weight tensor factor for modality $m$ with shape (input_size[$m$], max_rank), and\
$W_{M+1}$ is a weight tensor factor for the output ($(M+1)^{\text{th}}$) mode with shape (output_size, max_rank)\

### With Adaptive Rank Weight

In [50]:
class Adaptive_Rank_CP_Linear(nn.Module):
    '''
    Multi-input linear layer whose weight is stored as a CP-format low-rank tensor
    (low_rank_tensors.CP) with rank-adaptive parameters.

    The forward pass never builds the full weight tensor: every input is projected
    onto the rank dimension by its own factor, the projections are multiplied
    element-wise and the result is mapped to the output by the last factor.
    '''

    def __init__(self, input_sizes, output_size, max_rank=20, em_stepsize=1.0, 
                 prior_type='log_uniform', eta=None):
        '''
        Args:
            input_sizes - a sequence with the feature size of every input
            output_size - an integer specifying the size of the output
            max_rank - an integer specifying the maximum rank of the weight tensor
            em_stepsize, prior_type, eta - forwarded unchanged to low_rank_tensors.CP
                (presumably they configure the rank prior; see that class for details)
        '''

        super(Adaptive_Rank_CP_Linear, self).__init__()

        # normalise to a tuple so that a list of sizes can be concatenated below
        self.input_sizes = tuple(input_sizes)
        self.output_size = output_size

        shape = self.input_sizes + (output_size,)
        target_stddev = np.sqrt(2 / np.prod(self.input_sizes))
        self.weight_tensor = low_rank_tensors.CP(shape, prior_type=prior_type, em_stepsize=em_stepsize,
                                                 max_rank=max_rank, initialization_method='nn',
                                                 target_stddev=target_stddev, learned_scale=False,
                                                 eta=eta)

    def forward(self, inputs, rank_update=True):
        '''
        Args:
            inputs - a sequence of tensors; inputs[m] is multiplied with factor m and
                therefore needs shape (batch_size, input_sizes[m])
            rank_update - if True, the rank parameters of the weight tensor are
                updated before the computation (only while the module is training)
        Returns:
            tensor of shape (batch_size, output_size)
        Raises:
            ValueError: if inputs is empty
        '''

        if self.training and rank_update:
            self.weight_tensor.update_rank_parameters()

        factors = self.weight_tensor.factors

        # y = ((x_1 @ W_1)(x_2 @ W_2)...(x_M @ W_M)) @ W_y.T
        # The running product starts from the first projection instead of a CPU
        # torch.ones tensor, so it inherits the device and dtype of the inputs.
        y = None
        for i, x in enumerate(inputs):
            projected = x @ factors[i]
            y = projected if y is None else y * projected

        if y is None:
            raise ValueError("inputs must contain at least one tensor")

        return y @ factors[-1].T

### With Fixed Rank Weight

In [29]:
class Fixed_Rank_CP_Linear(nn.Module):
    '''
    Multi-input linear layer whose weight tensor is kept as CP factors of a fixed rank.

    There is one factor of shape (input_size, rank) per input and one factor of
    shape (output_size, rank) for the output; the full weight tensor is never built.
    '''
    
    def __init__(self, input_sizes, output_size, rank=20):
        '''
        Args:
            input_sizes - a sequence with the feature size of every input
            output_size - an integer specifying the size of the output
            rank - an integer specifying the number of columns of every factor
        '''
        
        super(Fixed_Rank_CP_Linear, self).__init__()
        
        self.input_sizes = input_sizes
        self.output_size = output_size
        self.rank = rank
        
        self.weight_tensor_factors = self.initialize_weight_tensor_factors()
        
    def forward(self, inputs):
        '''
        Args:
            inputs - a sequence of tensors; inputs[m] has shape (batch_size, input_sizes[m])
        Returns:
            tensor of shape (batch_size, output_size)
        Raises:
            ValueError: if inputs is empty
        '''
        
        # y = ((x_1 @ W_1)(x_2 @ W_2)...(x_M @ W_M)) @ W_y.T
        # The running product starts from the first projection instead of a CPU
        # torch.ones tensor, so it inherits the device and dtype of the inputs.
        y = None
        for i, x in enumerate(inputs):
            projected = x @ self.weight_tensor_factors[i]
            y = projected if y is None else y * projected

        if y is None:
            raise ValueError("inputs must contain at least one tensor")

        return y @ self.weight_tensor_factors[-1].T
        
    def initialize_weight_tensor_factors(self):
        '''
        Creates the Xavier-normal initialised factors: one per input followed by
        the output factor.
        Returns:
            nn.ParameterList of length len(input_sizes) + 1
        '''
        
        # the output factor is stored last, which is what forward() relies on
        factor_rows = list(self.input_sizes) + [self.output_size]

        factors = []
        for rows in factor_rows:
            factor = nn.Parameter(torch.empty(rows, self.rank))
            nn.init.xavier_normal_(factor)
            factors.append(factor)
            
        return nn.ParameterList(factors)

### Tensor Fusion Network with Accelerated CP Linear Layer

In [56]:
class CP_Tensor_Fusion_Network(nn.Module):
    '''
    Implements the Tensor Fusion Networks for multimodal sentiment analysis as is described in:
    Zadeh, Amir, et al. "Tensor fusion network for multimodal sentiment analysis." EMNLP 2017 Oral.
    with rank-adaptive tensorized training from Hawkins, Cole and Zheng Zhang "Bayesian tensorized
    neural networks with automatic rank selection." Neurocomputing 2021.

    The fusion is done by a CP linear layer that consumes the three subnet outputs
    directly, so the full fusion tensor is never materialised.
    '''

    def __init__(self, input_sizes, hidden_sizes, output_size, max_rank, 
                 rank_adaptive=True):
        '''
        Args:
            input_sizes - a length-3 tuple that contains (x_1_size, x_2_size, x_3_size)
            hidden_sizes - a length-3 tuple that contains (hidden_size_1, hidden_size_2, hidden_size_3)
            output_size - an integer specifying the size of the output
            max_rank - an integer specifying the maximum rank of weight tensor
            rank_adaptive - if True the fusion layer is Adaptive_Rank_CP_Linear,
                otherwise Fixed_Rank_CP_Linear with rank=max_rank
        '''

        super(CP_Tensor_Fusion_Network, self).__init__()

        self.input_sizes = input_sizes
        self.hidden_sizes = hidden_sizes
        self.output_size = output_size
        self.max_rank = max_rank
        self.rank_adaptive = rank_adaptive
        
        self.audio_subnet = SubNet(input_sizes[0], hidden_sizes[0], dropout=0.3)
        self.video_subnet = SubNet(input_sizes[1], hidden_sizes[1], dropout=0.3)
        self.text_subnet = TextSubNet(input_sizes[2], hidden_sizes[2], hidden_sizes[2], dropout=0.3)
        
        # every subnet output gets a constant 1 prepended in forward(), hence the + 1
        tensor_input_sizes = tuple(hidden_size + 1 for hidden_size in hidden_sizes[:3])
        if rank_adaptive:
            self.tensor_fusion_layer = Adaptive_Rank_CP_Linear(tensor_input_sizes, output_size,
                                                               max_rank=max_rank, em_stepsize=1.0,
                                                               prior_type='log_uniform', eta=None)
        else:
            self.tensor_fusion_layer = Fixed_Rank_CP_Linear(tensor_input_sizes, output_size,
                                                            rank=max_rank)

    @staticmethod
    def _prepend_ones(z):
        '''Returns z of shape (batch_size, d) with a column of ones prepended -> (batch_size, d + 1).'''
        # new_ones creates the column on the device and with the dtype of z, which
        # replaces the deprecated Variable / torch.cuda.FloatTensor handling
        return torch.cat((z.new_ones((z.shape[0], 1)), z), dim=1)

    def forward(self, inputs):
        '''
        Args:
            inputs - a length-3 sequence (audio_x, video_x, text_x); each element is
                passed to the subnet of its modality
        Returns:
            the output of the tensor fusion layer
        '''
        
        # subnet outputs
        z_audio = self.audio_subnet(inputs[0])
        z_video = self.video_subnet(inputs[1])
        z_text = self.text_subnet(inputs[2])

        # a 1 is concatenated to each subnet output
        fusion_inputs = [self._prepend_ones(z) for z in (z_audio, z_video, z_text)]

        return self.tensor_fusion_layer(fusion_inputs)

## CP Linear Layer with Naive Forward and Backward Propagation

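$y = \mathcal{X} \mathcal{W}$, where $\mathcal{X}$ is the flattened fusion tensor (the outer product of the three subnet outputs, each with a 1 prepended) and $\mathcal{W}$ is the weight of the post-fusion layer.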

In [53]:
class TFN(nn.Module):
    '''
    Tensor Fusion Network with the naive fusion: the full fusion tensor (outer
    product of the three subnet outputs) is built explicitly, flattened and passed
    through a single post-fusion layer.
    '''

    def __init__(self, input_sizes, hidden_sizes, output_size, rank, rank_adaptive):
        '''
        Args:
            input_sizes - a length-3 tuple (audio_in, video_in, text_in)
            hidden_sizes - a length-3 tuple (audio_hidden, video_hidden, text_out)
            output_size - an integer specifying the size of the output
            rank - passed as max_rank to TensorizedLinear (unused when rank_adaptive is False)
            rank_adaptive - if True the post-fusion layer is a CP TensorizedLinear,
                otherwise a dense nn.Linear
        '''
        
        super(TFN, self).__init__()

        # dimensions are specified in the order of audio, video and text
        self.input_sizes = input_sizes
        self.hidden_sizes = hidden_sizes
        self.output_size = output_size
        self.rank = rank

        # define the pre-fusion subnetworks
        self.audio_subnet = SubNet(input_sizes[0], hidden_sizes[0], dropout=0.3)
        self.video_subnet = SubNet(input_sizes[1], hidden_sizes[1], dropout=0.3)
        self.text_subnet = TextSubNet(input_sizes[2], hidden_sizes[2], hidden_sizes[2], dropout=0.3)
        
        # every subnet output gets a constant 1 prepended in forward(), hence the + 1
        tensor_input_sizes = (hidden_sizes[0] + 1, hidden_sizes[1] + 1, hidden_sizes[2] + 1)
        # np.prod returns a NumPy integer; the layers are given a plain Python int
        fused_features = int(np.prod(tensor_input_sizes))
        if rank_adaptive:
            shape = tensor_input_sizes + (output_size,)
            self.post_fusion_layer = TensorizedLinear(in_features=fused_features,
                                                      out_features=output_size, 
                                                      shape=shape, 
                                                      tensor_type='CP', max_rank=rank)
        else:
            self.post_fusion_layer = nn.Linear(fused_features, output_size)

    @staticmethod
    def _prepend_ones(h):
        '''Returns h of shape (batch_size, d) with a column of ones prepended -> (batch_size, d + 1).'''
        # new_ones creates the column on the device and with the dtype of h, which
        # replaces the deprecated Variable / torch.cuda.FloatTensor handling
        return torch.cat((h.new_ones((h.shape[0], 1)), h), dim=1)
        
    def forward(self, inputs):
        '''
        Args:
            inputs - a length-3 sequence (audio_x, video_x, text_x); each element is
                passed to the subnet of its modality
        Returns:
            the output of the post-fusion layer applied to the flattened fusion tensor
        '''
        audio_h = self.audio_subnet(inputs[0])
        video_h = self.video_subnet(inputs[1])
        text_h = self.text_subnet(inputs[2])
        batch_size = audio_h.shape[0]

        # next we perform "tensor fusion", which is essentially appending 1s to the tensors and take Kronecker product
        _audio_h = self._prepend_ones(audio_h)
        _video_h = self._prepend_ones(video_h)
        _text_h = self._prepend_ones(text_h)

        # (batch_size, audio_hidden + 1, video_hidden + 1): batched outer product of audio and video
        fusion_tensor = torch.bmm(_audio_h.unsqueeze(2), _video_h.unsqueeze(1))
        # flatten the audio-video product into a column; the sizes come from the
        # tensors themselves so the reshape cannot disagree with the subnet outputs
        fusion_tensor = fusion_tensor.reshape(batch_size, _audio_h.shape[1] * _video_h.shape[1], 1)
        # fusion_tensor will have shape (batch_size, (audio_hidden + 1) * (video_hidden + 1) * (text_out + 1))
        fusion_tensor = torch.bmm(fusion_tensor, _text_h.unsqueeze(1)).reshape(batch_size, -1)
                                  
        return self.post_fusion_layer(fusion_tensor)

# Utils

## CMU-MOSI Dataset

In [32]:
class MultimodalDataset(Dataset):
    '''
    Map-style dataset over the CMU-MOSI text, audio and vision features.
    '''
    def __init__(self, text, audio, vision, labels):
        '''
        args:
            text: text modality feature of shape (N, seq. length, text_input_size)
            audio: audio modality feature of shape (N, seq. length, audio_input_size)
            vision: vision modality feature of shape (N, seq. length, vision_input_size)
            labels: labels of shape (N, 1) and ranges from -3 to 3
        '''
        self.text, self.audio, self.vision, self.labels = text, audio, vision, labels

    def __len__(self):
        # one label per sample, so the label container defines the dataset size
        return len(self.labels)

    def __getitem__(self, idx):
        '''
        Fetches sample idx as a (features, label) pair.
        Returns:
            features: a dictionary with the keys 'text', 'audio' and 'vision';
                'text' is the stored entry as is, while 'audio' and 'vision' are
                the stored entries averaged over their first axis (time)
            label: the stored label of the sample
        '''
        sample = {
            'text': self.text[idx],
            # audio and vision keep a single vector per sample: the mean over time
            'audio': np.mean(self.audio[idx], axis=0),
            'vision': np.mean(self.vision[idx], axis=0),
        }

        return sample, self.labels[idx]

## KL Loss for Rank Reduction

In [58]:
def get_kl_loss(model, kl_multiplier, no_kl_epochs, warmup_epochs, epoch):
    '''
    Sums the KL divergence to the prior over all tensorized layers of the model and
    scales it with an epoch-dependent weight.

    The weight is 0 up to epoch no_kl_epochs, grows linearly for warmup_epochs
    epochs and stays at kl_multiplier afterwards.

    Args:
        model - nn.Module; every submodule that has a `tensor` or a `weight_tensor`
            attribute contributes its get_kl_divergence_to_prior()
        kl_multiplier - final weight of the KL term
        no_kl_epochs - number of initial epochs during which the weight is 0
        warmup_epochs - length of the linear ramp in epochs; a value <= 0 disables
            the ramp, so the full weight applies as soon as epoch > no_kl_epochs
        epoch - current epoch
    Returns:
        the weighted KL loss as a tensor
    '''
    kl_loss = 0.0
    for layer in model.modules():
        if hasattr(layer, "tensor"):
            kl_loss += layer.tensor.get_kl_divergence_to_prior()
        elif hasattr(layer, "weight_tensor"):
            kl_loss += layer.weight_tensor.get_kl_divergence_to_prior()

    if warmup_epochs > 0:
        ramp = (epoch - no_kl_epochs) / warmup_epochs
    else:
        # no warm-up requested: avoid dividing by zero and switch the term on at once
        ramp = 1.0 if epoch > no_kl_epochs else 0.0

    kl_mult = kl_multiplier * torch.clamp(torch.tensor(ramp), 0.0, 1.0)
    
    return kl_loss*kl_mult

## Train

In [71]:
def _batch_to_model_inputs(batch):
    '''Splits a collated (features, label) batch into float32 [audio, vision, text] inputs and (B, 1) targets.'''
    features, label = batch
    inputs = [features[key].float() for key in ('audio', 'vision', 'text')]
    return inputs, label.view(-1, 1).float()


def train_CMU_mosi(batch_size=32, epochs=100, lr=.001, max_rank=20, rank_adaptive=True,
                   warmup_epochs=50, kl_multiplier=1e-4, no_kl_epochs=5, accelerated=True,
                   data_path='../../dataset/CMU-MOSI/mosi_20_seq_data.pkl'):
    '''
    Trains a tensor fusion model on CMU-MOSI (on CPU) and validates it after every epoch.

    Per epoch the training loss, the training time and the validation MSE are
    printed; after the last epoch the three histories are written to
    'Ada-{rank_adaptive}-Acc-{accelerated}-{train-losses|train-times|valid-errors}.csv'
    in the working directory.

    Args:
        batch_size - training batch size (validation uses the whole set as one batch)
        epochs - number of training epochs
        lr - learning rate of the Adam optimizer
        max_rank - (maximum) rank of the CP weight tensor
        rank_adaptive - if True the rank-adaptive layer is used and the KL loss
            from get_kl_loss is added to the training loss
        warmup_epochs, kl_multiplier, no_kl_epochs - schedule of the KL loss,
            forwarded to get_kl_loss
        accelerated - if True CP_Tensor_Fusion_Network is trained, otherwise TFN
        data_path - path of the pickled dataset; it must provide the 'train' and
            'valid' splits, each with 'text', 'audio', 'vision' and 'labels'
    '''

    # load dataset file
    # NOTE: pickle.load can execute arbitrary code - only load dataset files you trust
    with open(data_path, 'rb') as file:
        data = pickle.load(file)

    # prepare the datasets and data loaders
    train_set, valid_set = (
        MultimodalDataset(data[split]['text'], data[split]['audio'],
                          data[split]['vision'], data[split]['labels'])
        for split in ('train', 'valid')
    )

    train_dataloader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
    valid_dataloader = DataLoader(valid_set, batch_size=len(valid_set))

    # set up model; the input sizes are read from the first training sample
    first_features = train_set[0][0]
    input_sizes = (first_features['audio'].shape[0], first_features['vision'].shape[0],
                   first_features['text'].shape[1])
    hidden_sizes = (32, 32, 128)
    output_size = 1
    
    if accelerated:
        model = CP_Tensor_Fusion_Network(input_sizes, hidden_sizes, output_size, max_rank,
                                         rank_adaptive)
    else:
        model = TFN(input_sizes, hidden_sizes, output_size, max_rank, rank_adaptive)

    # set up training
    optimizer = optim.Adam(list(model.parameters()), lr=lr)
    # reduction='sum' is the supported spelling of the deprecated size_average=False
    criterion = nn.MSELoss(reduction='sum')
    
    train_losses = []
    train_times = []
    valid_errors = []
    
    # train and validate
    for e in range(1, epochs + 1):
        # train
        tic = time.time()
        model.train()
        train_loss = 0.0
        for batch in train_dataloader:
            model.zero_grad()

            inputs, y = _batch_to_model_inputs(batch)
            output = model(inputs)

            loss = criterion(output, y)

            # rank loss for adaptive-rank model
            if rank_adaptive:
                loss = loss + get_kl_loss(model, kl_multiplier, no_kl_epochs, warmup_epochs, e)

            loss.backward()
            optimizer.step()
            
            # summed batch losses divided by the dataset size -> per-sample loss
            train_loss += loss.item() / len(train_set)
        toc = time.time()
        train_time = toc - tic
        print("Epoch {}".format(e))
        print("Training Loss {:.3f}".format(train_loss))
        print("Training TIme {:.3f}".format(train_time))
        train_losses.append(train_loss)
        train_times.append(train_time)
        
        # validate: no gradients are needed, and the error is accumulated inside the
        # loop so that it covers every validation batch
        model.eval()
        squared_error = 0.0
        n_targets = 0
        with torch.no_grad():
            for batch in valid_dataloader:
                inputs, y = _batch_to_model_inputs(batch)
                output = model(inputs)
                squared_error += nn.functional.mse_loss(output, y, reduction='sum').item()
                n_targets += y.numel()

        valid_mse = squared_error / n_targets
        print("Valid MSE {:.3f}".format(valid_mse))
        valid_errors.append(valid_mse)
    
    np.savetxt('Ada-{}-Acc-{}-train-losses.csv'.format(rank_adaptive, accelerated), train_losses, delimiter=',')
    np.savetxt('Ada-{}-Acc-{}-train-times.csv'.format(rank_adaptive, accelerated), train_times, delimiter=',')
    np.savetxt('Ada-{}-Acc-{}-valid-errors.csv'.format(rank_adaptive, accelerated), valid_errors, delimiter=',')

# Experiments

## Experiment with Naive, Adaptive Rank

In [67]:
# Naive fusion (accelerated=False selects TFN) with rank-adaptive training
# (rank_adaptive=True adds the KL rank loss to the training loss).
train_CMU_mosi(batch_size=32, epochs=100, lr=.001, max_rank=20, rank_adaptive=True,  
                   warmup_epochs=50, kl_multiplier=1e-4, no_kl_epochs=5, accelerated=False)

Epoch 1
Training Loss 2.203
Training TIme 0.763
Valid MSE 2.256
Epoch 2
Training Loss 1.571
Training TIme 0.747
Valid MSE 1.736
Epoch 3
Training Loss 1.369
Training TIme 0.744
Valid MSE 1.646
Epoch 4
Training Loss 1.176
Training TIme 0.748
Valid MSE 1.530
Epoch 5
Training Loss 1.018
Training TIme 0.735
Valid MSE 1.472
Epoch 6
Training Loss 0.858
Training TIme 0.735
Valid MSE 1.560
Epoch 7
Training Loss 0.788
Training TIme 0.740
Valid MSE 1.730
Epoch 8
Training Loss 0.841
Training TIme 0.738
Valid MSE 1.489
Epoch 9
Training Loss 0.659
Training TIme 0.736
Valid MSE 1.550
Epoch 10
Training Loss 0.574
Training TIme 0.743
Valid MSE 1.568
Epoch 11
Training Loss 0.478
Training TIme 0.738
Valid MSE 1.657
Epoch 12
Training Loss 0.442
Training TIme 0.737
Valid MSE 1.679
Epoch 13
Training Loss 0.386
Training TIme 0.733
Valid MSE 1.653
Epoch 14
Training Loss 0.343
Training TIme 0.753
Valid MSE 1.614
Epoch 15
Training Loss 0.297
Training TIme 0.736
Valid MSE 1.706
Epoch 16
Training Loss 0.306
Train

## Experiment with Accelerated, Adaptive Rank

In [72]:
# Accelerated fusion (accelerated=True selects CP_Tensor_Fusion_Network) with
# rank-adaptive training (rank_adaptive=True adds the KL rank loss to the training loss).
train_CMU_mosi(batch_size=32, epochs=100, lr=.001, max_rank=20, rank_adaptive=True,  
                   warmup_epochs=50, kl_multiplier=1e-4, no_kl_epochs=5, accelerated=True)

Epoch 1
Training Loss 2.200
Training TIme 0.452
Valid MSE 2.279
Epoch 2
Training Loss 1.615
Training TIme 0.408
Valid MSE 1.888
Epoch 3
Training Loss 1.383
Training TIme 0.401
Valid MSE 1.852
Epoch 4
Training Loss 1.189
Training TIme 0.403
Valid MSE 1.563
Epoch 5
Training Loss 1.012
Training TIme 0.406
Valid MSE 1.459
Epoch 6
Training Loss 0.864
Training TIme 0.405
Valid MSE 1.488
Epoch 7
Training Loss 0.800
Training TIme 0.407
Valid MSE 1.494
Epoch 8
Training Loss 0.677
Training TIme 0.406
Valid MSE 1.544
Epoch 9
Training Loss 0.561
Training TIme 0.409
Valid MSE 1.589
Epoch 10
Training Loss 0.497
Training TIme 0.399
Valid MSE 1.618
Epoch 11
Training Loss 0.437
Training TIme 0.403
Valid MSE 1.668
Epoch 12
Training Loss 0.365
Training TIme 0.408
Valid MSE 1.580
Epoch 13
Training Loss 0.285
Training TIme 0.398
Valid MSE 1.663
Epoch 14
Training Loss 0.310
Training TIme 0.412
Valid MSE 1.614
Epoch 15
Training Loss 0.283
Training TIme 0.408
Valid MSE 1.563
Epoch 16
Training Loss 0.245
Train

## Experiment with Naive, Fixed Rank

In [73]:
# Naive fusion (accelerated=False selects TFN) without rank adaptation
# (rank_adaptive=False: no KL rank loss is added, so the KL arguments are unused).
train_CMU_mosi(batch_size=32, epochs=100, lr=.001, max_rank=20, rank_adaptive=False,  
                   warmup_epochs=50, kl_multiplier=1e-4, no_kl_epochs=5, accelerated=False)

Epoch 1
Training Loss 2.049
Training TIme 0.714
Valid MSE 1.925
Epoch 2
Training Loss 1.463
Training TIme 0.699
Valid MSE 1.865
Epoch 3
Training Loss 1.209
Training TIme 0.702
Valid MSE 1.670
Epoch 4
Training Loss 1.085
Training TIme 0.729
Valid MSE 1.711
Epoch 5
Training Loss 0.924
Training TIme 0.706
Valid MSE 1.608
Epoch 6
Training Loss 0.786
Training TIme 0.704
Valid MSE 1.612
Epoch 7
Training Loss 0.719
Training TIme 0.714
Valid MSE 1.861
Epoch 8
Training Loss 0.627
Training TIme 0.708
Valid MSE 1.562
Epoch 9
Training Loss 0.539
Training TIme 0.706
Valid MSE 1.704
Epoch 10
Training Loss 0.443
Training TIme 0.705
Valid MSE 1.596
Epoch 11
Training Loss 0.389
Training TIme 0.709
Valid MSE 1.628
Epoch 12
Training Loss 0.372
Training TIme 0.695
Valid MSE 1.585
Epoch 13
Training Loss 0.323
Training TIme 0.726
Valid MSE 1.804
Epoch 14
Training Loss 0.309
Training TIme 0.755
Valid MSE 1.730
Epoch 15
Training Loss 0.293
Training TIme 0.706
Valid MSE 1.706
Epoch 16
Training Loss 0.280
Train

## Experiment with Accelerated, Fixed Rank

In [74]:
# Accelerated fusion (accelerated=True selects CP_Tensor_Fusion_Network) without rank
# adaptation (rank_adaptive=False: no KL rank loss is added, so the KL arguments are unused).
train_CMU_mosi(batch_size=32, epochs=100, lr=.001, max_rank=20, rank_adaptive=False,  
                   warmup_epochs=50, kl_multiplier=1e-4, no_kl_epochs=5, accelerated=True)

Epoch 1
Training Loss 2.105
Training TIme 0.410
Valid MSE 2.071
Epoch 2
Training Loss 1.618
Training TIme 0.436
Valid MSE 1.807
Epoch 3
Training Loss 1.284
Training TIme 0.410
Valid MSE 1.573
Epoch 4
Training Loss 1.126
Training TIme 0.395
Valid MSE 1.517
Epoch 5
Training Loss 0.955
Training TIme 0.390
Valid MSE 1.535
Epoch 6
Training Loss 0.823
Training TIme 0.399
Valid MSE 1.628
Epoch 7
Training Loss 0.695
Training TIme 0.426
Valid MSE 1.772
Epoch 8
Training Loss 0.616
Training TIme 0.409
Valid MSE 1.664
Epoch 9
Training Loss 0.543
Training TIme 0.397
Valid MSE 1.629
Epoch 10
Training Loss 0.492
Training TIme 0.407
Valid MSE 1.598
Epoch 11
Training Loss 0.407
Training TIme 0.389
Valid MSE 1.646
Epoch 12
Training Loss 0.325
Training TIme 0.423
Valid MSE 1.661
Epoch 13
Training Loss 0.299
Training TIme 0.397
Valid MSE 1.656
Epoch 14
Training Loss 0.239
Training TIme 0.410
Valid MSE 1.595
Epoch 15
Training Loss 0.202
Training TIme 0.424
Valid MSE 1.711
Epoch 16
Training Loss 0.202
Train