In [44]:
import torch
import torch.nn as nn
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable

import numpy as np

import pickle

from tensor_layers import low_rank_tensors
from tensor_layers import TensorizedLinear

from model import SubNet, TextSubNet

In [4]:
class Adaptive_Rank_CP_Linear(nn.Module):

    def __init__(self, input_sizes, output_size, max_rank=20, em_stepsize=1.0, 
                 prior_type='log_uniform', eta=None):

        super(Adaptive_Rank_CP_Linear, self).__init__()

        self.input_sizes = input_sizes
        self.output_size = output_size
        
        shape = input_sizes + (output_size,)
        target_stddev = np.sqrt(2 / np.prod(self.input_sizes))
        self.tensor = getattr(low_rank_tensors, 'CP')(shape, prior_type=prior_type, em_stepsize=em_stepsize,
                                                      max_rank=max_rank, initialization_method='nn', 
                                                      target_stddev=target_stddev, learned_scale=False, 
                                                      eta=eta)

    def forward(self, inputs, rank_update=True):
        
        if self.training and rank_update:
            self.tensor.update_rank_parameters()
        
        # y = ((x_1 @ W_1)(x_2 @ W_2)...(x_M @ W_M)) @ W_y.T
        y = torch.ones(size=(1,))
        for i, x in enumerate(inputs):
            y = y * (x @ self.tensor.factors[i])
        y = y @ self.tensor.factors[-1].T

        return y

In [17]:
class Fixed_Rank_CP_Linear(nn.Module):
    
    def __init__(self, input_sizes, output_size, rank=20):
        
        super(Fixed_Rank_CP_Linear, self).__init__()
        
        self.input_sizes = input_sizes
        self.output_size = output_size
        self.rank = rank
        
        self.tensor_factors = self.initialize_tensor_factors()
        
    def forward(self, inputs):
        
        # y = ((x_1 @ W_1)(x_2 @ W_2)...(x_M @ W_M)) @ W_y.T
        y = torch.ones(size=(1,))
        for i, x in enumerate(inputs):
            y = y * (x @ self.tensor_factors[i])
        y = y @ self.tensor_factors[-1].T

        return y
        
    def initialize_tensor_factors(self):
        
        factors = []
        for m, input_size in enumerate(self.input_sizes):
            factors.append(nn.Parameter(torch.empty(input_size, self.rank)))
            nn.init.xavier_normal_(factors[m])
        factors.append(nn.Parameter(torch.empty(self.output_size, self.rank)))
        nn.init.xavier_normal_(factors[-1])
            
        return nn.ParameterList(factors)

In [46]:
class Old_CP_Tensor_Fusion_Network(nn.Module):
    '''
    Implements the Tensor Fusion Networks for multimodal sentiment analysis as is described in:
    Zadeh, Amir, et al. "Tensor fusion network for multimodal sentiment analysis." EMNLP 2017 Oral.
    with rank-adaptive tensorized training from Hawkins, Cole and Zheng Zhang "Bayesian tensorized
    neural networks with automatic rank selection." Neurocomputing 2021.
    '''

    def __init__(self, input_sizes, hidden_sizes, output_size, max_rank, em_stepsize=1.0, 
                 prior_type='log_uniform', eta=None):
        '''
        Args:
            input_sizes - a length-3 tuple that contains (x_1_size, x_2_size, x_3_size)
            hidden_sizes - a length-3 tuple that contains (hidden_size_1, hidden_size_2, hidden_size_3)
            output_size - an integer specifying the size of the output
            max_rank - an integer specifying the maximum rank of weight tensor
        '''

        super(Old_CP_Tensor_Fusion_Network, self).__init__()

        self.input_sizes = input_sizes
        self.hidden_sizes = hidden_sizes
        self.output_size = output_size
        self.max_rank = max_rank
        self.rank_adaptive = rank_adaptive
        
        self.audio_subnet = SubNet(input_sizes[0], hidden_sizes[0], dropout=0.3)
        self.video_subnet = SubNet(input_sizes[1], hidden_sizes[1], dropout=0.3)
        self.text_subnet = TextSubNet(input_sizes[2], hidden_sizes[2], hidden_sizes[2], dropout=0.3)
        
        input_sizes = (hidden_sizes[0] + 1, hidden_sizes[1] + 1, hidden_sizes[2] + 1)
        in_features = np.prod(input_sizes)
        shape = input_sizes + (output_size,)
        self.tensor_fusion_layer = TensorizedLinear(in_features, output_size, shape=shape,
                                                    tensor_type='CP', max_rank=max_rank, em_stepsize=1.0, 
                                                    prior_type='log_uniform', eta=None)

    def forward(self, inputs):
        
        # subnet outputs
        z_audio = self.audio_subnet(inputs[0])
        z_video = self.video_subnet(inputs[1])
        z_text = self.text_subnet(inputs[2])

        batch_size = z_audio.data.shape[0]

        if z_audio.is_cuda:
            DTYPE = torch.cuda.FloatTensor
        else:
            DTYPE = torch.FloatTensor

        # 1 in concatenated to each subnet outputs
        z_audio = torch.cat((Variable(torch.ones(batch_size, 1).type(DTYPE), requires_grad=False), z_audio), dim=1)
        z_video = torch.cat((Variable(torch.ones(batch_size, 1).type(DTYPE), requires_grad=False), z_video), dim=1)
        z_text = torch.cat((Variable(torch.ones(batch_size, 1).type(DTYPE), requires_grad=False), z_text), dim=1)

        output = self.tensor_fusion_layer([z_audio, z_video, z_text])

        return output

In [47]:
class CP_Tensor_Fusion_Network(nn.Module):
    '''
    Implements the Tensor Fusion Networks for multimodal sentiment analysis as is described in:
    Zadeh, Amir, et al. "Tensor fusion network for multimodal sentiment analysis." EMNLP 2017 Oral.
    with rank-adaptive tensorized training from Hawkins, Cole and Zheng Zhang "Bayesian tensorized
    neural networks with automatic rank selection." Neurocomputing 2021.
    '''

    def __init__(self, input_sizes, hidden_sizes, output_size, max_rank,
                 rank_adaptive=True, em_stepsize=1.0, prior_type='log_uniform',
                 eta=None):
        '''
        Args:
            input_sizes - a length-3 tuple that contains (x_1_size, x_2_size, x_3_size)
            hidden_sizes - a length-3 tuple that contains (hidden_size_1, hidden_size_2, hidden_size_3)
            output_size - an integer specifying the size of the output
            max_rank - an integer specifying the maximum rank of weight tensor
        '''

        super(CP_Tensor_Fusion_Network, self).__init__()

        self.input_sizes = input_sizes
        self.hidden_sizes = hidden_sizes
        self.output_size = output_size
        self.max_rank = max_rank
        self.rank_adaptive = rank_adaptive
        
        self.audio_subnet = SubNet(input_sizes[0], hidden_sizes[0], dropout=0.3)
        self.video_subnet = SubNet(input_sizes[1], hidden_sizes[1], dropout=0.3)
        self.text_subnet = TextSubNet(input_sizes[2], hidden_sizes[2], hidden_sizes[2], dropout=0.3)
        
        input_sizes = (hidden_sizes[0] + 1, hidden_sizes[1] + 1, hidden_sizes[2] + 1)
        
        if rank_adaptive:
            self.tensor_fusion_layer = Adaptive_Rank_CP_Linear(input_sizes, output_size, max_rank, 
                                                               em_stepsize, prior_type, eta)
        else:
            self.tensor_fusion_layer = Fixed_Rank_CP_Linear(input_sizes, output_size, max_rank)

    def forward(self, inputs):
        
        # subnet outputs
        z_audio = self.audio_subnet(inputs[0])
        z_video = self.video_subnet(inputs[1])
        z_text = self.text_subnet(inputs[2])

        batch_size = z_audio.data.shape[0]

        if z_audio.is_cuda:
            DTYPE = torch.cuda.FloatTensor
        else:
            DTYPE = torch.FloatTensor

        # 1 in concatenated to each subnet outputs
        z_audio = torch.cat((Variable(torch.ones(batch_size, 1).type(DTYPE), requires_grad=False), z_audio), dim=1)
        z_video = torch.cat((Variable(torch.ones(batch_size, 1).type(DTYPE), requires_grad=False), z_video), dim=1)
        z_text = torch.cat((Variable(torch.ones(batch_size, 1).type(DTYPE), requires_grad=False), z_text), dim=1)

        output = self.tensor_fusion_layer([z_audio, z_video, z_text])

        return output

In [48]:
class MultimodalDataset(Dataset):
    '''
    Dataset for CMU-MOSI
    '''
    def __init__(self, text, audio, vision, labels):
        '''
        args:
            text: text modality feature of shape (N, seq. length, text_input_size)
            audio: audio modality feature of shape (N, seq. length, audio_input_size)
            vision: vision modality feature of shape (N, seq. length, vision_input_size)
            labels: labels of shape (N, 1) and ranges from -3 to 3
        '''
        self.text = text
        self.audio = audio
        self.vision = vision
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        '''
        Returns an individual data composed of (features, label)
        where features is a dictionary {'text': , 'audio':, 'vision':}
        Returns:
            features['text']: text modality feature of shape (seq. length, text_input_size)
            features['audio']: audio modality feature of shape (audio_input_size)
            features['vision']: vision modality feature of shape (vision_input_size)
            label: a scalar label that ranges from -3 to 3
        '''
        features = dict()
        features['text'] = self.text[idx]
        # audio and vision features are averaged across time
        features['audio'] = np.mean(self.audio[idx], axis=0)
        features['vision'] = np.mean(self.vision[idx], axis=0)
        label = self.labels[idx]

        return features, label

In [49]:
def get_kl_loss(model, kl_multiplier, no_kl_epochs, warmup_epochs, epoch):
    '''
    kl loss for rank reduction
    '''
    kl_loss = 0.0
    for layer in model.modules():
        if hasattr(layer, "tensor"):
            kl_loss += layer.tensor.get_kl_divergence_to_prior()
            
    kl_mult = kl_multiplier * torch.clamp(torch.tensor(((epoch - no_kl_epochs) / 
                                                        warmup_epochs)), 0.0, 1.0)
    print("KL loss ",kl_loss.item())
    
    return kl_loss*kl_mult

In [51]:
def train_CMU_mosi(batch_size=32, epochs=100, lr=.001, max_rank=20, rank_adaptive=True,  
                   warmup_epochs=50, kl_multiplier=1e-4, no_kl_epochs=5):

    # load dataset file
    file = open('../../dataset/cmu-mosi/mosi_20_seq_data.pkl', 'rb')
    data = pickle.load(file)
    file.close()

    # prepare the datasets and data loaders
    train_set = MultimodalDataset(data['train']['text'], data['train']['audio'],
                                  data['train']['vision'], data['train']['labels'])
    valid_set = MultimodalDataset(data['valid']['text'], data['valid']['audio'],
                                  data['valid']['vision'], data['valid']['labels'])

    train_dataloader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
    valid_dataloader = DataLoader(valid_set, batch_size=len(valid_set))

    # set up model
    input_sizes = (train_set[0][0]['audio'].shape[0], train_set[0][0]['vision'].shape[0],
                   train_set[0][0]['text'].shape[1])
    hidden_sizes = (32, 32, 128)
    output_size = 1

    # model = CP_Tensor_Fusion_Network(input_sizes, hidden_sizes, output_size, max_rank,
    #                                 rank_adaptive)
    
    model = Old_CP_Tensor_Fusion_Network(input_sizes, hidden_sizes, output_size, max_rank)
    
    # set up training
    DTYPE = torch.FloatTensor
    optimizer = optim.Adam(list(model.parameters()), lr=lr)
    criterion = nn.MSELoss(size_average=False)
    
    # train and validate
    for e in range(1, epochs + 1):
        # train
        model.train()
        train_loss = 0.0
        for batch in train_dataloader:
            model.zero_grad()

            features, label = batch
            x_a = Variable(features['audio'].float().type(DTYPE), requires_grad=False)
            x_v = Variable(features['vision'].float().type(DTYPE), requires_grad=False)
            x_t = Variable(features['text'].float().type(DTYPE), requires_grad=False)
            y = Variable(label.view(-1, 1).float().type(DTYPE), requires_grad=False)

            output = model([x_a, x_v, x_t])

            loss = criterion(output, y)
            
            # rank loss for adaptive-rank model
            if rank_adaptive:
                loss += get_kl_loss(model, kl_multiplier, no_kl_epochs, warmup_epochs, e)
            
            train_loss += loss.item() / len(train_set)
            
            loss.backward()
            optimizer.step()

        print("Epoch {}".format(e))
        print("Training Loss {:.4f}".format(train_loss))
        
        # validate
        model.eval()
        for batch in valid_dataloader:
            features, label = batch
            x_a = Variable(features['audio'].float().type(DTYPE), requires_grad=False)
            x_v = Variable(features['vision'].float().type(DTYPE), requires_grad=False)
            x_t = Variable(features['text'].float().type(DTYPE), requires_grad=False)
            y = Variable(label.view(-1, 1).float().type(DTYPE), requires_grad=False)

            output = model([x_a, x_v, x_t])
        
        valid_error = nn.functional.mse_loss(y, output)
        print("Validation MSE {:4f}".format(valid_error.item()))

In [52]:
train_CMU_mosi(batch_size=32, epochs=100, lr=.001, max_rank=20, rank_adaptive=True,  
                   warmup_epochs=50, kl_multiplier=1e-4, no_kl_epochs=5)

TypeError: linear(): argument 'input' (position 1) must be Tensor, not list

In [38]:
train_CMU_mosi(batch_size=32, epochs=100, lr=.001, max_rank=20, rank_adaptive=False,  
                   warmup_epochs=50, kl_multiplier=1e-4, no_kl_epochs=5)

Epoch 1
Training Loss 2.1427
Validation MSE 0.009098
Epoch 2
Training Loss 1.5855
Validation MSE 0.008205
Epoch 3
Training Loss 1.3448
Validation MSE 0.006969
Epoch 4
Training Loss 1.1076
Validation MSE 0.006593
Epoch 5
Training Loss 0.9306
Validation MSE 0.006355
Epoch 6
Training Loss 0.8845
Validation MSE 0.006940
Epoch 7
Training Loss 0.7350
Validation MSE 0.006779
Epoch 8
Training Loss 0.6082
Validation MSE 0.006832
Epoch 9
Training Loss 0.5367
Validation MSE 0.006847
Epoch 10
Training Loss 0.4709
Validation MSE 0.007109
Epoch 11
Training Loss 0.4176
Validation MSE 0.006958
Epoch 12
Training Loss 0.3145
Validation MSE 0.007326
Epoch 13
Training Loss 0.2755
Validation MSE 0.006999
Epoch 14
Training Loss 0.2453
Validation MSE 0.007049
Epoch 15
Training Loss 0.2315
Validation MSE 0.007468
Epoch 16
Training Loss 0.2451
Validation MSE 0.006876
Epoch 17
Training Loss 0.1809
Validation MSE 0.007385
Epoch 18
Training Loss 0.1557
Validation MSE 0.007211
Epoch 19
Training Loss 0.1444
Validat