In [7]:
class TextSubNet(nn.Module):
    '''
    The LSTM-based subnetwork that is used in TFN to encode one modality.

    The input sequence is run through an LSTM; the final hidden state of the
    last layer goes through dropout and a linear projection to out_size.
    '''

    def __init__(self, in_size, hidden_size, out_size, num_layers=1, dropout=0.2, bidirectional=False):
        '''
        Args:
            in_size: input dimension
            hidden_size: hidden layer dimension
            out_size: dimension of the returned embedding
            num_layers: specify the number of layers of LSTMs.
            dropout: dropout probability (between LSTM layers and before the projection)
            bidirectional: specify usage of bidirectional LSTM
        Output:
            (return value in forward) a tensor of shape (batch_size, out_size)
        '''
        super(TextSubNet, self).__init__()
        # nn.LSTM only applies dropout between stacked layers and warns when a
        # non-zero value is passed for a single layer, so pass 0 in that case.
        rnn_dropout = dropout if num_layers > 1 else 0.0
        self.rnn = nn.LSTM(in_size, hidden_size, num_layers=num_layers, dropout=rnn_dropout,
                           bidirectional=bidirectional, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        # a bidirectional LSTM yields one final state per direction; both are concatenated in forward
        num_directions = 2 if bidirectional else 1
        self.linear_1 = nn.Linear(hidden_size * num_directions, out_size)

    def forward(self, x):
        '''
        Args:
            x: tensor of shape (batch_size, sequence_len, in_size)
        Returns:
            tensor of shape (batch_size, out_size)
        '''
        _, (h_n, _) = self.rnn(x)
        # h_n is (num_layers * num_directions, batch_size, hidden_size). Index the last
        # layer explicitly instead of squeeze(): squeeze() would also drop the batch axis
        # when batch_size == 1 and would keep the layer axis when there are several layers.
        if self.rnn.bidirectional:
            last_hidden = torch.cat((h_n[-2], h_n[-1]), dim=1)
        else:
            last_hidden = h_n[-1]
        h = self.dropout(last_hidden)
        y_1 = self.linear_1(h)
        return y_1


class TFN(nn.Module):
    '''
    Two-modality variant of the Tensor Fusion Network described in:
    Zadeh, Amir, et al. "Tensor fusion network for multimodal sentiment analysis." EMNLP 2017 Oral.

    Each modality is encoded by its own LSTM subnetwork (TextSubNet); the two embeddings are
    fused with a batched outer product and regressed to a single value per sample.
    '''

    def __init__(self, input_dims, hidden_dims, text_out, dropouts, post_fusion_dim):
        '''
        Args:
            input_dims - a length-2 sequence, contains (audio_dim, text_dim)
            hidden_dims - a length-2 sequence, contains (audio_hidden, text_hidden)
            text_out - a length-2 sequence, contains (audio_out, text_out): output size of each subnetwork
            dropouts - a length-3 sequence, contains (audio_dropout, text_dropout, post_fusion_dropout)
            post_fusion_dim - int, specifying the size of the sub-networks after tensorfusion
        Output:
            (return value in forward) a tensor of shape (batch_size, 1); no output range is enforced
        '''
        super(TFN, self).__init__()

        # dimensions are specified in the order of audio, text
        self.audio_in = input_dims[0]
        self.text_in = input_dims[1]

        self.audio_hidden = hidden_dims[0]
        self.text_hidden = hidden_dims[1]
        self.audio_out = text_out[0]
        self.text_out = text_out[1]
        self.post_fusion_dim = post_fusion_dim

        self.audio_prob = dropouts[0]
        self.text_prob = dropouts[1]
        self.post_fusion_prob = dropouts[2]

        # define the pre-fusion subnetworks
        self.audio_subnet = TextSubNet(self.audio_in, self.audio_hidden, self.audio_out, dropout=self.audio_prob)
        self.text_subnet = TextSubNet(self.text_in, self.text_hidden, self.text_out, dropout=self.text_prob)

        # define the post_fusion layers
        # The flattened fusion tensor has (audio_out + 1) * (text_out + 1) entries: the factors are the
        # subnetwork *output* sizes, not the LSTM hidden sizes (they only coincide when both are equal).
        self.post_fusion_dropout = nn.Dropout(p=self.post_fusion_prob)
        self.post_fusion_layer_1 = nn.Linear((self.audio_out + 1) * (self.text_out + 1), self.post_fusion_dim)
        self.post_fusion_layer_2 = nn.Linear(self.post_fusion_dim, self.post_fusion_dim)
        self.post_fusion_layer_3 = nn.Linear(self.post_fusion_dim, 1)

    def forward(self, audio_x, text_x):
        '''
        Args:
            audio_x: tensor of shape (batch_size, sequence_len, audio_in)
            text_x: tensor of shape (batch_size, sequence_len, text_in)
        Returns:
            tensor of shape (batch_size, 1)
        '''
        audio_h = self.audio_subnet(audio_x)
        text_h = self.text_subnet(text_x)
        batch_size = audio_h.shape[0]

        # "tensor fusion": prepend a constant 1 to each embedding and take their outer product.
        # new_ones creates the constant on the same device and with the same dtype as the embedding.
        _audio_h = torch.cat((audio_h.new_ones(batch_size, 1), audio_h), dim=1)
        _text_h = torch.cat((text_h.new_ones(batch_size, 1), text_h), dim=1)

        # _audio_h has shape (batch_size, audio_out + 1), _text_h has shape (batch_size, text_out + 1)
        # (batch_size, audio_out + 1, 1) X (batch_size, 1, text_out + 1)
        # -> fusion_tensor of shape (batch_size, audio_out + 1, text_out + 1)
        fusion_tensor = torch.bmm(_audio_h.unsqueeze(2), _text_h.unsqueeze(1))

        # the 2-D fusion map is not kept; it is flattened per sample for the dense layers
        fusion_tensor = fusion_tensor.view(batch_size, -1)

        post_fusion_dropped = self.post_fusion_dropout(fusion_tensor)
        post_fusion_y_1 = F.relu(self.post_fusion_layer_1(post_fusion_dropped))
        post_fusion_y_2 = F.relu(self.post_fusion_layer_2(post_fusion_y_1))
        output = self.post_fusion_layer_3(post_fusion_y_2)

        return output

In [8]:
import os
import numpy as np

# The relative data path below is resolved from the parent of the current working directory.
os.chdir(os.path.pardir)

# load data from file
save_folder = 'data/raw/pol_temp_rh'
save_file_name = [
    'fea_seq.npy',
    'last_observation_seq.npy',
    'label_seq.npy',
    'masking_seq.npy',
    'delta_seq.npy',
    'train_valid_test_split.npy',
]
saved_arrays = [np.load(os.path.join(save_folder, file_name)) for file_name in save_file_name]
# unpack in the same order as save_file_name
(fea_seq, last_observation_seq, label_seq,
 masking_seq, delta_seq, train_valid_test_split) = saved_arrays

In [9]:
# train / dev / test split: three consecutive index ranges whose lengths are
# train_valid_test_split[0], [1] and [2] respectively
train_index = list(range(train_valid_test_split[0]))
dev_index = list(range(
    train_valid_test_split[0],
    train_valid_test_split[0] + train_valid_test_split[1],
))
test_index = list(range(
    train_valid_test_split[0] + train_valid_test_split[1],
    train_valid_test_split[0] + train_valid_test_split[1] + train_valid_test_split[2],
))

In [10]:
def get_array_by_index_range(nparray_list, label_array, index_range):
    '''
    Select the rows of several arrays whose label is not NaN.

    nparray_list: list of nparrays, each indexed along axis 0 with the kept indices
    label_array: nparray of labels; label_array[index] is tested with np.isnan
    index_range: iterable of candidate indices

    Returns (list of selected nparrays, selected labels flattened to 1-D).
    '''
    # keep only the candidate indices that carry a usable label
    kept = [idx for idx in index_range if not np.isnan(label_array[idx])]

    selected_arrays = [arr[kept] for arr in nparray_list]
    selected_labels = label_array[kept].reshape(-1)
    return selected_arrays, selected_labels

In [11]:
# Split the feature / last-observation arrays into train, dev and test sets.
# Samples whose label is NaN are dropped by get_array_by_index_range.
[fea_train, last_train], label_train = get_array_by_index_range(
    [fea_seq, last_observation_seq], label_seq, train_index)
[fea_dev, last_dev], label_dev = get_array_by_index_range(
    [fea_seq, last_observation_seq], label_seq, dev_index)
[fea_test, last_test], label_test = get_array_by_index_range(
    [fea_seq, last_observation_seq], label_seq, test_index)

In [12]:
def normalize_feature(fea_train, array_list):
    """
    Standardize arrays with the NaN-aware mean / std of fea_train taken over axis 0.

    fea_train: training features the statistics are computed from
    array_list: [fea_dev, fea_test, last_train, last_dev, last_test] to normalize
    Returns (normalized fea_train, list of normalized arrays in the order of array_list).
    """
    center = np.nanmean(fea_train, axis=0)
    scale = np.nanstd(fea_train, axis=0)

    normalized_train = (fea_train - center) / scale
    normalized_rest = []
    for arr in array_list:
        normalized_rest.append((arr - center) / scale)
    return (normalized_train, normalized_rest)

In [13]:
# Normalize every split with the statistics of the training features.
# NOTE: fea_train is overwritten here, so re-running this cell normalizes already-normalized data.
fea_train, [fea_dev, fea_test, last_train, last_dev, last_test] = normalize_feature(
    fea_train,
    [fea_dev, fea_test, last_train, last_dev, last_test],
)

In [14]:
# Record the per-position feature mean (NaNs ignored, taken over axis 0) of the training
# features after normalization. Since fea_train was standardized with its own mean/std,
# these values should be close to 0.
x_mean_aft_nor = np.nanmean(fea_train, axis=0)

In [27]:
from __future__ import print_function
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.parameter import Parameter
from torch.nn.init import xavier_uniform, xavier_normal, orthogonal
import torch.optim as optim
import random
from torch.autograd import Variable, grad
from torch.optim.lr_scheduler import ReduceLROnPlateau
import math

In [55]:
# Sanity check of the training array shape; train_mfn reads axis 1 as the sequence
# length and axis 2 as the feature size.
fea_train.shape

(664, 7, 5)

In [56]:
def train_mfn(X_train, y_train, X_valid, y_valid, X_test, y_test, configs):
    '''
    Train a TFN regressor with Adam + MSE loss, keep the weights with the lowest
    validation loss, and print the MAE / MSE of that model on the test set.

    The training data is NOT shuffled: mini-batches are taken in the original order.

    Args:
        X_train, X_valid, X_test: arrays indexed as (sample, time step, feature);
            they are converted with torch.Tensor and passed to the model as a single input
        y_train, y_valid, y_test: 1-D target arrays aligned with the inputs
        configs: dict with the keys "input_dims", "h_dims", "text_out", "dropouts",
            "post_fusion_dim" (forwarded to TFN) and "lr", "batchsize", "num_epochs"
            (used by the optimizer / training loop)
    Returns:
        None. Progress and the final metrics are printed.
    '''
    import os  # local import: only needed to manage the checkpoint directory

    model = TFN(configs["input_dims"], configs["h_dims"], configs["text_out"],
                configs["dropouts"], configs["post_fusion_dim"])

    # All hyper-parameters are read from the `configs` argument (not from a notebook global).
    optimizer = optim.Adam(model.parameters(), lr=configs["lr"])

    criterion = nn.MSELoss()
    device = torch.device('cpu')
    model = model.to(device)
    criterion = criterion.to(device)
    scheduler = ReduceLROnPlateau(optimizer, mode="min", patience=10, factor=0.5, verbose=True)

    def to_tensor(array):
        # torch.Tensor(...) converts the numpy input to a float32 tensor
        return torch.Tensor(array).to(device)

    def train(model, batchsize, X_train, y_train, optimizer, criterion):
        # one pass over the training data in consecutive mini-batches; returns the mean batch loss
        epoch_loss = 0
        model.train()
        total_n = X_train.shape[0]
        num_batches = math.ceil(total_n / batchsize)
        for batch in range(num_batches):
            start = batch * batchsize
            end = (batch + 1) * batchsize
            optimizer.zero_grad()
            batch_X = to_tensor(X_train[start:end, :])
            batch_y = to_tensor(y_train[start:end])
            predictions = model(batch_X).squeeze(1)
            loss = criterion(predictions, batch_y)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        return epoch_loss / num_batches

    def evaluate(model, X_valid, y_valid, criterion):
        # loss over the whole validation set in a single batch, without gradients
        model.eval()
        with torch.no_grad():
            predictions = model(to_tensor(X_valid)).squeeze(1)
            epoch_loss = criterion(predictions, to_tensor(y_valid)).item()
        return epoch_loss

    def predict(model, X_test):
        # predictions for the whole test set as a numpy array
        model.eval()
        with torch.no_grad():
            predictions = model(to_tensor(X_test)).squeeze(1)
            predictions = predictions.cpu().numpy()
        return predictions

    # Checkpoint of the best model. The random suffix keeps different runs from overwriting each other.
    rand = random.randint(0, 100000)
    checkpoint_dir = 'models/temp_models'
    os.makedirs(checkpoint_dir, exist_ok=True)
    checkpoint_path = os.path.join(checkpoint_dir, 'mfn_%d.pt' % rand)
    has_checkpoint = False

    best_valid = float('inf')
    print('epoch train_loss valid_loss')
    for epoch in range(configs["num_epochs"]):
        train_loss = train(model, configs["batchsize"], X_train, y_train, optimizer, criterion)
        valid_loss = evaluate(model, X_valid, y_valid, criterion)
        scheduler.step(valid_loss)
        if valid_loss <= best_valid:
            # save model (weights only: portable and not tied to pickling the class itself)
            best_valid = valid_loss
            print(epoch, train_loss, valid_loss, 'saving model')
            torch.save(model.state_dict(), checkpoint_path)
            has_checkpoint = True
        else:
            print(epoch, train_loss, valid_loss)

    # Restore the best weights. If nothing was saved (zero epochs, or a validation loss that
    # was never comparable, e.g. NaN) the model is evaluated as it is.
    if has_checkpoint:
        model.load_state_dict(torch.load(checkpoint_path))

    predictions = predict(model, X_test)
    mae = np.mean(np.absolute(predictions - y_test))
    print("mae: ", mae)
    mse = np.mean((predictions - y_test) ** 2)
    print("mse: ", mse)

In [66]:
class TextSubNet(nn.Module):
    '''
    The LSTM-based subnetwork that is used in TFN to encode one modality.

    The input sequence is run through a (by default 7-layer) LSTM; the final hidden
    state of the last layer goes through dropout and a linear projection to out_size.
    '''

    def __init__(self, in_size, hidden_size, out_size, num_layers=7, dropout=0.2, bidirectional=False):
        '''
        Args:
            in_size: input dimension
            hidden_size: hidden layer dimension
            out_size: dimension of the returned embedding
            num_layers: specify the number of layers of LSTMs.
            dropout: dropout probability (between LSTM layers and before the projection)
            bidirectional: specify usage of bidirectional LSTM
        Output:
            (return value in forward) a tensor of shape (batch_size, out_size)
        '''
        super(TextSubNet, self).__init__()
        # nn.LSTM only applies dropout between stacked layers and warns when a
        # non-zero value is passed for a single layer, so pass 0 in that case.
        rnn_dropout = dropout if num_layers > 1 else 0.0
        self.rnn = nn.LSTM(in_size, hidden_size, num_layers=num_layers, dropout=rnn_dropout,
                           bidirectional=bidirectional, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        # a bidirectional LSTM yields one final state per direction; both are concatenated in forward
        num_directions = 2 if bidirectional else 1
        self.linear_1 = nn.Linear(hidden_size * num_directions, out_size)

    def forward(self, x):
        '''
        Args:
            x: tensor of shape (batch_size, sequence_len, in_size)
        Returns:
            tensor of shape (batch_size, out_size)
        '''
        _, (h_n, _) = self.rnn(x)
        # h_n is (num_layers * num_directions, batch_size, hidden_size). The last layer is
        # indexed directly; no squeeze() is applied because it would also remove the batch
        # axis when batch_size == 1.
        if self.rnn.bidirectional:
            last_hidden = torch.cat((h_n[-2], h_n[-1]), dim=1)
        else:
            last_hidden = h_n[-1]
        h = self.dropout(last_hidden)
        y_1 = self.linear_1(h)
        return y_1


class TFN(nn.Module):
    '''
    Two-modality variant of the Tensor Fusion Network described in:
    Zadeh, Amir, et al. "Tensor fusion network for multimodal sentiment analysis." EMNLP 2017 Oral.

    A single input tensor is split along its feature axis into an "audio" and a "text" part.
    Each part is encoded by its own LSTM subnetwork (TextSubNet); the two embeddings are
    fused with a batched outer product and regressed to a single value per sample.
    '''

    def __init__(self, input_dims, hidden_dims, text_out, dropouts, post_fusion_dim):
        '''
        Args:
            input_dims - a length-2 sequence, contains (audio_dim, text_dim)
            hidden_dims - a length-2 sequence, contains (audio_hidden, text_hidden)
            text_out - a length-2 sequence, contains (audio_out, text_out): output size of each subnetwork
            dropouts - a length-3 sequence, contains (audio_dropout, text_dropout, post_fusion_dropout)
            post_fusion_dim - int, specifying the size of the sub-networks after tensorfusion
        Output:
            (return value in forward) a tensor of shape (batch_size, 1); no output range is enforced
        '''
        super(TFN, self).__init__()

        # dimensions are specified in the order of audio, text
        self.audio_in = input_dims[0]
        self.text_in = input_dims[1]

        self.audio_hidden = hidden_dims[0]
        self.text_hidden = hidden_dims[1]
        self.audio_out = text_out[0]
        self.text_out = text_out[1]
        self.post_fusion_dim = post_fusion_dim

        self.audio_prob = dropouts[0]
        self.text_prob = dropouts[1]
        self.post_fusion_prob = dropouts[2]

        # define the pre-fusion subnetworks
        self.audio_subnet = TextSubNet(self.audio_in, self.audio_hidden, self.audio_out, dropout=self.audio_prob)
        self.text_subnet = TextSubNet(self.text_in, self.text_hidden, self.text_out, dropout=self.text_prob)

        # define the post_fusion layers
        # The flattened fusion tensor has (audio_out + 1) * (text_out + 1) entries: the factors are the
        # subnetwork *output* sizes, not the LSTM hidden sizes (they only coincide when both are equal).
        self.post_fusion_dropout = nn.Dropout(p=self.post_fusion_prob)
        self.post_fusion_layer_1 = nn.Linear((self.audio_out + 1) * (self.text_out + 1), self.post_fusion_dim)
        self.post_fusion_layer_2 = nn.Linear(self.post_fusion_dim, self.post_fusion_dim)
        self.post_fusion_layer_3 = nn.Linear(self.post_fusion_dim, 1)

    def forward(self, input_x):
        '''
        Args:
            input_x: tensor of shape (batch_size, sequence_len, n_features); the first audio_in
                features feed the audio subnetwork and the next text_in features feed the
                text subnetwork
        Returns:
            tensor of shape (batch_size, 1)
        '''
        audio_x = input_x[:, :, :self.audio_in]
        text_x = input_x[:, :, self.audio_in:self.audio_in + self.text_in]
        audio_h = self.audio_subnet(audio_x)
        text_h = self.text_subnet(text_x)
        batch_size = audio_h.shape[0]

        # "tensor fusion": prepend a constant 1 to each embedding and take their outer product.
        # new_ones creates the constant on the same device and with the same dtype as the embedding.
        _audio_h = torch.cat((audio_h.new_ones(batch_size, 1), audio_h), dim=1)
        _text_h = torch.cat((text_h.new_ones(batch_size, 1), text_h), dim=1)

        # _audio_h has shape (batch_size, audio_out + 1), _text_h has shape (batch_size, text_out + 1)
        # (batch_size, audio_out + 1, 1) X (batch_size, 1, text_out + 1)
        # -> fusion_tensor of shape (batch_size, audio_out + 1, text_out + 1)
        fusion_tensor = torch.bmm(_audio_h.unsqueeze(2), _text_h.unsqueeze(1))

        # the 2-D fusion map is not kept; it is flattened per sample for the dense layers
        fusion_tensor = fusion_tensor.view(batch_size, -1)

        post_fusion_dropped = self.post_fusion_dropout(fusion_tensor)
        post_fusion_y_1 = F.relu(self.post_fusion_layer_1(post_fusion_dropped))
        post_fusion_y_2 = F.relu(self.post_fusion_layer_2(post_fusion_y_1))
        output = self.post_fusion_layer_3(post_fusion_y_2)

        return output

In [70]:
# Experiment: width 128, dropout 0.7, lr 1e-3
hl = 128
ha = 128
config = {
    "input_dims": [1, 4],   # feature split of the input: first 1 column, then the next 4
    "h_dims": [hl, ha],
    "text_out": (hl, ha),
    "dropouts": (0.7, 0.7, 0.7),
    "post_fusion_dim": hl,
    "batchsize": hl,
    "num_epochs": 50,
    "lr": 0.001,
}

# fix the numpy / torch seeds before training
seed = 123
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
train_mfn(last_train, label_train, last_dev, label_dev, last_test, label_test, config)

epoch train_loss valid_loss
0 151.12362798055014 93.65888214111328 saving model
1 86.34914620717366 44.03561019897461 saving model
2 42.66665426890055 20.319063186645508 saving model
3 44.92241954803467 17.869155883789062 saving model
4 38.23664093017578 18.1877384185791
5 36.67391268412272 17.75365447998047 saving model
6 37.218139012654625 18.480466842651367
7 38.26812203725179 17.94582748413086
8 37.21442731221517 18.217092514038086
9 35.80371252695719 18.401939392089844
10 35.916890144348145 18.30585289001465
11 36.411628087361656 18.577512741088867
12 35.58442465464274 18.163572311401367
13 37.7751522064209 18.312070846557617
14 37.28560988108317 18.125551223754883
15 35.509450912475586 18.287067413330078
Epoch    17: reducing learning rate of group 0 to 5.0000e-04.
16 36.22530206044515 18.38686180114746
17 34.77285925547282 19.009708404541016
18 34.09031836191813 18.618375778198242
19 35.99804814656576 19.178516387939453
20 33.9280424118042 18.831621170043945
21 35.46969827016195

In [69]:
# Experiment: width 32, dropout 0.5, lr 1e-4
hl = 32
ha = 32
config = {
    "input_dims": [1, 4],   # feature split of the input: first 1 column, then the next 4
    "h_dims": [hl, ha],
    "text_out": (hl, ha),
    "dropouts": (0.5, 0.5, 0.5),
    "post_fusion_dim": hl,
    "batchsize": hl,
    "num_epochs": 50,
    "lr": 0.0001,
}

# fix the numpy / torch seeds before training
seed = 123
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
train_mfn(last_train, label_train, last_dev, label_dev, last_test, label_test, config)

epoch train_loss valid_loss
0 141.4877210344587 110.12069702148438 saving model
1 140.76679011753626 109.37950134277344 saving model
2 139.81058029901413 108.35891723632812 saving model
3 138.21675836472284 106.33660125732422 saving model
4 134.8403607323056 101.5842514038086 saving model
5 126.28743816557385 88.8018569946289 saving model
6 102.01781717936198 50.39952087402344 saving model
7 50.85084006899879 21.110916137695312 saving model
8 32.86671420506069 19.08951187133789 saving model
9 31.231712068830216 18.765867233276367 saving model
10 32.55115191141764 18.407224655151367 saving model
11 29.867077373322985 19.61494255065918
12 31.41922037942069 18.98175811767578
13 31.81729452950614 19.254167556762695
14 31.29679570879255 18.660823822021484
15 31.533252034868514 18.63995933532715
16 30.211222194489977 19.693208694458008
17 31.112482252575102 19.11903190612793
18 30.11747269403367 19.199750900268555
19 29.940886542910622 19.352855682373047
20 31.118786721002486 19.423049926757

In [68]:
# Experiment: width 64, dropout 0.7, lr 1e-4
hl = 64
ha = 64
config = {
    "input_dims": [1, 4],   # feature split of the input: first 1 column, then the next 4
    "h_dims": [hl, ha],
    "text_out": (hl, ha),
    "dropouts": (0.7, 0.7, 0.7),
    "post_fusion_dim": hl,
    "batchsize": hl,
    "num_epochs": 50,
    "lr": 0.0001,
}

# fix the numpy / torch seeds before training
seed = 123
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
train_mfn(last_train, label_train, last_dev, label_dev, last_test, label_test, config)

epoch train_loss valid_loss
0 147.58606234463778 111.87670135498047 saving model
1 146.96226570822975 111.15380096435547 saving model
2 145.9830884066495 110.06047058105469 saving model
3 144.4334820834073 108.15937042236328 saving model
4 141.6655606356534 104.71897888183594 saving model
5 136.27631933038884 97.34671783447266 saving model
6 124.28441897305575 80.16642761230469 saving model
7 96.32560244473544 41.57260513305664 saving model
8 53.197735873135656 26.51873016357422 saving model
9 39.153928236527875 19.488378524780273 saving model
10 34.927127838134766 17.92530632019043 saving model
11 36.77009235728871 20.452730178833008
12 36.341758728027344 19.849782943725586
13 32.72726882587779 18.940858840942383
14 34.71786932511763 18.81072235107422
15 35.562731916254215 18.986814498901367
16 37.82662374323065 19.108989715576172
17 33.94412508877841 19.782392501831055
18 35.02877270091664 19.65970230102539
19 34.42843922701749 18.78156852722168
20 34.22343652898615 18.54723739624023

In [58]:
# NOTE(review): scratch cell. These are the parameter names of TFN.__init__, not notebook-level
# variables, so evaluating them raises the NameError shown below.
input_dims, hidden_dims, text_out, dropouts, post_fusion_dim

NameError: name 'input_dims' is not defined

In [None]:
# NOTE(review): scratch cell. input_dims is not assigned at notebook level in the visible cells;
# presumably config["input_dims"] was intended — confirm before running.
model = TFN(input_dims, (32, 32), (32, 32), (0.3, 0.3, 0.3), 32)

In [9]:
# scratch: first operand for the element-wise product check (a*b) in a later cell
a = torch.ones([3])

In [3]:
import torch

In [4]:
# scratch: same expression as the commented-out output_shift in TFN.__init__
torch.FloatTensor([-3])

tensor([-3.])

In [6]:
# scratch: same expression as the commented-out output_range in TFN.__init__
torch.FloatTensor([6])

tensor([6.])

In [11]:
# scratch: second operand for the element-wise product check (a*b) in the next cell
b = torch.ones([3])

In [12]:
# scratch: `*` between two tensors of the same shape multiplies element-wise (see output below)
a*b

tensor([1., 1., 1.])