In [1]:
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
import torch
import numpy as np
from model import SubNet, TextSubNet, TFN, LMF, AdaptiveRankFusion
from torch.distributions.half_cauchy import HalfCauchy
from torch.distributions.normal import Normal
from datasets import get_cmu_mosi_dataset

In [2]:
train_set, valid_set, test_set = get_cmu_mosi_dataset(binary=True)

In [12]:
batch_size = 32
epochs = 50
# settings from https://github.com/Justin1904/TensorFusionNetworks/blob/master/train.py
input_dims = (5, 20, 300)
hidden_dims = (4, 16, 128)
text_out = 64
dropouts = (0.3, 0.3, 0.3, 0.3)
post_fusion_dim = 32
model = TFN(input_dims, hidden_dims, text_out, dropouts, post_fusion_dim)



In [6]:
batch_size=32
train_dataloader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
valid_dataloader = DataLoader(valid_set, batch_size=len(valid_set))
test_dataloader = DataLoader(test_set, batch_size=len(test_set))

In [14]:
criterion = nn.L1Loss()
optimizer = optim.Adam(list(model.parameters())[2:])
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=2, verbose=True)
for e in range(epochs):
    print('Epoch {}'.format(e))
    train_loss = 0.0
    model.train()
    for text, audio, vision, label in train_dataloader:
        model.zero_grad()
        output = model(audio, vision, text)
        loss = criterion(output, label)
        loss.backward()
        train_loss += loss.item()
        optimizer.step()
    print(train_loss)
    
    model.eval()
    for text, audio, vision, label in valid_dataloader:
        output = model(audio, vision, text)
        valid_loss = criterion(output, label).item()
    scheduler.step(valid_loss)
    print(valid_loss)

Epoch 0




51.58957976102829
1.1529542207717896
Epoch 1
39.98873919248581
1.1220371723175049
Epoch 2
36.850977063179016
1.0471466779708862
Epoch 3
33.12352240085602
1.015275239944458
Epoch 4
31.333762228488922
1.1576157808303833
Epoch 5
31.159377694129944
1.030088186264038
Epoch 6
28.39326113462448
Epoch     7: reducing learning rate of group 0 to 1.0000e-04.
1.0263044834136963
Epoch 7
26.078719526529312
0.9703226685523987
Epoch 8
25.163460284471512
0.96832275390625
Epoch 9
24.169587582349777
0.9733483195304871
Epoch 10
24.36491909623146
0.984216034412384
Epoch 11
23.912283927202225
Epoch    12: reducing learning rate of group 0 to 1.0000e-05.
0.9718354940414429
Epoch 12
23.52481299638748
0.9737423658370972
Epoch 13
23.00118261575699
0.9700931906700134
Epoch 14
23.31677833199501
0.9659343957901001
Epoch 15
24.225109219551086
0.9717851877212524
Epoch 16
23.423350244760513
0.9671643972396851
Epoch 17
23.49833881855011
Epoch    18: reducing learning rate of group 0 to 1.0000e-06.
0.9698118567466736


In [19]:
rank = 4
output_dim = 1
model = LMF(input_dims, hidden_dims, text_out, dropouts, output_dim, rank, use_softmax=False)

  xavier_normal(self.audio_factor)
  xavier_normal(self.video_factor)
  xavier_normal(self.text_factor)
  xavier_normal(self.fusion_weights)


In [21]:
criterion = nn.L1Loss()
factors = list(model.parameters())[:3]
other = list(model.parameters())[3:]
factor_lr = 0.0005
lr = 0.001
optimizer = optim.Adam([{"params": factors, "lr": factor_lr}, {"params": other, "lr": lr}])
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=2, verbose=True)
for e in range(epochs):
    print('Epoch {}'.format(e))
    train_loss = 0.0
    model.train()
    for text, audio, vision, label in train_dataloader:
        model.zero_grad()
        output = model(audio, vision, text)
        loss = criterion(output, label)
        loss.backward()
        train_loss += loss.item()
        optimizer.step()
    print(train_loss)
    
    model.eval()
    for text, audio, vision, label in valid_dataloader:
        output = model(audio, vision, text)
        valid_loss = criterion(output, label).item()
    scheduler.step(valid_loss)
    print(valid_loss)

Epoch 0
22.86905947327614
1.000284194946289
Epoch 1
20.729330733418465
1.0048730373382568
Epoch 2
20.46320030093193
0.9808865785598755
Epoch 3
18.937021389603615
0.9868036508560181
Epoch 4
18.58921778202057
1.0054031610488892
Epoch 5
17.24204032123089
Epoch     6: reducing learning rate of group 0 to 5.0000e-05.
Epoch     6: reducing learning rate of group 1 to 1.0000e-04.
1.012037992477417
Epoch 6
16.150738149881363
0.9593955874443054
Epoch 7
14.970886915922165
0.9665323495864868
Epoch 8
14.458706766366959
0.9764308929443359
Epoch 9
14.091897323727608
Epoch    10: reducing learning rate of group 0 to 5.0000e-06.
Epoch    10: reducing learning rate of group 1 to 1.0000e-05.
0.9663721323013306
Epoch 10
13.62071168422699
0.965540885925293
Epoch 11
13.827472314238548
0.9655867218971252
Epoch 12
13.69283601641655
Epoch    13: reducing learning rate of group 0 to 5.0000e-07.
Epoch    13: reducing learning rate of group 1 to 1.0000e-06.
0.9673418998718262
Epoch 13
13.770007252693176
0.967118

In [8]:
class AdaptiveRankFusionLayer(nn.Module):

    def __init__(self, input_sizes, output_size, max_rank=10, eta=0.01):
        '''
        args:
            input_sizes: a tuple of ints, (input_size_1, input_size_2, ..., input_size_M)
            output_sizes: an int, output size of the fusion layer
            dropout: a float, dropout probablity after fusion
            max_rank: an int, maximum rank for the CP decomposition
            eta: a float, hyperparameter for rank parameter distribution
        '''
        super(AdaptiveRankFusionLayer, self).__init__()

        self.input_sizes = input_sizes
        self.output_size = output_size
        self.max_rank = max_rank
        self.eta = eta

        # CP decomposition factors for the weight tensor
        self.factors = nn.ParameterList([nn.init.xavier_normal_(nn.Parameter(torch.empty(s, max_rank))) 
                                        for s in input_sizes+(output_size,)])
        # rank parameter and its distribution for adaptive rank
        self.rank_param = nn.Parameter(torch.rand((max_rank,)))
        self.rank_param_dist = HalfCauchy(eta)

    def forward(self, inputs):
        '''
        args:
            inputs: a list of vectors, (input_1, input_2, ..., input_M)
        return:
            y = [(input_1 @ factor_1) (input_2 @ factor_2) ... (input_M @ factor_M)] @ factor_{M+1}.T
        '''

        y = 1.0
        for i, x in enumerate(inputs):
            y = y * (x @ self.factors[i])
        y = y @ self.factors[-1].T

        return y

    def get_log_prior(self):
        '''
        return:
            log_prior = log[HalfCauchy(rank_param | eta)] + log[Normal(factor_1 | 0, rank_param)]
                    + log[Normal(factor_2 | 0, rank_param)] + ... + log[Normal(factor_{M+1} | 0, rank_param)]
        '''
        # clamp rank_param because <=0 is undefined 
        clamped_rank_param = self.rank_param.clamp(0.01)
        log_prior = torch.sum(self.rank_param_dist.log_prob(clamped_rank_param))

        # 0 mean normal distribution for the factors
        factor_dist = Normal(0, clamped_rank_param)
        for factor in self.factors:
            log_prior = log_prior + torch.sum(factor_dist.log_prob(factor))
        
        return log_prior


In [28]:
class AdaptiveRankFusion(nn.Module):

    def __init__(self, input_sizes, hidden_sizes, dropouts, output_size, max_rank=10, eta=0.01):
        '''
        args:
            input_sizes: a tuple of ints, (audio_in, video_in, ... text_in)
            hidden_sizes: a tuple of ints, (audio_hidden, video_hidden, ... text_hidden)
            dropouts: a tuple of floats, (dropout_1, dropout_2, ..., dropout_M, post_fusion_dropout)
            output_size: an int, output size for fusion layer
            max_rank: an int, maximum rank for the CP decomposition
        '''
        super(AdaptiveRankFusion, self).__init__()
        
        # define the pre-fusion subnetworks
        self.audio_subnet = SubNet(input_sizes[0], hidden_sizes[0], dropouts[0])
        self.video_subnet = SubNet(input_sizes[1], hidden_sizes[1], dropouts[1])
        self.text_subnet = TextSubNet(input_sizes[2], hidden_sizes[2], hidden_sizes[2]//2, dropout=dropouts[2])
        
        fusion_input_sizes = (hidden_sizes[0]+1, hidden_sizes[1]+1, hidden_sizes[2]//2+1)
        # define fusion layer
        self.fusion_layer = AdaptiveRankFusionLayer(input_sizes=fusion_input_sizes,
                                                    output_size=output_size,
                                                    max_rank=max_rank,
                                                    eta=eta)
        self.post_fusion_dropout = nn.Dropout(dropouts[-1])

    def forward(self, audio_x, video_x, text_x):

        audio_h = self.audio_subnet(audio_x)
        video_h = self.video_subnet(video_x)
        text_h = self.text_subnet(text_x)

        batch_size = audio_h.shape[0]

        audio_h = torch.cat((audio_h, torch.ones((batch_size, 1))), dim=1)
        video_h = torch.cat((video_h, torch.ones((batch_size, 1))), dim=1)
        text_h = torch.cat((text_h, torch.ones((batch_size, 1))), dim=1)

        output = self.fusion_layer([audio_h, video_h, text_h])
        output = self.post_fusion_dropout(output)
        
        return output
        

In [36]:
input_dims = (5, 20, 300)
hidden_dims = (4, 16, 128)
output_size = (1)
dropouts = (0.2, 0.2, 0.2, 0.2)
model = AdaptiveRankFusion(input_dims, hidden_dims, dropouts, output_size)




In [37]:
criterion = nn.BCEWithLogitsLoss()
factor_lr = 0.0001
lr = 0.001
subnet_params = list(model.audio_subnet.parameters()) + list(model.video_subnet.parameters()) + list(model.text_subnet.parameters())
optimizer = optim.Adam([{"params": subnet_params, "lr": lr}, 
                        {"params": list(model.fusion_layer.parameters()), "lr": factor_lr}])
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=2, verbose=True)
epochs = 50
for e in range(epochs):
    print('Epoch {}'.format(e))
    train_loss = 0.0
    model.train()
    for text, audio, vision, label in train_dataloader:
        model.zero_grad()
        output = model(audio, vision, text)
        loss = criterion(output, label) - 0.1 * model.fusion_layer.get_log_prior()
        loss.backward()
        train_loss += loss.item()
        optimizer.step()
    print(train_loss)
    
    model.eval()
    for text, audio, vision, label in valid_dataloader:
        output = model(audio, vision, text)
        valid_loss = criterion(output, label).item()
    scheduler.step(valid_loss)
    print(valid_loss)

Epoch 0
1547.292766571045
0.3096248209476471
Epoch 1
1292.0612697601318
-1.2837207317352295
Epoch 2
721.7102184295654
-10.459827423095703
Epoch 3
-954.6527261734009
-42.06126403808594
Epoch 4
-12695.144991874695
-273.0089111328125
Epoch 5
-71993.96398925781
-930.1858520507812
Epoch 6
-273316.80923461914
-6793.36376953125
Epoch 7
-1391102.5603027344
-31626.0234375
Epoch 8
-9069991.73803711
-377204.4375
Epoch 9


KeyboardInterrupt: 

In [35]:
model.fusion_layer.rank_param

Parameter containing:
tensor([0.6895, 0.4420, 0.6921, 0.6713, 0.6768, 0.3938, 0.4589, 0.5104, 0.4994,
        0.4866], requires_grad=True)

In [None]:
criterion = nn.L1Loss()
factors = list(model.parameters())[:3]
other = list(model.parameters())[3:]
factor_lr = 0.0005
lr = 0.001
optimizer = optim.Adam([{"params": factors, "lr": factor_lr}, {"params": other, "lr": lr}])
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=2, verbose=True)
for e in range(epochs):
    print('Epoch {}'.format(e))
    train_loss = 0.0
    model.train()
    for text, audio, vision, label in train_dataloader:
        model.zero_grad()
        output = model(audio, vision, text)
        loss = criterion(output, label)
        loss.backward()
        train_loss += loss.item()
        optimizer.step()
    print(train_loss)
    
    model.eval()
    for text, audio, vision, label in valid_dataloader:
        output = model(audio, vision, text)
        valid_loss = criterion(output, label).item()
    scheduler.step(valid_loss)
    print(valid_loss)

Epoch 0
22.86905947327614
1.000284194946289
Epoch 1
20.729330733418465
1.0048730373382568
Epoch 2
20.46320030093193
0.9808865785598755
Epoch 3
18.937021389603615
0.9868036508560181
Epoch 4
18.58921778202057
1.0054031610488892
Epoch 5
17.24204032123089
Epoch     6: reducing learning rate of group 0 to 5.0000e-05.
Epoch     6: reducing learning rate of group 1 to 1.0000e-04.
1.012037992477417
Epoch 6
16.150738149881363
0.9593955874443054
Epoch 7
14.970886915922165
0.9665323495864868
Epoch 8
14.458706766366959
0.9764308929443359
Epoch 9
14.091897323727608
Epoch    10: reducing learning rate of group 0 to 5.0000e-06.
Epoch    10: reducing learning rate of group 1 to 1.0000e-05.
0.9663721323013306
Epoch 10
13.62071168422699
0.965540885925293
Epoch 11
13.827472314238548
0.9655867218971252
Epoch 12
13.69283601641655
Epoch    13: reducing learning rate of group 0 to 5.0000e-07.
Epoch    13: reducing learning rate of group 1 to 1.0000e-06.
0.9673418998718262
Epoch 13
13.770007252693176
0.967118

In [14]:
input_sizes = (64, 64, 128)
output_size = (1)

layer = AdaptiveRankFusionLayer(input_sizes, output_size)

In [15]:
layer.factors

ParameterList(
    (0): Parameter containing: [torch.FloatTensor of size 64x10]
    (1): Parameter containing: [torch.FloatTensor of size 64x10]
    (2): Parameter containing: [torch.FloatTensor of size 128x10]
    (3): Parameter containing: [torch.FloatTensor of size 1x10]
)

In [16]:
layer.get_log_prior()

tensor(-329.0527, grad_fn=<AddBackward0>)