In [2]:
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
import torch
from model import TFN, LMF, AdaptiveRankFusion
from datasets import get_cmu_mosi_dataset

In [3]:
# Fetch the train / validation / test splits from the project's dataset helper.
# NOTE(review): binary=True presumably yields 0/1 labels, which is what the
# BCEWithLogitsLoss criterion used in the training cells expects -- confirm in datasets.py.
(train_set,
 valid_set,
 test_set) = get_cmu_mosi_dataset(binary=True)

In [4]:
# Seed the global torch RNG before any model is built so that weight
# initialisation (and anything else drawing from the global generator, such as
# the default shuffling sampler) is repeatable under Restart & Run All.
seed = 0
torch.manual_seed(seed)

# Shared training configuration, read by the data-loader and training cells.
batch_size = 32
epochs = 50
# settings from https://github.com/Justin1904/TensorFusionNetworks/blob/master/train.py
# NOTE(review): the three-element tuples are presumably ordered (audio, vision, text),
# matching the model(audio, vision, text) call order used later -- confirm in model.py.
input_dims = (5, 20, 300)
hidden_dims = (4, 16, 128)
text_out = 64
dropouts = (0.3, 0.3, 0.3, 0.3)
post_fusion_dim = 32
model = TFN(input_dims, hidden_dims, text_out, dropouts, post_fusion_dim)



In [5]:
# batch_size is taken from the configuration cell; it is intentionally not
# re-assigned here so that a change made there is not silently overridden.
train_dataloader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
# Validation and test data are each served as a single batch spanning the whole split.
valid_dataloader = DataLoader(valid_set, batch_size=len(valid_set))
test_dataloader = DataLoader(test_set, batch_size=len(test_set))

In [5]:
# Train the TFN model: one pass over train_dataloader per epoch, followed by a
# validation pass whose loss drives the learning-rate scheduler.
criterion = nn.BCEWithLogitsLoss()
# The first two entries of model.parameters() are left out of the optimizer.
# NOTE(review): presumably these are fixed, non-trainable tensors inside TFN -- confirm in model.py.
optimizer = optim.Adam(list(model.parameters())[2:])
# Reduce the learning rate once the validation loss has stopped improving for more than 2 epochs.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=2, verbose=True)
for e in range(epochs):
    print('Epoch {}'.format(e))
    train_loss = 0.0
    model.train()
    # The dataset yields (text, audio, vision, label) but the model takes (audio, vision, text).
    for text, audio, vision, label in train_dataloader:
        model.zero_grad()
        output = model(audio, vision, text)
        loss = criterion(output, label)
        loss.backward()
        train_loss += loss.item()
        optimizer.step()
    # Sum of the per-batch mean losses over the epoch (not an average).
    print(train_loss)

    model.eval()
    # Validation needs no gradients: disabling autograd avoids building a graph
    # for the full-size validation batch on every epoch.
    with torch.no_grad():
        for text, audio, vision, label in valid_dataloader:
            output = model(audio, vision, text)
            # valid_dataloader serves the whole split as one batch, so this is the full validation loss.
            valid_loss = criterion(output, label).item()
    scheduler.step(valid_loss)
    print(valid_loss)

Epoch 0




27.893936038017273
0.6493960022926331
Epoch 1
23.122301280498505
0.5518956184387207
Epoch 2
19.511977642774582
0.5078473091125488
Epoch 3
17.557215571403503
0.536212146282196
Epoch 4
15.52185334265232
0.6276272535324097
Epoch 5
14.136798739433289
Epoch     6: reducing learning rate of group 0 to 1.0000e-04.
0.5497230291366577
Epoch 6
10.833918131887913
0.5712649822235107
Epoch 7
10.596223138272762
0.5716428160667419
Epoch 8
9.911804616451263
Epoch     9: reducing learning rate of group 0 to 1.0000e-05.
0.5686128735542297
Epoch 9
9.994131349027157
0.5700634717941284
Epoch 10
8.971643168479204
0.5718305110931396
Epoch 11
9.061752662062645
Epoch    12: reducing learning rate of group 0 to 1.0000e-06.
0.5769819617271423
Epoch 12
8.987968303263187
0.5768039226531982
Epoch 13
9.079888701438904
0.5773832201957703
Epoch 14
9.024409659206867
Epoch    15: reducing learning rate of group 0 to 1.0000e-07.
0.5771111249923706
Epoch 15
9.136607177555561
0.5782935619354248
Epoch 16
9.564129255712032
0

In [11]:
# Replace `model` with a low-rank fusion network; it reuses the dimension and
# dropout settings defined in the configuration cell.
rank, output_dim = 4, 1
model = LMF(
    input_dims,
    hidden_dims,
    text_out,
    dropouts,
    output_dim,
    rank,
    use_softmax=False,
)

  xavier_normal(self.audio_factor)
  xavier_normal(self.video_factor)
  xavier_normal(self.text_factor)
  xavier_normal(self.fusion_weights)


In [12]:
# Train the LMF model with two optimizer parameter groups: the first three
# parameters use factor_lr, everything else uses lr.
criterion = nn.BCEWithLogitsLoss()
# Materialise the parameter list once and slice it, instead of walking the model twice.
all_params = list(model.parameters())
# NOTE(review): the first three parameters are presumably the per-modality low-rank
# factors (the group is named `factors`) -- confirm the ordering in model.py.
factors = all_params[:3]
other = all_params[3:]
factor_lr = 0.0005
lr = 0.001
optimizer = optim.Adam([{"params": factors, "lr": factor_lr}, {"params": other, "lr": lr}])
# Reduce the learning rate of both groups once the validation loss has stopped improving for more than 2 epochs.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=2, verbose=True)
for e in range(epochs):
    print('Epoch {}'.format(e))
    train_loss = 0.0
    model.train()
    # The dataset yields (text, audio, vision, label) but the model takes (audio, vision, text).
    for text, audio, vision, label in train_dataloader:
        model.zero_grad()
        output = model(audio, vision, text)
        loss = criterion(output, label)
        loss.backward()
        train_loss += loss.item()
        optimizer.step()
    # Sum of the per-batch mean losses over the epoch (not an average).
    print(train_loss)

    model.eval()
    # Validation needs no gradients: disabling autograd avoids building a graph
    # for the full-size validation batch on every epoch.
    with torch.no_grad():
        for text, audio, vision, label in valid_dataloader:
            output = model(audio, vision, text)
            # valid_dataloader serves the whole split as one batch, so this is the full validation loss.
            valid_loss = criterion(output, label).item()
    scheduler.step(valid_loss)
    print(valid_loss)

Epoch 0
27.718714237213135
0.611574649810791
Epoch 1
24.115648239850998
0.5515691637992859
Epoch 2
21.143627643585205
0.5160359740257263
Epoch 3
19.03038054704666
0.4720185101032257
Epoch 4
16.445132166147232
0.4783540666103363
Epoch 5
14.549745231866837
0.47831130027770996
Epoch 6
13.09028309583664
Epoch     7: reducing learning rate of group 0 to 5.0000e-05.
Epoch     7: reducing learning rate of group 1 to 1.0000e-04.
0.5302578210830688
Epoch 7
10.462743245065212
0.5355609059333801
Epoch 8
8.853222027420998
0.5526226162910461
Epoch 9
7.827011771500111
Epoch    10: reducing learning rate of group 0 to 5.0000e-06.
Epoch    10: reducing learning rate of group 1 to 1.0000e-05.
0.5818965435028076
Epoch 10
7.516419380903244
0.5899335145950317
Epoch 11
7.578511625528336
0.5893776416778564
Epoch 12
7.438332632184029
Epoch    13: reducing learning rate of group 0 to 5.0000e-07.
Epoch    13: reducing learning rate of group 1 to 1.0000e-06.
0.595984160900116
Epoch 13
7.352630756795406
0.594329

In [25]:
# Replace `model` with the adaptive-rank fusion network, reusing the dimension
# and dropout settings from the configuration cell and a single output unit.
model = AdaptiveRankFusion(
    input_dims,
    hidden_dims,
    dropouts,
    output_size=1,
)



In [27]:
# Display the fusion layer's rank parameters as a list with one scalar tensor per entry.
list(model.fusion_layer.rank_param)

[tensor(0.0361, grad_fn=<UnbindBackward>),
 tensor(0.4688, grad_fn=<UnbindBackward>),
 tensor(0.1688, grad_fn=<UnbindBackward>),
 tensor(0.7413, grad_fn=<UnbindBackward>),
 tensor(0.5441, grad_fn=<UnbindBackward>),
 tensor(0.8754, grad_fn=<UnbindBackward>),
 tensor(0.3675, grad_fn=<UnbindBackward>),
 tensor(0.9385, grad_fn=<UnbindBackward>),
 tensor(0.8079, grad_fn=<UnbindBackward>),
 tensor(0.3892, grad_fn=<UnbindBackward>)]

In [28]:
# Train the adaptive-rank fusion model. The three modality sub-networks use lr,
# the fusion layer (which owns rank_param) uses the smaller factor_lr.
criterion = nn.BCEWithLogitsLoss()
factor_lr = 0.0005
lr = 0.001
subnet_params = list(model.audio_subnet.parameters()) + list(model.video_subnet.parameters()) + list(model.text_subnet.parameters())
optimizer = optim.Adam([{"params": subnet_params, "lr": lr}, 
                        {"params": list(model.fusion_layer.parameters()), "lr": factor_lr}])
# Reduce the learning rate of both groups once the validation loss has stopped improving for more than 2 epochs.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=2, verbose=True)
# NOTE: this overrides the notebook-wide `epochs`; re-running an earlier training
# cell afterwards will also use 300 unless the configuration cell is re-run first.
epochs = 300
for e in range(epochs):
    print('Epoch {}'.format(e))
    train_loss = 0.0
    model.train()
    # Log the rank parameters at the start of every epoch to watch how they evolve.
    print(model.fusion_layer.rank_param)
    # The dataset yields (text, audio, vision, label) but the model takes (audio, vision, text).
    for text, audio, vision, label in train_dataloader:
        model.zero_grad()
        output = model(audio, vision, text)
        # Classification loss minus the fusion layer's log-prior, weighted by 0.1.
        loss = criterion(output, label) - 0.1 * model.fusion_layer.get_log_prior()
        loss.backward()
        # The accumulated value includes the prior term, so it is not directly
        # comparable to the pure BCE sums printed by the other training cells.
        train_loss += loss.item()
        optimizer.step()
    print(train_loss)
    model.eval()
    # Validation needs no gradients: disabling autograd avoids building a graph
    # for the full-size validation batch on every epoch.
    with torch.no_grad():
        for text, audio, vision, label in valid_dataloader:
            output = model(audio, vision, text)
            # Validation loss is the plain criterion value, without the prior term.
            valid_loss = criterion(output, label).item()
    print(valid_loss)
    scheduler.step(valid_loss)

Epoch 0
Parameter containing:
tensor([0.0361, 0.4688, 0.1688, 0.7413, 0.5441, 0.8754, 0.3675, 0.9385, 0.8079,
        0.3892], requires_grad=True)
3417.6028175354004
0.6469851136207581
Epoch 1
Parameter containing:
tensor([0.0531, 0.4481, 0.1869, 0.7207, 0.5235, 0.8548, 0.3468, 0.9180, 0.7873,
        0.3685], requires_grad=True)
2008.7422065734863
0.6123189926147461
Epoch 2
Parameter containing:
tensor([0.0626, 0.4270, 0.1957, 0.6998, 0.5025, 0.8340, 0.3255, 0.8972, 0.7665,
        0.3472], requires_grad=True)
1478.4829845428467
0.582213819026947
Epoch 3
Parameter containing:
tensor([0.0689, 0.4053, 0.1962, 0.6786, 0.4809, 0.8129, 0.3034, 0.8761, 0.7453,
        0.3251], requires_grad=True)
1128.4907855987549
0.5360950231552124
Epoch 4
Parameter containing:
tensor([0.0737, 0.3829, 0.1906, 0.6570, 0.4588, 0.7915, 0.2804, 0.8548, 0.7238,
        0.3021], requires_grad=True)
839.8427906036377
0.5471928119659424
Epoch 5
Parameter containing:
tensor([0.0775, 0.3598, 0.1809, 0.6350, 0.4362,

KeyboardInterrupt: 

In [16]:
# Display the fusion layer's rank parameters.
# NOTE(review): this cell's execution count (16) is lower than that of the training
# cell before it (28), so the stored output likely comes from an earlier run --
# re-execute after training to refresh it.
model.fusion_layer.rank_param

Parameter containing:
tensor([0.0560, 0.3947, 0.4498, 0.4022, 0.3426, 0.5294, 0.5075, 0.6149, 0.6144,
        0.5896], requires_grad=True)