In [1]:
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
import torch
from tensor_fusion.fusion_model import TFN, LMF, AdaptiveRankFusion
from tensor_fusion.dataset import get_cmu_mosi_dataset
from tensor_fusion.model import AdaptiveRankFactorizedTextSubNet, SubNet
from tensor_fusion.fusion_layer import AdaptiveRankFusionLayer



In [2]:
# Run configuration shared by every cell below.
# Prefer the GPU, but fall back to the CPU so the notebook still runs on a
# machine without CUDA instead of failing while loading the dataset.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
DTYPE = torch.float32

# Seed torch's global RNG so repeated "Restart & Run All" executions start from
# the same random state (CUDA kernels may still be non-deterministic).
SEED = 0
torch.manual_seed(SEED)

# Binary-label splits; tensors are created directly on `device` with dtype `DTYPE`.
train_set, valid_set, test_set = get_cmu_mosi_dataset(binary=True, device=device, dtype=DTYPE)

In [3]:
class AdaptiveRankFusion_with_AdaptiveRankFactorizedTextSubNet(nn.Module):
    '''
    Audio/video/text model: two `SubNet`s and one
    `AdaptiveRankFactorizedTextSubNet` produce per-modality embeddings, a
    constant 1 is appended to each, and an `AdaptiveRankFusionLayer` fuses them.
    '''

    def __init__(self, input_sizes, hidden_sizes, dropouts, output_size, max_rank=10,
                 prior_type='half_cauchy', eta=None,
                 device=None, dtype=None):
        '''
        args:
            input_sizes: a tuple of ints, (audio_in, video_in, text_in)
            hidden_sizes: a tuple of ints, (audio_hidden, video_hidden, text_hidden)
            dropouts: a tuple of floats, (audio_dropout, video_dropout, text_dropout, post_fusion_dropout);
                      the last entry is always used for the post-fusion dropout
            output_size: an int, output size for fusion layer
            max_rank: an int, maximum rank for the CP decomposition
            prior_type: prior name, forwarded to BOTH the text subnet and the fusion layer
            eta: forwarded unchanged to the fusion layer; also forwarded to the text subnet,
                 which falls back to 0.01 when eta is None
            device, dtype: forwarded to every sub-module
        '''
        super().__init__()

        # Third positional argument of the text subnet; the fusion layer below is
        # sized with the same value, so it is presumably the text embedding width.
        text_out = hidden_sizes[2] // 2

        # Previously the text subnet was always built with 'half_cauchy' / 0.01 and
        # silently ignored the constructor arguments. Forward them, keeping 0.01 as
        # the fallback so the default call (eta=None) builds the same text subnet.
        text_eta = 0.01 if eta is None else eta

        # define the pre-fusion subnetworks
        self.audio_subnet = SubNet(input_sizes[0], hidden_sizes[0], dropouts[0], device=device, dtype=dtype)
        self.video_subnet = SubNet(input_sizes[1], hidden_sizes[1], dropouts[1], device=device, dtype=dtype)
        self.text_subnet = AdaptiveRankFactorizedTextSubNet(input_sizes[2], hidden_sizes[2], text_out, dropout=dropouts[2],
                                                            prior_type=prior_type, eta=text_eta,
                                                            device=device, dtype=dtype)

        # +1 on every modality: forward() appends a constant-1 column to each embedding
        fusion_input_sizes = (hidden_sizes[0] + 1, hidden_sizes[1] + 1, text_out + 1)
        # define fusion layer
        self.fusion_layer = AdaptiveRankFusionLayer(input_sizes=fusion_input_sizes,
                                                    output_size=output_size,
                                                    max_rank=max_rank,
                                                    prior_type=prior_type,
                                                    eta=eta,
                                                    device=device,
                                                    dtype=dtype)

        self.post_fusion_dropout = nn.Dropout(dropouts[-1])

    def forward(self, audio_x, video_x, text_x):
        '''
        args:
            audio_x, video_x, text_x: per-modality inputs, passed to the matching subnet
        returns:
            the fusion layer's output after post-fusion dropout
        '''
        audio_h = self.audio_subnet(audio_x)
        video_h = self.video_subnet(video_x)
        text_h = self.text_subnet(text_x)

        # One column of ones, built from the audio embedding's batch size, device
        # and dtype, is appended to all three embeddings (torch.cat copies it).
        ones = torch.ones((audio_h.shape[0], 1), device=audio_h.device, dtype=audio_h.dtype)
        audio_h = torch.cat((audio_h, ones), dim=1)
        video_h = torch.cat((video_h, ones), dim=1)
        text_h = torch.cat((text_h, ones), dim=1)

        output = self.fusion_layer([audio_h, video_h, text_h])
        # NOTE(review): dropout is applied to the fusion layer's output itself, which
        # is what forward() returns — confirm this is intended.
        output = self.post_fusion_dropout(output)

        return output

In [4]:
# Hyper-parameters for the adaptive-rank fusion model.
epochs = 50
input_dims = (5, 20, 300)
hidden_dims = (4, 16, 128)
dropouts = (0.3, 0.3, 0.3, 0.3)
output_dim = 1
max_rank = 10

# Not consumed by this model (it derives the text width as hidden_dims[2] // 2 and
# takes `max_rank` rather than `rank`); kept because they were defined here before.
text_out = 64
rank = 4

# Use the shared `device` / `DTYPE` / `output_dim` settings instead of repeating
# the literals 'cuda', torch.float32 and 1, matching the other model cells.
model = AdaptiveRankFusion_with_AdaptiveRankFactorizedTextSubNet(
    input_dims,
    hidden_dims,
    dropouts,
    output_size=output_dim,
    max_rank=max_rank,
    prior_type='half_cauchy',
    eta=0.01,
    device=device,
    dtype=DTYPE,
)

In [5]:
# Mini-batches of 32, reshuffled every epoch, for training.
batch_size = 32
train_dataloader = DataLoader(dataset=train_set, batch_size=batch_size, shuffle=True)

# Each held-out split is served as one batch containing the whole split.
valid_dataloader, test_dataloader = (
    DataLoader(dataset=split, batch_size=len(split))
    for split in (valid_set, test_set)
)

In [6]:
# Train the adaptive-rank fusion model (`model`, `train_dataloader` and
# `valid_dataloader` come from the cells above).
criterion = nn.BCEWithLogitsLoss()

# Separate learning rates: `lr` for the three modality subnets, `factor_lr` for
# the fusion layer's parameters.
factor_lr = 0.0005
lr = 0.001
# Weight of the fusion layer's log-prior term in the training objective.
prior_weight = 0.0001

subnet_params = list(model.audio_subnet.parameters()) + list(model.video_subnet.parameters()) + list(model.text_subnet.parameters())
optimizer = optim.Adam([{"params": subnet_params, "lr": lr},
                        {"params": list(model.fusion_layer.parameters()), "lr": factor_lr}])
# NOTE(review): `verbose` is reported as deprecated in recent PyTorch releases —
# confirm against the installed version before upgrading.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=2, verbose=True)

epochs = 300
for e in range(epochs):
    print('Epoch {}'.format(e))
    train_loss = 0.0
    model.train()
    # Print the rank parameter at the start of every epoch.
    print(model.fusion_layer.weight_tensor.rank_parameter)
    # Batches arrive as (text, audio, vision, label); the model expects (audio, vision, text).
    for text, audio, vision, label in train_dataloader:
        model.zero_grad()
        output = model(audio, vision, text)
        # Data loss minus the weighted log-prior of the fusion layer.
        loss = criterion(output, label) - prior_weight * model.fusion_layer.get_log_prior()
        loss.backward()
        train_loss += loss.item()
        optimizer.step()
    # Sum (not mean) of the per-batch training objective over the epoch.
    print(train_loss)

    model.eval()
    # No gradients are needed for validation; skipping autograd avoids building a
    # graph for the whole validation set, which is served as a single batch.
    with torch.no_grad():
        for text, audio, vision, label in valid_dataloader:
            output = model(audio, vision, text)
            # Validation uses the plain criterion, without the prior term.
            valid_loss = criterion(output, label).item()
    print(valid_loss)
    scheduler.step(valid_loss)

Epoch 0
Parameter containing:
tensor([[0.2673, 0.2653, 0.2732, 0.2667, 0.2371, 0.2714, 0.2846, 0.2828, 0.2606,
         0.3198]], device='cuda:0', requires_grad=True)
28.854748487472534
0.6867510080337524
Epoch 1
Parameter containing:
tensor([[0.2445, 0.2425, 0.2506, 0.2438, 0.2144, 0.2488, 0.2620, 0.2601, 0.2380,
         0.2971]], device='cuda:0', requires_grad=True)
28.52066946029663
0.68100905418396
Epoch 2
Parameter containing:
tensor([[0.2151, 0.2130, 0.2211, 0.2143, 0.1850, 0.2194, 0.2332, 0.2316, 0.2086,
         0.2675]], device='cuda:0', requires_grad=True)
28.019900143146515
0.6663017272949219
Epoch 3
Parameter containing:
tensor([[0.1841, 0.1818, 0.1899, 0.1832, 0.1541, 0.1884, 0.2038, 0.2053, 0.1775,
         0.2364]], device='cuda:0', requires_grad=True)
27.23625671863556
0.6478971838951111
Epoch 4
Parameter containing:
tensor([[0.1530, 0.1502, 0.1582, 0.1516, 0.1230, 0.1575, 0.1771, 0.1889, 0.1460,
         0.2048]], device='cuda:0', requires_grad=True)
26.69122970104217

KeyboardInterrupt: 

In [4]:
# Sanity check: show the device of the first factor of the fusion layer's weight tensor.
# NOTE(review): this cell's execution count is out of order with its neighbours and it
# depends on whichever `model` was last bound in the kernel — confirm it still belongs here.
model.fusion_layer.weight_tensor.factors[0].device

device(type='cuda', index=0)

In [6]:
# Loop settings for the TFN baseline.
batch_size, epochs = 32, 50

# settings from https://github.com/Justin1904/TensorFusionNetworks/blob/master/train.py
input_dims, hidden_dims = (5, 20, 300), (4, 16, 128)
text_out = 64
post_fusion_dim = 32
dropouts = (0.3,) * 4

# Rebind `model` to a TFN built on the shared device / dtype.
model = TFN(
    input_dims,
    hidden_dims,
    text_out,
    dropouts,
    post_fusion_dim,
    device=device,
    dtype=DTYPE,
)

In [7]:
# Train the TFN baseline (`model`, `train_dataloader` and `valid_dataloader`
# come from earlier cells).
epochs = 30
criterion = nn.BCEWithLogitsLoss()
# The first two entries of model.parameters() are excluded from optimisation —
# presumably fixed, non-trainable TFN parameters; TODO confirm against the TFN class.
optimizer = optim.Adam(list(model.parameters())[2:])
# NOTE(review): `verbose` is reported as deprecated in recent PyTorch releases —
# confirm against the installed version before upgrading.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=3, verbose=True)
for e in range(epochs):
    print('Epoch {}'.format(e))
    train_loss = 0.0
    model.train()
    # Batches arrive as (text, audio, vision, label); the model is called with (audio, vision, text).
    for text, audio, vision, label in train_dataloader:
        model.zero_grad()
        output = model(audio, vision, text)
        loss = criterion(output, label)
        loss.backward()
        train_loss += loss.item()
        optimizer.step()
    # Sum (not mean) of the per-batch losses over the epoch.
    print(train_loss)

    model.eval()
    # No gradients are needed for validation; skipping autograd avoids building a
    # graph for the whole validation set, which is served as a single batch.
    with torch.no_grad():
        for text, audio, vision, label in valid_dataloader:
            output = model(audio, vision, text)
            valid_loss = criterion(output, label).item()
    scheduler.step(valid_loss)
    print(valid_loss)

Epoch 0
27.746791183948517
0.5983000993728638
Epoch 1
22.599841356277466
0.5741891860961914
Epoch 2
20.9776431620121
0.5182520747184753
Epoch 3
18.447107419371605
0.5316178798675537
Epoch 4
16.16508023440838
0.5781940817832947
Epoch 5
13.750149548053741
0.5866302847862244
Epoch 6
12.41133339703083
Epoch     7: reducing learning rate of group 0 to 1.0000e-04.
0.5868079662322998
Epoch 7
9.790406368672848
0.6064910292625427
Epoch 8
9.155158430337906
0.6138084530830383
Epoch 9
8.670896269381046
0.6321585178375244
Epoch 10
8.77745607495308
Epoch    11: reducing learning rate of group 0 to 1.0000e-05.
0.6368790864944458
Epoch 11
8.624453119933605
0.6399129629135132
Epoch 12
8.29000923037529
0.6426395177841187
Epoch 13
8.115225300192833
0.6466211080551147
Epoch 14
7.943665310740471
Epoch    15: reducing learning rate of group 0 to 1.0000e-06.
0.64751797914505
Epoch 15
7.860414206981659
0.6442150473594666
Epoch 16
8.334999352693558
0.6477405428886414
Epoch 17
7.960166782140732
0.64896446466445

In [7]:
# Loop settings for the LMF baseline.
batch_size, epochs = 32, 50

# settings from https://github.com/Justin1904/TensorFusionNetworks/blob/master/train.py
input_dims, hidden_dims = (5, 20, 300), (4, 16, 128)
text_out = 64
dropouts = (0.3,) * 4
rank, output_dim = 4, 1

# Rebind `model` to an LMF built on the shared device / dtype.
model = LMF(
    input_dims,
    hidden_dims,
    text_out,
    dropouts,
    output_dim,
    rank,
    use_softmax=False,
    device=device,
    dtype=DTYPE,
)

In [8]:
# Train the LMF baseline (`model`, `epochs`, `train_dataloader` and
# `valid_dataloader` come from earlier cells).
criterion = nn.BCEWithLogitsLoss()

# Materialise the parameter list once and split it: the first three entries get
# `factor_lr`, everything else gets `lr`.
# presumably the first three parameters are the LMF factor tensors — TODO confirm.
all_params = list(model.parameters())
factors = all_params[:3]
other = all_params[3:]
factor_lr = 0.0005
lr = 0.001
optimizer = optim.Adam([{"params": factors, "lr": factor_lr}, {"params": other, "lr": lr}])
# NOTE(review): `verbose` is reported as deprecated in recent PyTorch releases —
# confirm against the installed version before upgrading.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=5, verbose=True)
for e in range(epochs):
    print('Epoch {}'.format(e))
    train_loss = 0.0
    model.train()
    # Batches arrive as (text, audio, vision, label); the model is called with (audio, vision, text).
    for text, audio, vision, label in train_dataloader:
        model.zero_grad()
        output = model(audio, vision, text)
        loss = criterion(output, label)
        loss.backward()
        train_loss += loss.item()
        optimizer.step()
    # Sum (not mean) of the per-batch losses over the epoch.
    print(train_loss)

    model.eval()
    # No gradients are needed for validation; skipping autograd avoids building a
    # graph for the whole validation set, which is served as a single batch.
    with torch.no_grad():
        for text, audio, vision, label in valid_dataloader:
            output = model(audio, vision, text)
            valid_loss = criterion(output, label).item()
    scheduler.step(valid_loss)
    print(valid_loss)

Epoch 0
27.707884550094604
0.6103724837303162
Epoch 1
25.751307010650635
0.5993456244468689
Epoch 2
21.946386218070984
0.506530225276947
Epoch 3
19.738655865192413
0.5119958519935608
Epoch 4
17.642503082752228
0.4785168170928955
Epoch 5
16.091687828302383
0.5398954153060913
Epoch 6
14.24790771305561
0.5091980695724487
Epoch 7
11.752081230282784
0.49786096811294556
Epoch 8
10.936970323324203
0.6509965658187866
Epoch 9
8.470754906535149
0.7462157011032104
Epoch 10
6.700329817831516
Epoch    11: reducing learning rate of group 0 to 5.0000e-05.
Epoch    11: reducing learning rate of group 1 to 1.0000e-04.
0.6205970048904419
Epoch 11
4.487218387424946
0.656859278678894
Epoch 12
4.011319886893034
0.7138518691062927
Epoch 13
2.7622850690968335
0.7538882493972778
Epoch 14
2.608662152197212
0.8387965559959412
Epoch 15
2.2656643863301724
0.8688085675239563
Epoch 16
1.9815785735845566
Epoch    17: reducing learning rate of group 0 to 5.0000e-06.
Epoch    17: reducing learning rate of group 1 to 1

In [11]:
# Sanity check: show the dtype of the first factor of the fusion layer's weight tensor.
# NOTE(review): in file order `model` was last rebound to the LMF baseline; this cell
# depends on whichever `model` was live in the kernel when it ran — confirm.
model.fusion_layer.weight_tensor.factors[0].dtype

torch.float32

In [27]:
# Unpack the fusion layer's `rank_param` into a list of per-component scalars.
# NOTE(review): the training cell reads `fusion_layer.weight_tensor.rank_parameter`, not
# `fusion_layer.rank_param`, and the recorded output shows no CUDA device — presumably
# this ran against a different model object/session; confirm or remove.
list(model.fusion_layer.rank_param)

[tensor(0.0361, grad_fn=<UnbindBackward>),
 tensor(0.4688, grad_fn=<UnbindBackward>),
 tensor(0.1688, grad_fn=<UnbindBackward>),
 tensor(0.7413, grad_fn=<UnbindBackward>),
 tensor(0.5441, grad_fn=<UnbindBackward>),
 tensor(0.8754, grad_fn=<UnbindBackward>),
 tensor(0.3675, grad_fn=<UnbindBackward>),
 tensor(0.9385, grad_fn=<UnbindBackward>),
 tensor(0.8079, grad_fn=<UnbindBackward>),
 tensor(0.3892, grad_fn=<UnbindBackward>)]

In [16]:
# Display the fusion layer's `rank_param` parameter.
# NOTE(review): execution count is out of order relative to the previous cell, and the
# attribute name differs from the `weight_tensor.rank_parameter` used in the training
# cell — presumably left over from an earlier session; confirm or remove.
model.fusion_layer.rank_param

Parameter containing:
tensor([0.0560, 0.3947, 0.4498, 0.4022, 0.3426, 0.5294, 0.5075, 0.6149, 0.6144,
        0.5896], requires_grad=True)