Negative log likelihood loss

https://www.sciencedirect.com/topics/computer-science/negative-log-likelihood

Trying to implement the negative log loss for the priors

$p(\theta)=p(\lambda)\prod_n p(U^{(n)}|\lambda)$\
$\ \ \ \ \ \ \ =\prod_j p(\lambda_j) \prod_n \prod_{i,j} \mathcal N(u_{i,j}^{(n)}|0,\lambda_j)$

$\ \ \ -\log p(\theta)$\
$= -\log \left( \prod_j p(\lambda_j) \prod_n \prod_{i,j} \mathcal N(u_{i,j}^{(n)}|0,\lambda_j) \right)$\
$= -\left( \sum_j \log p(\lambda_j) + \sum_n \sum_{i,j} \log \mathcal N(u_{i,j}^{(n)}|0,\lambda_j) \right)$

In [1]:
import torch
import torch.nn as nn
from scipy.stats import halfcauchy, loguniform, norm, truncnorm
import numpy as np

from torch.distributions.half_cauchy import HalfCauchy
from torch.distributions.normal import Normal

import pickle

from datasets import Multimodal_Binary_Dataset

import torch.optim as optim

from torch.autograd import Variable
from torch.utils.data import DataLoader
from model import SubNet, TextSubNet

import time

In [2]:
class Adaptive_CP_Linear(nn.Module):
    
    def __init__(self, input_sizes, output_size, max_rank):
        
        super(Adaptive_CP_Linear, self).__init__()
        
        self.input_sizes = input_sizes
        self.output_size = output_size
        
        shape = input_sizes + (output_size,)
        self.weight = CP(shape, max_rank)
        
    def forward(self, inputs):
        y = 1.0
        for i, x in enumerate(inputs):
            y = y * (x @ self.weight.factors[i])
        y = y @ self.weight.factors[-1].T

        return y

In [27]:
class CP(nn.Module):
    
    def __init__(self, shape, max_rank, prior_type='log_uniform', eta=None):
        
        super(CP, self).__init__()
        
        self.shape = shape
        self.order = len(shape)
        self.max_rank = max_rank
        self.prior_type = prior_type
        
        self.factors = nn.ParameterList([nn.init.xavier_normal_(nn.Parameter(torch.empty(s, max_rank)))
                                         for s in shape])
        
        self.rank_params = nn.Parameter(torch.rand(max_rank))
        
    def log_priors(self):
        
        log_priors = 0.0
        
        rank_params_dist = HalfCauchy(1.0)
        self.rank_params = nn.Parameter(nn.functional.relu(self.rank_params))
        try:
            log_priors += rank_params_dist.log_prob(self.rank_params).sum()
        except ValueError:
            print(self.rank_params)
            
        factors_dist = Normal(0, self.rank_params)
        for f in self.factors:
            try:
                log_priors += factors_dist.log_prob(f).sum()
            except ValueError:
                print(f)
        return log_priors
    '''
    def init_factors(self):
        
        target_stddev = np.sqrt(2/np.prod(self.shape[:-1]))
        factor_stddev = np.power(target_stddev / self.max_rank, 1 / self.order)
        init_dist = truncnorm(a=-3.0*factor_stddev, b=3.0*factor_stddev, 
                              loc=0.0, scale=factor_stddev)
        for s in self.shape:
            self.factors.append(nn.Parameter(torch.tensor(init_dist.rvs((s, self.max_rank)), 
                                                          dtype=torch.float32)))
    '''

In [52]:
def train_cmu_mosi(batch_size=32, epochs=100, lr=.001, max_rank=20):

    # load dataset file
    file = open('../../dataset/cmu-mosi/mosi_20_seq_data.pkl', 'rb')
    data = pickle.load(file)
    file.close()

    # prepare the datasets and data loaders
    train_set = Multimodal_Binary_Dataset(data['train']['text'], data['train']['audio'],
                                  data['train']['vision'], data['train']['labels'])
    valid_set = Multimodal_Binary_Dataset(data['valid']['text'], data['valid']['audio'],
                                  data['valid']['vision'], data['valid']['labels'])

    train_dataloader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
    valid_dataloader = DataLoader(valid_set, batch_size=len(valid_set))

    # set up model
    input_sizes = (train_set[0][0]['audio'].shape[0], train_set[0][0]['vision'].shape[0],
                   train_set[0][0]['text'].shape[1])
    hidden_sizes = (32, 32, 128)
    output_size = 1
    
    model = CP_Tensor_Fusion_Network(input_sizes, hidden_sizes, output_size, max_rank)
    
    # set up training
    DTYPE = torch.FloatTensor
    optimizer = optim.Adam(list(model.parameters()), lr=lr)
    criterion = nn.BCEWithLogitsLoss()
    
    # train and validate
    for e in range(1, epochs + 1):
        # train
        tic = time.time()
        model.train()
        train_loss = 0.0
        train_log_prior = 0.0
        for batch in train_dataloader:
            model.zero_grad()

            features, label = batch
            
            x_a = Variable(features['audio'].float().type(DTYPE), requires_grad=False)
            x_v = Variable(features['vision'].float().type(DTYPE), requires_grad=False)
            x_t = Variable(features['text'].float().type(DTYPE), requires_grad=False)
            y = Variable(label.view(-1, 1).float().type(DTYPE), requires_grad=False)
            
            output = model([x_a, x_v, x_t])
            nll = criterion(output, y)
            '''
            if e > 50:
                log_prior = model.tensor_fusion_layer.weight.log_priors()
                loss = nll - 1e-5 * log_prior
                
                # train_log_prior += log_prior.item()
            else:
                loss = nll
            '''
            loss = nll
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            
        print('Train Loss: {:.4f}'.format(train_loss))
        # print('Train Log Prior: {:.4f}'.format(train_log_prior))
    
        # validate
        model.eval()
        for batch in valid_dataloader:
            features, label = batch
            x_a = Variable(features['audio'].float().type(DTYPE), requires_grad=False)
            x_v = Variable(features['vision'].float().type(DTYPE), requires_grad=False)
            x_t = Variable(features['text'].float().type(DTYPE), requires_grad=False)
            y = Variable(label.view(-1, 1).float().type(DTYPE), requires_grad=False)

            output = model([x_a, x_v, x_t])

        valid_mse = nn.functional.binary_cross_entropy_with_logits(y, output).item()
        print("Valid MSE {:.3f}".format(valid_mse))

In [53]:
class CP_Tensor_Fusion_Network(nn.Module):

    def __init__(self, input_sizes, hidden_sizes, output_size, max_rank):

        super(CP_Tensor_Fusion_Network, self).__init__()

        self.input_sizes = input_sizes
        self.hidden_sizes = hidden_sizes
        self.output_size = output_size
        self.max_rank = max_rank
        
        self.audio_subnet = SubNet(input_sizes[0], hidden_sizes[0], dropout=0.3)
        self.video_subnet = SubNet(input_sizes[1], hidden_sizes[1], dropout=0.3)
        self.text_subnet = TextSubNet(input_sizes[2], hidden_sizes[2], hidden_sizes[2], dropout=0.3)
        
        tensor_input_sizes = (hidden_sizes[0] + 1, hidden_sizes[1] + 1, hidden_sizes[2] + 1)
        self.tensor_fusion_layer = Adaptive_CP_Linear(tensor_input_sizes, output_size, max_rank)
        
    def forward(self, inputs):
        
        # subnet outputs
        z_audio = self.audio_subnet(inputs[0])
        z_video = self.video_subnet(inputs[1])
        z_text = self.text_subnet(inputs[2])

        batch_size = z_audio.data.shape[0]

        if z_audio.is_cuda:
            DTYPE = torch.cuda.FloatTensor
        else:
            DTYPE = torch.FloatTensor

        # 1 in concatenated to each subnet outputs
        z_audio = torch.cat((Variable(torch.ones(batch_size, 1).type(DTYPE), requires_grad=False), z_audio), dim=1)
        z_video = torch.cat((Variable(torch.ones(batch_size, 1).type(DTYPE), requires_grad=False), z_video), dim=1)
        z_text = torch.cat((Variable(torch.ones(batch_size, 1).type(DTYPE), requires_grad=False), z_text), dim=1)

        output = self.tensor_fusion_layer([z_audio, z_video, z_text])
        
        return output

In [54]:
train_cmu_mosi()

Train Loss: 27.3724
Valid MSE 0.467
Train Loss: 22.4535
Valid MSE -0.614
Train Loss: 23.2774
Valid MSE 0.527
Train Loss: 19.3084
Valid MSE 0.266
Train Loss: 16.5892
Valid MSE 0.187
Train Loss: 14.3519
Valid MSE 0.236
Train Loss: 12.8767
Valid MSE -0.101
Train Loss: 9.7427
Valid MSE -0.534
Train Loss: 7.9125
Valid MSE -0.184
Train Loss: 8.7193
Valid MSE -2.026
Train Loss: 5.9413
Valid MSE -1.428
Train Loss: 4.5535
Valid MSE -1.079
Train Loss: 4.5200
Valid MSE -2.291
Train Loss: 7.5126
Valid MSE -1.829
Train Loss: 2.9138
Valid MSE -3.196
Train Loss: 2.1087
Valid MSE -2.633
Train Loss: 2.2143
Valid MSE -4.069
Train Loss: 6.9933
Valid MSE -0.731
Train Loss: 2.6148
Valid MSE -2.212
Train Loss: 1.4621
Valid MSE -2.059
Train Loss: 1.1178
Valid MSE -2.771
Train Loss: 0.5928
Valid MSE -4.154
Train Loss: 0.9167
Valid MSE -3.427
Train Loss: 0.4984
Valid MSE -3.373
Train Loss: 0.4403
Valid MSE -4.142
Train Loss: 1.0762
Valid MSE -3.803
Train Loss: 1.2571
Valid MSE -3.565
Train Loss: 0.7272
Valid M

KeyboardInterrupt: 