# Order matters 
Modifying code from http://nlp.seas.harvard.edu/2018/04/03/attention.html to implement the architecture from https://arxiv.org/pdf/1511.06391.pdf

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import math, copy, time
from torch.autograd import Variable
from torch.nn.utils.rnn import pad_sequence
#import matplotlib.pyplot as plt
#import seaborn
#seaborn.set_context(context="talk")
#%matplotlib inline

import sys
sys.path.append('../scripts')
#from order_matters import Read, Process, Write, ReadProcessWrite

In [2]:
# Usual imports
import time
import math
import numpy as np
import os
#import matplotlib.pyplot as plt
import argparse
import pickle
from glob import glob
import random

#Torch
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils import data
from torch.backends import cudnn
from torch.optim import Adam
from torch.utils.data.dataloader import default_collate

#tensorboard
from tensorboardX import SummaryWriter

#my modules
from dataset import DigitsDataset, WordsDataset, VideosDataset


In [3]:
class ReadLinear(nn.Module):
    """
    Read block of the Order Matters architecture: a small MLP applied
    independently to every element of the input set.

    Each layer computes ``ReLU6(W @ x + b)`` with an explicit weight matrix and
    bias vector, applied to one set element at a time.

    Parameters
    ----------
    hidden_dims: list of int
        Output size of each successive MLP layer.
    input_dim: int
        Feature size of a single set element. Defaults to 1.
    """
    def __init__(self, hidden_dims, input_dim=1):
        super(ReadLinear, self).__init__()
        self.dims = [input_dim] + list(hidden_dims)
        n_layers = len(self.dims) - 1
        # ParameterList rather than a plain Python list: only registered
        # parameters show up in .parameters() / .state_dict() and follow .to(),
        # i.e. only registered parameters get trained and checkpointed.
        # NOTE(review): this adds 'Ws.<i>' / 'bs.<i>' keys to the state_dict, so a
        # strict load of a checkpoint saved before this change will report them
        # as missing (those checkpoints never stored the reader weights).
        self.Ws = nn.ParameterList(
            [nn.Parameter(torch.randn(self.dims[i+1], self.dims[i])) for i in range(n_layers)]
        )
        self.bs = nn.ParameterList(
            [nn.Parameter(torch.randn(self.dims[i+1])) for i in range(n_layers)]
        )
        self.nonlinearity = nn.ReLU6()

        # Keep the historical behaviour of placing the weights on the current
        # CUDA device at construction time when one is available.
        if torch.cuda.is_available():
            self.to(f'cuda:{torch.cuda.current_device()}')

    def forward(self, x, n_layers=1):
        """
        Encode every element of every set in the batch.

        Parameters
        ----------
        x: Tensor
            Either 4-D ``(batch, pooled_axis, set_length, input_dim)``, in which
            case the second axis is reduced with an element-wise max before the
            MLP, or 3-D ``(batch, input_dim, set_length)``.
        n_layers: int
            Unused; kept for backward compatibility of the signature.

        Returns
        -------
        Tensor of shape ``(batch, hidden_dims[-1], set_length)``.
        """
        if x.dim() == 4:
            x = x.permute(0, 2, 1, 3).unsqueeze(-1)  # (batch, set_length, pooled_axis, input_dim, 1)
            # reducing by using max
            x = torch.max(x, dim=2)[0]               # (batch, set_length, input_dim, 1)
        elif x.dim() == 3:
            x = x.permute(0, 2, 1).unsqueeze(-1)     # (batch, set_length, input_dim, 1)
        else:
            raise ValueError(f'ReadLinear expects a 3-D or 4-D input, got {x.dim()}-D')

        for W, b in zip(self.Ws, self.bs):
            # W is (out, in) and broadcasts over (batch, set_length);
            # b becomes (out, 1) so it lines up with the trailing column axis.
            x = self.nonlinearity(torch.matmul(W, x) + b.unsqueeze(-1))  # (batch, set_length, out, 1)

        return x.squeeze(-1).permute(0, 2, 1)  # (batch, hidden_dim, set_length)

In [4]:
class ReadWordEncoder(nn.Module):
    """
    Read block of the Order Matters architecture for sets of words: every word
    (a sequence of per-character vectors) is encoded by a single-layer LSTM and
    represented by the LSTM's final hidden state.

    Parameters
    ----------
    hidden_dims: list of int
        Only ``hidden_dims[-1]`` is used: it is the hidden size of the LSTM and
        therefore the size of each word embedding.
    input_size: int
        Size of one character vector (character-level vocabulary size).
        Defaults to 26.
    """

    def __init__(self, hidden_dims, input_size=26):
        super(ReadWordEncoder, self).__init__()

        self.dims = [input_size] + list(hidden_dims)
        # A single LSTM layer does the encoding. The former per-layer list of
        # LSTMs was never registered as a sub-module nor used by forward(), so
        # it has been removed.
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_dims[-1], num_layers=1, batch_first=True)

    def forward(self, x):
        """
        Parameters
        ----------
        x: Tensor of shape ``(batch_size, n_set, max_word_length, vocab_size)``.

        Returns
        -------
        Tensor of shape ``(batch_size, hidden_dim, n_set)`` holding the final
        LSTM hidden state of every word.
        """
        batch_size, n_set = x.size(0), x.size(1)
        # Words are encoded independently of each other, so the batch and set
        # axes can be folded together and run through the LSTM in one call
        # instead of looping over the batch in Python.
        words = x.reshape(batch_size * n_set, x.size(2), x.size(3))
        _, (h_n, _) = self.lstm(words)                 # h_n: (1, batch_size * n_set, hidden_dim)
        word_codes = h_n[0].reshape(batch_size, n_set, -1)
        return word_codes.permute(0, 2, 1)             # (batch_size, hidden_dim, n_set)

In [5]:
class ReadVideoEncoder(nn.Module):
    """
    Read block of the Order Matters architecture: a small MLP applied
    independently to every element of the input set.

    Each layer computes ``ReLU6(W @ x + b)`` with an explicit weight matrix and
    bias vector, applied to one set element at a time.

    Parameters
    ----------
    hidden_dims: list of int
        Output size of each successive MLP layer.
    input_dim: int
        Feature size of a single set element. Defaults to 1.
    """
    def __init__(self, hidden_dims, input_dim=1):
        # Must name this class: super(ReadLinear, self) raises TypeError because
        # ReadVideoEncoder is not a subclass of ReadLinear.
        super(ReadVideoEncoder, self).__init__()
        self.dims = [input_dim] + list(hidden_dims)
        n_layers = len(self.dims) - 1
        # ParameterList rather than a plain Python list so the weights are
        # registered: visible to .parameters(), .state_dict() and .to().
        self.Ws = nn.ParameterList(
            [nn.Parameter(torch.randn(self.dims[i+1], self.dims[i])) for i in range(n_layers)]
        )
        self.bs = nn.ParameterList(
            [nn.Parameter(torch.randn(self.dims[i+1])) for i in range(n_layers)]
        )
        self.nonlinearity = nn.ReLU6()

        # Keep the historical behaviour of placing the weights on the current
        # CUDA device at construction time when one is available.
        if torch.cuda.is_available():
            self.to(f'cuda:{torch.cuda.current_device()}')

    def forward(self, x, n_layers=1):
        """
        Encode every element of every set in the batch.

        Parameters
        ----------
        x: Tensor of shape ``(batch, input_dim, set_length)``.
        n_layers: int
            Unused; kept for backward compatibility of the signature.

        Returns
        -------
        Tensor of shape ``(batch, hidden_dims[-1], set_length)``.
        """
        x = x.permute(0, 2, 1).unsqueeze(-1)  # (batch, set_length, input_dim, 1)
        for W, b in zip(self.Ws, self.bs):
            # W is (out, in) and broadcasts over (batch, set_length);
            # b becomes (out, 1) so it lines up with the trailing column axis.
            x = self.nonlinearity(torch.matmul(W, x) + b.unsqueeze(-1))  # (batch, set_length, out, 1)

        return x.squeeze(-1).permute(0, 2, 1)  # (batch, hidden_dim, set_length)

In [6]:
class Process(nn.Module):
    """
    Process block of the Order Matters architecture.

    An LSTM cell is run for ``lstm_steps`` steps with no external input: at each
    step its hidden state is used as a query for a scaled dot-product attention
    over the memories, and the attention read-out becomes the cell's next input.
    """
    def __init__(self, input_dim, hidden_dim, lstm_steps, batch_size):
        """
        Parameters
        ----------
        input_dim: size of the LSTM cell input, i.e. of the attention read-out
        hidden_dim: size of the LSTM cell state; forward() takes the dot product
            of the hidden state with the memories, so it has to match the
            memories' feature size
        lstm_steps: number of processing steps
        batch_size: stored on the module; forward() reads the batch size from
            its input instead
        """
        super(Process, self).__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.lstm_steps = lstm_steps
        self.batch_size = batch_size
        self.lstmcell = nn.LSTMCell(self.input_dim, self.hidden_dim, bias=True)
        ##QUESTION: Should these be initialized to the same value for each member of the batch ?
        ### TODO: look into how to initialize LSTM state/output
        # Fixed (non-trainable) all-zero initial input, hidden state and cell state.
        self.i0 = nn.Parameter(torch.zeros(self.input_dim), requires_grad=False)
        self.h_0 = nn.Parameter(torch.zeros(self.hidden_dim), requires_grad=False)
        self.c_0 = nn.Parameter(torch.zeros(self.hidden_dim), requires_grad=False)

    def forward(self, M, mask=None, dropout=None):
        """
        Parameters
        ----------
        M: the memories tensor of shape (batch size, hidden_dim, set_length)
        mask: optional tensor; positions where ``mask == 0`` are excluded from
            the attention. It must broadcast against the score tensor of shape
            (batch size, set_length, 1).
        dropout: optional callable applied to the attention weights

        Returns
        -------
        (r_t, c_t): the last attention read-out, shape (batch size, hidden_dim),
        and the last LSTM cell state, shape (batch size, hidden_dim). With
        ``lstm_steps == 0`` these are the all-zero initial values.
        """
        # Read the batch size from M to account for a last batch that might be
        # smaller than the rest.
        batch_size = M.size(0)
        r_t_1 = self.i0.unsqueeze(0).expand(batch_size, -1)
        h_t_1 = self.h_0.unsqueeze(0).expand(batch_size, -1)
        c_t_1 = self.c_0.unsqueeze(0).expand(batch_size, -1)

        for _ in range(self.lstm_steps):
            h_t, c_t = self.lstmcell(r_t_1, (h_t_1, c_t_1))
            d_k = h_t.size(-1)

            # One score per memory: (batch, set_length, hidden) @ (batch, hidden, 1)
            # -> (batch, set_length, 1). Scaled so the dot product does not grow
            # with the embedding dimension.
            scores = torch.matmul(M.transpose(-2, -1), h_t.unsqueeze(2)) \
                         / math.sqrt(d_k)

            if mask is not None:
                scores = scores.masked_fill(mask == 0, -1e9)
            # Normalise over the set axis (dim=1). The last axis has size 1, so a
            # softmax over it would turn every weight into 1 and the read-out
            # into a plain sum of the memories.
            p_attn = F.softmax(scores, dim=1)
            if dropout is not None:
                p_attn = dropout(p_attn)
            r_t_1 = torch.matmul(M, p_attn).squeeze(-1)  # (batch, hidden_dim)
            h_t_1 = h_t
            c_t_1 = c_t

        return (r_t_1, c_t_1)

In [7]:
class Attention(nn.Module):
    """
    Attention model for Pointer-Net taken from https://github.com/shirgur/PointerNet/blob/master/PointerNet.py
    """

    def __init__(self, ctx_dim,
                 hidden_dim):
        """
        Initiate Attention
        :param int ctx_dim: Feature dimension of the attention context
        :param int hidden_dim: Number of hidden units in the attention
        """

        super(Attention, self).__init__()

        self.ctx_dim = ctx_dim
        self.hidden_dim = hidden_dim

        self.input_linear = nn.Linear(hidden_dim, hidden_dim)
        self.context_linear = nn.Conv1d(ctx_dim, hidden_dim, 1, 1)
        self.V = nn.Parameter(torch.empty(hidden_dim), requires_grad=True)
        # Kept (with init_inf below) for backward compatibility; forward() no
        # longer reads it.
        self._inf = nn.Parameter(torch.FloatTensor([float('-inf')]), requires_grad=False)
        self.tanh = nn.Tanh()
        self.softmax = nn.Softmax(dim=1)

        # Initialize vector V
        nn.init.uniform_(self.V, -1, 1)

    def forward(self, input,
                context,
                mask):
        """
        Attention - Forward-pass
        :param Tensor input: Query state h #(batch_size, hidden_dim)
        :param Tensor context: Attention context #(batch_size, ctx_dim, seq_len)
        :param Tensor mask: Selection mask, non-zero/True where a position must
            NOT be attended to #(batch_size, seq_len)

        :return: tuple of - (Attentioned hidden state #(batch_size, hidden_dim),
            Alphas #(batch_size, seq_len))
        """

        # (batch, hidden_dim) -> (batch, hidden_dim, seq_len): the projected query
        # is broadcast along the sequence axis.
        inp = self.input_linear(input).unsqueeze(2).expand(-1, -1, context.size(-1))

        # ctx is of shape (batch, hidden_dim, seq_len)
        ctx = self.context_linear(context)

        # V is of shape (batch, 1, hidden_dim)
        V = self.V.unsqueeze(0).expand(context.size(0), -1).unsqueeze(1)

        # att is of shape (batch, seq_len)
        att = torch.bmm(V, self.tanh(inp + ctx)).squeeze(1)
        # Masked positions get a score of -inf, hence a weight of exactly 0 after
        # the softmax. Done out of place and without self.inf, so forward() works
        # whether or not init_inf() was called beforehand.
        att = att.masked_fill(mask != 0, float('-inf'))

        alpha = self.softmax(att)

        hidden_state = torch.bmm(ctx, alpha.unsqueeze(2)).squeeze(2)

        return hidden_state, alpha

    def init_inf(self, mask_size):
        """Build a ``-inf`` tensor of shape ``mask_size``; kept for existing callers."""
        self.inf = self._inf.unsqueeze(1).expand(*mask_size)


In [8]:
class Write(nn.Module):
    """
    A Write block from the Order Matters architecture: a pointer-network
    decoder that emits one pointer over the input set per step and never
    selects the same element twice.
    """

    def __init__(self, embedding_dim,
                 hidden_dim):
        """
        Initiate Decoder
        :param int embedding_dim: Feature size of one embedded set element
        :param int hidden_dim: Number of hidden units for the decoder's RNN
        """

        super(Write, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim

        # input_to_hidden / hidden_to_hidden are not used by forward(), which
        # runs on self.lstmcell. They stay registered so that parameter names
        # and state_dict keys do not change.
        self.input_to_hidden = nn.Linear(embedding_dim, 4 * hidden_dim)
        self.hidden_to_hidden = nn.Linear(hidden_dim, 4 * hidden_dim)
        self.hidden_out = nn.Linear(hidden_dim * 2, hidden_dim)
        self.att = Attention(embedding_dim, hidden_dim)

        # Used for propagating .cuda() command
        self.mask = nn.Parameter(torch.ones(1), requires_grad=False)
        self.runner = nn.Parameter(torch.zeros(1), requires_grad=False)
        self.lstmcell = nn.LSTMCell(embedding_dim, hidden_dim, bias=True)

    def _step(self, x, hidden, context, mask):
        """
        One decoding step: LSTM cell update followed by attention over the context.

        :param Tensor x: Input at time t #(batch_size, embedding_dim)
        :param tuple(Tensor, Tensor) hidden: (h, c) at time t-1
        :param Tensor context: Attention context #(batch_size, embedding_dim, n_set)
        :param Tensor mask: 1 for still-selectable elements, 0 otherwise #(batch_size, n_set)
        :return: new hidden state, new cell state, attention probabilities
        """
        h, c = hidden
        h_t, c_t = self.lstmcell(x, (h, c))
        # Elements whose mask entry is 0 were already picked and are excluded.
        hidden_t, output = self.att(h_t, context, torch.eq(mask, 0))
        hidden_t = torch.tanh(self.hidden_out(torch.cat((hidden_t, h_t), 1)))
        return hidden_t, c_t, output

    def forward(self, embedded_inputs,
                decoder_input,
                hidden,
                context):
        """
        Decoder - Forward-pass
        :param Tensor embedded_inputs: Embedded set elements #(batch_size, embedding_dim, n_set)
        :param Tensor decoder_input: First decoder's input #(batch_size, embedding_dim)
        :param Tensor hidden: First decoder's hidden states #((batch_size, hidden_dim),(batch_size, hidden_dim))
        :param Tensor context: Attention context #(batch_size, embedding_dim, n_set)
        :return: (Output probabilities #(batch_size, n_set, n_set),
                  Pointers indices #(batch_size, n_set)), last hidden state
        """

        batch_size = embedded_inputs.size(0)
        # The size of the set
        input_length = embedded_inputs.size(2)

        # (batch, seq_len), all ones: nothing has been selected yet
        mask = self.mask.repeat(input_length).unsqueeze(0).repeat(batch_size, 1)
        self.att.init_inf(mask.size())

        # arange(input_length) on the module's device, broadcast across the batch
        runner = torch.arange(input_length, device=self.runner.device)
        runner = runner.unsqueeze(0).expand(batch_size, -1)

        outputs = []
        pointers = []

        # Recurrence loop: one pointer per set element
        for _ in range(input_length):
            h_t, c_t, outs = self._step(decoder_input, hidden, context, mask)
            hidden = (h_t, c_t)

            # Masking selected inputs
            masked_outs = outs * mask

            # Most probable still-available element of each set
            _, indices = masked_outs.max(1)
            one_hot_pointers = (runner == indices.unsqueeze(1)).float()

            # Update mask to ignore seen indices
            mask = mask * (1 - one_hot_pointers)

            # The embedding of the selected element is the next decoder input.
            gather_index = indices.view(batch_size, 1, 1).expand(-1, self.embedding_dim, 1)
            decoder_input = embedded_inputs.gather(2, gather_index).squeeze(2)

            outputs.append(outs)
            pointers.append(indices.unsqueeze(1))

        outputs = torch.stack(outputs, dim=1)
        pointers = torch.cat(pointers, 1)

        return outputs, pointers, hidden

In [9]:
class ReadProcessWrite(nn.Module):
    """
    Full read-process-write model from the Order Matters paper: a reader embeds
    the set elements into memories, the process block summarises the memories,
    and the write block points at the memories one after the other.
    """
    def __init__(self, read_hidden_dims, write_hidden_dim, lstm_steps, batch_size, input_dim=1, reader='linear'):
        super().__init__()
        memory_dim = read_hidden_dims[-1]
        self.readers_dict = {'linear': ReadLinear, 'words': ReadWordEncoder, 'videos': ReadLinear}

        # Learnable first decoder input and first decoder output state.
        self.decoder_input0 = nn.Parameter(torch.zeros(memory_dim))
        self.decoder_output0 = nn.Parameter(torch.zeros(write_hidden_dim))

        reader_cls = self.readers_dict[reader]
        self.read = reader_cls(read_hidden_dims, input_dim)
        self.process = Process(memory_dim, memory_dim, lstm_steps, batch_size)
        self.write = Write(memory_dim, write_hidden_dim)
        self.batch_size = batch_size
        # Projects the concatenated process output (2 * memory_dim) to the
        # state size of the write block.
        self.process_to_write = nn.Linear(memory_dim * 2, write_hidden_dim)

    def forward(self, x):
        """
        Run the three blocks on a batch ``x`` and return what the write block
        returns: ``(outputs, pointers, hidden)``.
        """
        n_samples = x.size(0)
        memories = self.read(x)
        readout, cell_state = self.process(memories)
        q_star = torch.cat((readout, cell_state), dim=-1)  # (batch_size, 2 * memory_dim)

        # The projected q_star is the second element of the write block's
        # initial state pair; the first element is the learnable
        # decoder_output0 broadcast over the batch.
        projected_state = self.process_to_write(q_star)                      # (batch_size, write_hidden_dim)
        initial_output = self.decoder_output0.unsqueeze(0).expand(n_samples, -1)
        first_input = self.decoder_input0.unsqueeze(0).expand(n_samples, -1)

        return self.write(memories, first_input, (initial_output, projected_state), memories)

In [None]:
# Hyper-parameters for the smoke-test cells below.
read_hidden_dims = [256]
# NOTE(review): process_hidden_dim is not referenced by any other cell visible here — confirm it is still needed
process_hidden_dim = 128
write_hidden_dim = 64
lstm_steps = 10
batch_size = 128
input_dim = 2048


In [None]:
set_size = 5
# Random batch laid out as (batch_size, input_dim, set_size).
# NOTE(review): confirm this 3-D layout is what the selected reader's forward() expects
x = torch.rand(batch_size, input_dim, set_size)
#if torch.cuda.is_available():
#    x = x.cuda()
#x

In [None]:
# Build the model with the default reader ('linear'); input_dim is passed positionally.
rpw = ReadProcessWrite(read_hidden_dims, write_hidden_dim, lstm_steps, batch_size, input_dim)

In [None]:
# Forward pass on the random batch; the cell displays the (outputs, pointers, hidden) result.
rpw(x)

# Training

In [10]:
def create_model(args):
    """
    Build a ReadProcessWrite model from ``args`` and optionally restore weights.

    Reads ``args.read_hidden_dims``, ``args.write_hidden_dim``, ``args.lstm_steps``,
    ``args.batch_size``, ``args.input_dim``, ``args.reader`` (defaults to
    ``'linear'`` when absent) and ``args.resume``.

    When ``args.resume`` points to an existing file, the checkpoint's
    ``'state_dict'`` is loaded into the model and ``args.start_epoch`` is set
    from the checkpoint's ``'epoch'`` entry.

    Returns
    -------
    The (possibly restored) model.
    """
    print("=> creating model")
    model = ReadProcessWrite(args.read_hidden_dims, args.write_hidden_dim, args.lstm_steps,
                             args.batch_size, args.input_dim,
                             # Forward the configured reader; it used to be silently
                             # ignored, so the model always used the 'linear' reader.
                             reader=getattr(args, 'reader', 'linear'))

    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            # NOTE(review): torch.load unpickles the file; only resume from trusted checkpoints.
            if getattr(args, 'USE_CUDA', torch.cuda.is_available()):
                checkpoint = torch.load(args.resume)
            else:
                checkpoint = torch.load(args.resume, map_location='cpu')
            args.start_epoch = checkpoint['epoch']
            state_dict = checkpoint['state_dict']
            try:
                model.load_state_dict(state_dict)
            except RuntimeError:
                print('Could not load state_dict. Attempting to correct for DataParallel module.* parameter names. This may not be the problem however...')
                # This catches the case when the model file was saved in DataParallel
                # state: drop the leading `module.` from the keys that have it and
                # leave every other key untouched.
                from collections import OrderedDict
                prefix = 'module.'
                new_state_dict = OrderedDict(
                    (k[len(prefix):] if k.startswith(prefix) else k, v)
                    for k, v in state_dict.items()
                )
                # load params
                model.load_state_dict(new_state_dict)
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    return model

In [11]:
def write_weights(weights_indices, parameters, writer, n_iter):
    """
    Log a few tracked weights of every trainable parameter to the writer.

    Parameters
    =========
    weights_indices: dict mapping a parameter name to the positions, in the
    flattened parameter, of the weights to log

    parameters: list of tuples (name, torch.Tensor parameter)
    writer: the tensorboardX writer object
    n_iter: the iteration under which the values are saved
    """
    for param_name, tensor in parameters:
        if not tensor.requires_grad:
            continue
        # One scalar group per parameter, keyed by the flat position.
        tracked = {
            f'{position}': tensor.data.flatten()[position]
            for position in weights_indices[param_name]
        }
        writer.add_scalars(f'data/weigths/{param_name}', tracked, n_iter)
                   

def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    """
    Save ``state`` to ``filename + '_latest.pth.tar'`` and, when ``is_best`` is
    true, copy that file to ``filename + '_best.pth.tar'``.

    The directory part of ``filename`` is created when it does not exist.
    """
    # Imported here because the notebook's import cells do not import shutil.
    import shutil

    directory = os.path.dirname(filename)
    # dirname is '' for a bare file name, and os.makedirs('') raises.
    if directory:
        os.makedirs(directory, exist_ok=True)
    latest_path = filename + '_latest.pth.tar'
    torch.save(state, latest_path)
    if is_best:
        shutil.copyfile(latest_path, filename + '_best.pth.tar')

In [12]:
def collate_fn_list(batch):
    """
    Collate function that drops ``None`` samples and, when the first element of
    the samples differs in length along its first axis, zero-pads those first
    elements to a common length before handing over to ``default_collate``.
    """
    samples = [sample for sample in batch if sample is not None]

    leading_sizes = {sample[0].size(0) for sample in samples}
    if len(leading_sizes) <= 1:
        # Nothing to pad: all first elements already share the same length.
        return default_collate(samples)

    padded = pad_sequence([sample[0] for sample in samples], batch_first=True)
    # Swap each sample's first element for its padded version; the remaining
    # elements are kept untouched.
    repacked = [(padded[k],) + sample[1:] for k, sample in enumerate(samples)]
    return default_collate(repacked)

In [13]:
def train(train_loader, val_loader, model, criterion, optimizer, epoch, writer, args):
    """
    Train ``model`` for one epoch, then evaluate it on ``val_loader``.

    Every ``args.print_offset`` mini-batches the running training loss is
    printed and logged to ``writer`` together with the tracked weights
    (``args.weights_indices`` / ``args.parameters``). The average validation
    loss is logged at the end of the epoch.

    Returns
    -------
    The average validation loss returned by ``val``.
    """
    model.train()

    device = f'cuda:{torch.cuda.current_device()}' if torch.cuda.is_available() else 'cpu'

    # Training
    running_loss = 0.0
    loader_len = len(train_loader)
    for i, data in enumerate(train_loader, 0):
        X, Y, additional_dict = data
        # Transfer to the compute device
        X, Y = X.to(device).float(), Y.to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs, pointers, hidden = model(X)

        # Flatten so that every pointer step is one row of scores and one target.
        outputs = outputs.contiguous().view(-1, outputs.size()[-1])
        Y = Y.view(-1)

        loss = criterion(outputs, Y)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % args.print_offset == args.print_offset - 1:    # every print_offset mini-batches
            global_step = i + 1 + epoch*loader_len
            print('[%d, %5d] loss: %.6f' %
                  (epoch + 1, i + 1, running_loss / args.print_offset))
            writer.add_scalar('data/losses/train_loss', running_loss/args.print_offset, global_step)
            write_weights(args.weights_indices, args.parameters, writer, global_step)
            running_loss = 0

    # Validation
    avg_val_loss = val(val_loader, model, criterion, epoch)
    # Log the validation loss itself, not the leftover training running loss.
    writer.add_scalar('data/losses/val_loss', avg_val_loss, (epoch+1)*loader_len)

    return avg_val_loss


In [14]:
def val(val_loader, model, criterion, epoch=0):
    """
    Evaluate ``model`` on ``val_loader`` and return the loss averaged over batches.

    Parameters
    ----------
    val_loader: iterable of ``(X, Y, additional_dict)`` batches
    model: module whose forward returns ``(outputs, pointers, hidden)``
    criterion: loss called as ``criterion(outputs, Y)`` on the flattened tensors
    epoch: zero-based epoch number, only used in the printed message

    Raises
    ------
    ValueError: when ``val_loader`` yields no batch.
    """
    # switch to eval mode
    model.eval()

    device = f'cuda:{torch.cuda.current_device()}' if torch.cuda.is_available() else 'cpu'

    val_loss = 0.0
    n_batches = 0
    with torch.no_grad():
        for X, Y, additional_dict in val_loader:
            # Transfer to the compute device
            X, Y = X.to(device).float(), Y.to(device)

            # forward only
            outputs, pointers, hidden = model(X)

            outputs = outputs.contiguous().view(-1, outputs.size()[-1])
            Y = Y.view(-1)
            val_loss += criterion(outputs, Y).item()
            n_batches += 1

    if n_batches == 0:
        # Without this guard the average below would divide by zero.
        raise ValueError('val_loader yielded no batches; cannot compute a validation loss')

    avg_val_loss = val_loss / n_batches
    print(f'Epoch {epoch + 1} validation loss: {avg_val_loss}')

    return avg_val_loss

In [15]:
def main(args):
    """
    Train a ReadProcessWrite model from the settings in ``args``.

    Loads the pickled splits from ``args.pickle_file``, builds the datasets and
    loaders for ``args.reader``, creates the model, then trains for
    ``args.epochs`` epochs. A checkpoint is saved after every epoch and the one
    with the lowest validation loss so far is additionally kept as the best.
    """
    args.USE_CUDA = torch.cuda.is_available()
    if args.USE_CUDA:
        print('Using GPU, %i devices.' % torch.cuda.device_count())

    # NOTE(review): pickle.load can execute arbitrary code; only point
    # args.pickle_file at trusted files.
    with open(args.pickle_file, 'rb') as f:
        dict_data = pickle.load(f)

    # Runs are numbered after the entries already present under saveprefix.
    runs = glob(args.saveprefix+'/*')
    it = len(runs) + 1
    writer = SummaryWriter(os.path.join(args.tensorboard_saveprefix, str(it)))
    try:
        writer.add_text('Metadata', 'Run {} metadata :\n{}'.format(it, args,))

        dataset_class = DATASET_CLASSES[args.reader]

        train_ds = dataset_class(dict_data['train'])
        val_ds = dataset_class(dict_data['val'])

        train_loader = torch.utils.data.DataLoader(
                train_ds,
                batch_size=args.batch_size, shuffle=True,
                collate_fn = collate_fn_list,
                num_workers=args.workers, pin_memory=True,
        )

        val_loader = torch.utils.data.DataLoader(
                val_ds,
                batch_size=args.batch_size, shuffle=True,
                collate_fn = collate_fn_list,
                num_workers=args.workers, pin_memory=True)

        model = create_model(args)

        # Pick up to 5 random positions per trainable parameter to track in tensorboard.
        args.weights_indices = {}
        args.parameters = list(model.named_parameters())
        for name, param in args.parameters:
            if param.requires_grad:
                size = list(param.data.flatten().size())[0]
                args.weights_indices[name] = random.sample(range(size), min(5, size))

        if args.USE_CUDA:
            device = f'cuda:{torch.cuda.current_device()}'
            model.to(device)
            # NOTE(review): a DataParallel wrapper used to be built here but was
            # never used (training ran on `model`); it has been dropped.
            cudnn.benchmark = True

        criterion = torch.nn.CrossEntropyLoss()
        optimizer = Adam(filter(lambda p: p.requires_grad,
                                        model.parameters()),
                                 lr=args.lr)

        best_val_loss = np.inf
        for epoch in range(args.epochs):
            val_loss = train(train_loader, val_loader, model, criterion, optimizer, epoch, writer, args)

            # Lower is better: starting from +inf, a `>` test would never flag a best epoch.
            is_best = val_loss < best_val_loss
            if is_best:
                best_val_loss = val_loss
            save_checkpoint({
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'val_loss': val_loss,
            }, is_best, os.path.join(args.saveprefix, str(it), f'ep_{epoch+1}_map_{best_val_loss:.3}'))
    finally:
        # Flush and close the event file even when training fails.
        writer.close()

In [20]:
# Dataset class to use for each value of the `reader` option.
DATASET_CLASSES = {'linear': DigitsDataset, 'words': WordsDataset, 'videos': VideosDataset}
LETTERS = 'abcdefghijklmnopqrstuvwxyz'

# Data and output locations.
#PICKLE_FILE = '../../s3-drive/set_to_sequence/video_reordering_18374_3937_5_2019-06-18_11:45:26.327081.pkl'
PICKLE_FILE = '../../s3-drive/set_to_sequence/video_reordering_unpooled.pkl' 
RESUME = ''
SAVEPREFIX = '../checkpoints/videos'
TENSORBOARD_SAVEPREFIX = '../tensorboard/videos'

# Model.
READER = 'videos'
INPUT_DIM = 2048
READ_HIDDEN_DIMS = [128, 128]
WRITE_HIDDEN_DIM = 64
LSTM_STEPS = 10
DROPOUT = 0.2

# Optimisation.
BATCH_SIZE = 32
LR = 1e-4
WEIGHT_DECAY = 1e-4
MOMENTUM = .9
NESTEROV = False
EPOCHS = 100

# Runtime.
WORKERS = 4
PRINT_OFFSET = 100

# Start from an empty namespace (no command-line arguments are parsed) and
# fill it from the constants above.
parser = argparse.ArgumentParser()
ARGS = parser.parse_args(args=[])
vars(ARGS).update(
    pickle_file=PICKLE_FILE,
    saveprefix=SAVEPREFIX,
    tensorboard_saveprefix=TENSORBOARD_SAVEPREFIX,
    batch_size=BATCH_SIZE,
    read_hidden_dims=READ_HIDDEN_DIMS,
    write_hidden_dim=WRITE_HIDDEN_DIM,
    lr=LR,
    weight_decay=WEIGHT_DECAY,
    momentum=MOMENTUM,
    nesterov=NESTEROV,
    epochs=EPOCHS,
    lstm_steps=LSTM_STEPS,
    input_dim=INPUT_DIM,
    reader=READER,
    dropout=DROPOUT,
    workers=WORKERS,
    resume=RESUME,
    print_offset=PRINT_OFFSET,
)

In [None]:
# Launch training with the settings collected in ARGS.
main(ARGS)

Using GPU, 4 devices.


# Junk

In [None]:
# Scratch: list the trainable parameters of the demo model.
l = list(rpw.named_parameters())
for name, param in l:
    if param.requires_grad:
        print(name, tuple(param.shape))

In [None]:
import random
# Scratch: pick up to 5 random flat positions per trainable parameter of the demo model.
weights_indices = {}
l = list(rpw.named_parameters())
for name, param in l:
    if param.requires_grad:
        size = param.data.numel()
        # min(): random.sample raises ValueError when asked for more items than exist.
        weights_indices[name] = random.sample(range(size), min(5, size))
weights_indices

In [None]:
def write_weights(weights_indices, parameters, writer, n_iter=None):
    """
    Scratch variant: log the tracked weights of all trainable parameters as a
    single scalar group named 'data/weights'.

    weights_indices: dict mapping a parameter name to flat positions to log
    parameters: iterable of (name, torch.Tensor parameter)
    writer: object exposing add_scalars(tag, values, step)
    n_iter: step passed to the writer (None when omitted)
    """
    weights_data = {}
    for name, param in parameters:
        if param.requires_grad:
            flat = param.data.flatten()
            for idx in weights_indices[name]:
                weights_data[f'{name}.{idx}'] = flat[idx]
    writer.add_scalars('data/weights', weights_data, n_iter)

In [None]:
# NOTE(review): `outputs` is not assigned at top level in any cell visible here — this relies on leftover kernel state
outputs

In [None]:
# Toy sorting task: every target row is the corresponding input row sorted in ascending order.
X_train = np.random.uniform(size=(10000, 5))
Y_train = np.sort(X_train, axis=1)
X_val = np.random.uniform(size=(10000, 5))
Y_val = np.sort(X_val, axis=1)
Y_train[:5,:]

In [None]:
# Pack the toy arrays as lists of (input row, target row) pairs per split.
# Both splits are indexed over the number of training rows.
n_rows = X_train.shape[0]
dict_data = {
    'attributes': None,
    'split': {
        'train': [(X_train[row, :], Y_train[row, :]) for row in range(n_rows)],
        'val': [(X_val[row, :], Y_val[row, :]) for row in range(n_rows)],
    },
}
dict_data