In [1]:
import torch
import torch.nn as nn
import numpy as np
from torch.autograd import Variable

import json

import gzip
import pandas as pd

In [296]:
class Decoder(nn.Module):

    def forward(self, decoder_states, teacher_captions,
                use_teacher_forcing=False):
        """(BxD, BxKxV) -> BxKxV"""
        raise NotImplementedError
        

class LSTMDecoder(Decoder):

    def __init__(self, embedding_size, hidden_size, vocab_size,
                 num_lstm_layers, go_token=0, gpus=None):

        super(LSTMDecoder, self).__init__()
        self.num_lstm_layers = num_lstm_layers

        # Embed each token in vocab to a 128 dimensional vector
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.mapping = nn.Embedding(178, hidden_size)

        # batch_first: whether input and output are (batch, seq, feature)
        self.lstm = nn.LSTM(embedding_size, hidden_size, self.num_lstm_layers, batch_first=True)

        self.linear = nn.Linear(hidden_size, vocab_size)
        self.logsoftmax = nn.LogSoftmax()
        self.use_cuda = True if gpus else False
        self.gpus = gpus
        self.go_token = go_token

    def init_hidden(self, features):
        """
        Hidden states of the LSTM are initialized with features.
        c0 and h0 should have the shape of 1 * batch_size * hidden_size
        """

        c0 = self.mapping(features).unsqueeze(0)
        h0 = self.mapping(features).unsqueeze(0)
        return h0, c0

    def forward(self, features, captions, use_teacher_forcing=False):
        """
        This method computes the forward pass of the decoder with or without
        teacher forcing. It should be noted that the <GO> token is
        automatically appended to the input captions.
        Args:
            features: Video features extracted by the encoder.
            captions: Video captions (required if use_teacher_forcing=True).
            use_teacher_forcing: Whether to use teacher forcing or not.
        Returns:
            The probability distribution over the vocabulary across the entire
            sequence.
        """

        batch_size, num_step = captions.size()
        go_part = Variable(self.go_token * torch.ones(batch_size, 1).long())
        if self.use_cuda:
            go_part = go_part.cuda(self.gpus[0])

        if use_teacher_forcing:
            # Add go token and remove the last token for all captions
            captions_with_go_token = torch.cat([go_part, captions[:, :-1]], 1)
            probs, _ = self.apply_lstm(features, captions_with_go_token)
        else:
            # Without teacher forcing: use its own predictions as the next input
            probs = self.predict(features, go_part, num_step)

        return probs

    def apply_lstm(self, features, captions, lstm_hidden=None):

        if lstm_hidden is None:
            lstm_hidden = self.init_hidden(features)
        embedded_captions = self.embedding(captions)
        lstm_output, lstm_hidden = self.lstm(embedded_captions, lstm_hidden)

        # Project features in a 'vocab_size'-dimensional space
        lstm_out_projected = torch.stack([self.linear(h) for h in lstm_output],
                                         0)
        probs = torch.stack([self.logsoftmax(h) for h in lstm_out_projected], 0)

        return probs, lstm_hidden

    def predict(self, features, go_tokens, num_step=1):
        lstm_input = go_tokens
        output_probs = []
        lstm_hidden = None

        for i in range(num_step):
            probs, lstm_hidden = self.apply_lstm(features, lstm_input,
                                                 lstm_hidden)

            output_probs.append(probs)
            # Greedy decoding
            _, preds = torch.max(probs, dim=2)

            lstm_input = preds

        concatenated_probs = torch.cat(output_probs, dim=1)
        return concatenated_probs
    
    
    
class Decoder(nn.Module):

    def forward(self, decoder_states, teacher_captions,
                use_teacher_forcing=False):
        """(BxD, BxKxV) -> BxKxV"""
        raise NotImplementedError
        

class LSTMDecoder2(Decoder):

    def __init__(self, embedding_size, hidden_size, vocab_size,
                 num_lstm_layers, go_token=0, gpus=None):

        super().__init__()
        self.num_lstm_layers = num_lstm_layers

        # Embed each token in vocab to a 128 dimensional vector
        self.embedding = nn.Embedding(vocab_size, embedding_size)

        # batch_first: whether input and output are (batch, seq, feature)
        self.lstm = nn.LSTM(embedding_size, hidden_size, self.num_lstm_layers, batch_first=True)

        self.linear = nn.Linear(hidden_size, vocab_size)
        self.logsoftmax = nn.LogSoftmax()
        self.use_cuda = True if gpus else False
        self.gpus = gpus
        self.go_token = go_token

    def init_hidden(self, features):
        """
        Hidden states of the LSTM are initialized with features.
        c0 and h0 should have the shape of 1 * batch_size * hidden_size
        """

        c0 = features.unsqueeze(0)
        h0 = features.unsqueeze(0)
        return h0, c0

    def forward(self, features, captions, use_teacher_forcing=False):
        """
        This method computes the forward pass of the decoder with or without
        teacher forcing. It should be noted that the <GO> token is
        automatically appended to the input captions.
        Args:
            features: Video features extracted by the encoder.
            captions: Video captions (required if use_teacher_forcing=True).
            use_teacher_forcing: Whether to use teacher forcing or not.
        Returns:
            The probability distribution over the vocabulary across the entire
            sequence.
        """

        batch_size, num_step = captions.size()
        go_part = Variable(self.go_token * torch.ones(batch_size, 1).long())
        if self.use_cuda:
            go_part = go_part.cuda(self.gpus[0])

        if use_teacher_forcing:
            # Add go token and remove the last token for all captions
            captions_with_go_token = torch.cat([go_part, captions[:, :-1]], 1)
            probs, _ = self.apply_lstm(features, captions_with_go_token)
        else:
            # Without teacher forcing: use its own predictions as the next input
            probs = self.predict(features, go_part, num_step)

        return probs

    def apply_lstm(self, features, captions, lstm_hidden=None):

        if lstm_hidden is None:
            lstm_hidden = self.init_hidden(features)
        embedded_captions = self.embedding(captions)
        lstm_output, lstm_hidden = self.lstm(embedded_captions, lstm_hidden)

        # Project features in a 'vocab_size'-dimensional space
        lstm_out_projected = torch.stack([self.linear(h) for h in lstm_output],
                                         0)
        probs = torch.stack([self.logsoftmax(h) for h in lstm_out_projected], 0)

        return probs, lstm_hidden

    def predict(self, features, go_tokens, num_step=1):
        lstm_input = go_tokens
        output_probs = []
        lstm_hidden = None

        for i in range(num_step):
            probs, lstm_hidden = self.apply_lstm(features, lstm_input,
                                                 lstm_hidden)

            output_probs.append(probs)
            # Greedy decoding
            _, preds = torch.max(probs, dim=2)

            lstm_input = preds

        concatenated_probs = torch.cat(output_probs, dim=1)
        return concatenated_probs

In [3]:
def open_annotation(path):
    if path.endswith("gz"):
        with gzip.open(path, "rb") as f:
            json = pd.read_json(f.read().decode("utf-8"))
    else:
        json = pd.read_json(path)
    return json

In [4]:
import numpy as np
import os
import pickle
import re

from collections import Counter

class Tokenizer(object):

    GO = "<GO>"
    END = "<END>"
    UNK = "<UNK>"

    def __init__(self, captions=None, user_maxlen=None, cutoff=0):
        """
            Build captions from all the expanded labels in all annotation files.
        Args:
            captions: list of paths to annotation files.
            user_maxlen: the maximum length of the captions set by the user.
        """

        self.maxlen = None if user_maxlen is None else user_maxlen
        self.cutoff = cutoff
        if captions:
            self.build_dictionaries(captions)

    def build_dictionaries(self, captions):
        """
            Builds two dictionaries: One that maps from tokens to ints, and
            another that maps from ints back to tokens.
        """

        maxlen = np.max([len(caption.split()) for caption in captions]) + 1

        self.set_maxlen(maxlen)

        print("\nBuilding dictionary for captions...")
        extra_tokens = [self.GO, self.END, self.UNK]
        tokens = [self.tokenize(p) for p in captions]
        tokens = [item for sublist in tokens for item in sublist]
        tokens = self.filter_tokens(tokens)
        all_tokens = extra_tokens + sorted(set(tokens))
        print("Number of different tokens: ", len(all_tokens))
        self.caption_dict = {k: idx for idx, k in enumerate(all_tokens)}
        self.inv_caption_dict = {idx: k for k, idx in self.caption_dict.items()}
        print(self.caption_dict)
        print(self.inv_caption_dict)

    def tokenize(self, caption):
        tokenize_regex = re.compile("[^A-Z\s]")
        return [x for x in tokenize_regex.sub(
            "", caption.upper()).split(" ") if x is not ""]

    def filter_tokens(self, tokens):
        count = Counter(tokens)
        return [token for token in count if count[token] > self.cutoff]

    def encode_caption(self, caption):

        tokenized_caption = self.tokenize(caption)
        if len(tokenized_caption) >= self.maxlen:
            tokenized_caption = tokenized_caption[0:self.maxlen - 1]
        encoded_caption = [self.encode_token(token)
                           for token in tokenized_caption]
        return self.pad_with_end(encoded_caption)

    def encode_token(self, token):
        return self.caption_dict[token] if token in self.caption_dict else \
            self.caption_dict[self.UNK]

    def decode_caption(self, indices):
        return [self.inv_caption_dict[index] for index in indices]

    def pad_with_end(self, encoded_caption):
        num_end = self.maxlen - len(encoded_caption)
        return encoded_caption + num_end * [self.caption_dict[self.END]]

    def get_vocab_size(self):
        return len(self.caption_dict)

    def get_string(self, predictions):
        output_tokens = self.decode_caption(predictions)
        if self.END in output_tokens:
            end_index = output_tokens.index(self.END)
        else:
            end_index = len(predictions)
        return " ".join(output_tokens[:end_index]).upper()

    def set_maxlen(self, maxlen):
        assert maxlen >= 1
        if self.maxlen is None:
            self.maxlen = maxlen
        else:
            self.maxlen = np.min([self.maxlen, maxlen])

    def load_dictionaries(self, path):
        with open(os.path.join(path, "tokenizer_dicts"), "rb") as f:
            (self.maxlen, self.caption_dict,
             self.inv_caption_dict) = pickle.load(f)

    def save_dictionaries(self, path):
        with open(os.path.join(path, "tokenizer_dicts"), "wb") as f:
            pickle.dump((self.maxlen, self.caption_dict,
                         self.inv_caption_dict), f)

In [112]:
class SequenceCrossEntropy(nn.Module):

    def __init__(self, loss=nn.NLLLoss):
        super(SequenceCrossEntropy, self).__init__()
        self.loss_function = loss()

    def forward(self, preds, target):
        batch_size, num_step, _ = preds.size()
        loss = 0.
        for t in range(num_step):
            loss += self.loss_function(preds[:, t], target[:, t])
        return loss / (batch_size*num_step)

In [113]:
annot = open_annotation('/data/20bn-somethingsomething/json/train_20171102.json.gz')

In [114]:
unique_templates = sorted(set(list(annot.template)))

In [115]:
len(unique_templates)

178

In [116]:
tokenizer = Tokenizer()

In [117]:
tokenizer.build_dictionaries(unique_templates)


Building dictionary for captions...
Number of different tokens:  232
{'STICK': 179, '<UNK>': 2, 'WET': 222, 'ON': 109, 'NEXT': 103, 'ALMOST': 8, 'DOWNWARDS': 53, 'SEPARATES': 148, 'MAKING': 97, 'UNFOLDING': 215, 'GETS': 75, 'UPRIGHT': 218, 'BUT': 30, 'SQUEEZING': 173, 'GLIDE': 76, 'AS': 15, 'PUT': 135, 'ROLL': 143, 'TABLE': 188, 'SHADOW': 149, 'NUMBER': 106, 'SLANTED': 154, 'SPILLING': 164, '<END>': 1, 'FALLING': 65, 'FOR': 72, 'BECAUSE': 21, 'AIR': 7, 'TILTING': 200, 'FAILING': 63, 'ONTO': 111, 'SLIDE': 155, 'APPROACHING': 12, 'HOLE': 82, 'QUICKLY': 137, 'PASS': 121, 'ONE': 110, 'SUPPORTED': 185, 'TWO': 211, 'IS': 86, 'TO': 202, 'PRETENDING': 132, 'ARE': 13, 'OPENING': 113, 'FEATHER': 67, 'THROWING': 199, 'UPWARDS': 220, 'ATTACH': 16, 'PILING': 126, 'COLLAPSING': 39, 'DIGGING': 49, 'NOTHING': 105, 'WITHOUT': 228, 'COMPLETELY': 44, 'THEY': 196, 'MANY': 98, 'COME': 42, 'SPRINKLING': 171, 'ENDS': 61, 'PHOTO': 122, 'PAPER': 119, 'SHOW': 150, 'PICK': 123, 'OPEN': 112, 'JUST': 89, 'ALONG':

In [118]:
tokenizer.tokenize(unique_templates[0])

tokenizer.encode_caption(unique_templates[1])

[17, 162, 202, 162, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [208]:
net = LSTMDecoder(256, 100, tokenizer.get_vocab_size(), 1, gpus=[0])

In [209]:
cls = 0

caption = tokenizer.encode_caption(unique_templates[cls])
caption = torch.from_numpy(np.array(caption)[None]).long()

features = cls * torch.ones((1, 1)).long()

In [210]:
caption = Variable(caption)
features = Variable(features)

In [211]:
#pred = net.forward(features, caption)

In [212]:
loss = SequenceCrossEntropy()

In [213]:
caption.size()

torch.Size([1, 18])

In [214]:
#l = loss(pred, caption)

In [215]:
from torch.optim import Adam

In [231]:
net = LSTMDecoder(256, 50, tokenizer.get_vocab_size(), 1, gpus=[0])

In [297]:
net2 = LSTMDecoder2(256, 178, tokenizer.get_vocab_size(), 1, gpus=[0])

In [298]:
# Get inputs
batch_caption = []
batch_features = []

for cls, template in enumerate(unique_templates):
    caption = tokenizer.encode_caption(template)
    caption = torch.from_numpy(np.array(caption)).long()
        
    one_hot = np.zeros(178, 'float32')
    one_hot[cls] = 1.
    features = torch.from_numpy(one_hot)
    # cls * torch.ones((1)).long()
    
    batch_caption.append(caption)
    batch_features.append(features)

In [299]:
batch_caption = torch.stack(batch_caption, dim=0)
batch_features = torch.stack(batch_features, dim=0)

# Convert to variable
batch_caption = Variable(batch_caption)
batch_features = Variable(batch_features)

In [300]:
net2 = net2.cuda()
loss = loss.cuda()

optimizer = Adam(net2.parameters(), lr=0.001)

batch_caption = batch_caption.cuda().long()
batch_features = batch_features.cuda()

In [301]:
def token_level_accuracy(captions, predictions, num_tokens=None):
    equal_values = captions[:, 0:num_tokens].eq(
        predictions[:, 0:num_tokens])
    accuracy = equal_values.float().mean().data.numpy()[0] * 100.0
    return accuracy

In [302]:
batch_caption

Variable containing:
   12   162   227  ...      1     1     1
   17   162   202  ...      1     1     1
   25   162   159  ...      1     1     1
       ...          ⋱          ...       
  162    41   227  ...      1     1     1
  162    65    95  ...      1     1     1
  162    65    95  ...      1     1     1
[torch.cuda.LongTensor of size 178x18 (GPU 0)]

In [307]:
num_epochs = 1000
valid_num = 5

for epoch in range(num_epochs):
        
    optimizer.zero_grad()
    net2.zero_grad()
    # Forward pass
    probs = net2.forward(batch_features, batch_caption, use_teacher_forcing=True)
    l = loss(probs, batch_caption)
    # Backward
    l.backward()
    optimizer.step()
        
    # Print
    l = l.cpu().data.numpy()[0]
    _, preds = torch.max(probs, dim=2)
    acc = token_level_accuracy(batch_caption.cpu(), preds.cpu())
    print('Train epoch {} - Loss = {} - Acc = {}'.format(epoch, l, acc))
    
    if epoch % valid_num == 0:
        pred = net2.forward(batch_features, batch_caption, use_teacher_forcing=False)
        l = loss(pred, batch_caption)    
        # Print
        l = l.cpu().data.numpy()[0]
        _, preds = torch.max(probs, dim=2)
        acc = token_level_accuracy(batch_caption.cpu(), preds.cpu())
        print('*' * 100)
        print('Valid epoch {} - Loss = {} - Acc = {}'.format(epoch, l, acc))
        print('*' * 100)
        
        if acc == 100.:
            break

Train epoch 0 - Loss = 0.00039788338472135365 - Acc = 99.25093650817871
****************************************************************************************************
Valid epoch 0 - Loss = 0.0022917857859283686 - Acc = 99.25093650817871
****************************************************************************************************
Train epoch 1 - Loss = 0.00039605548954568803 - Acc = 99.25093650817871
Train epoch 2 - Loss = 0.0003942335315514356 - Acc = 99.25093650817871
Train epoch 3 - Loss = 0.0003924175980500877 - Acc = 99.25093650817871
Train epoch 4 - Loss = 0.0003906078345607966 - Acc = 99.25093650817871
Train epoch 5 - Loss = 0.00038880392094142735 - Acc = 99.25093650817871
****************************************************************************************************
Valid epoch 5 - Loss = 0.0022651341278105974 - Acc = 99.25093650817871
****************************************************************************************************
Train epoch 6 - Loss = 0.

****************************************************************************************************
Valid epoch 65 - Loss = 0.001530561363324523 - Acc = 99.59425926208496
****************************************************************************************************
Train epoch 66 - Loss = 0.0002904685970861465 - Acc = 99.59425926208496
Train epoch 67 - Loss = 0.0002890459727495909 - Acc = 99.59425926208496
Train epoch 68 - Loss = 0.00028762928559444845 - Acc = 99.59425926208496
Train epoch 69 - Loss = 0.00028621856472454965 - Acc = 99.59425926208496
Train epoch 70 - Loss = 0.00028481357730925083 - Acc = 99.59425926208496
****************************************************************************************************
Valid epoch 70 - Loss = 0.0015243350062519312 - Acc = 99.59425926208496
****************************************************************************************************
Train epoch 71 - Loss = 0.00028341446886770427 - Acc = 99.59425926208496
Train epoch 72 - 

Train epoch 130 - Loss = 0.00021075787662994117 - Acc = 99.81273412704468
****************************************************************************************************
Valid epoch 130 - Loss = 0.0006970902322791517 - Acc = 99.81273412704468
****************************************************************************************************
Train epoch 131 - Loss = 0.00020969321485608816 - Acc = 99.84394311904907
Train epoch 132 - Loss = 0.00020863373356405646 - Acc = 99.84394311904907
Train epoch 133 - Loss = 0.00020757952006533742 - Acc = 99.84394311904907
Train epoch 134 - Loss = 0.00020653055980801582 - Acc = 99.87515807151794
Train epoch 135 - Loss = 0.00020548691099975258 - Acc = 99.87515807151794
****************************************************************************************************
Valid epoch 135 - Loss = 0.0004896358586847782 - Acc = 99.87515807151794
****************************************************************************************************
Train 

Train epoch 193 - Loss = 0.00015313691983465105 - Acc = 99.93757605552673
Train epoch 194 - Loss = 0.00015236616309266537 - Acc = 99.93757605552673
Train epoch 195 - Loss = 0.00015159934991970658 - Acc = 99.93757605552673
****************************************************************************************************
Valid epoch 195 - Loss = 0.0002728379622567445 - Acc = 99.93757605552673
****************************************************************************************************
Train epoch 196 - Loss = 0.00015083637845236808 - Acc = 99.93757605552673
Train epoch 197 - Loss = 0.00015007768524810672 - Acc = 99.93757605552673
Train epoch 198 - Loss = 0.00014932321209926158 - Acc = 99.93757605552673
Train epoch 199 - Loss = 0.00014857282803859562 - Acc = 99.93757605552673
Train epoch 200 - Loss = 0.00014782653306610882 - Acc = 99.93757605552673
****************************************************************************************************
Valid epoch 200 - Loss = 0.00026

In [247]:
net2.use_cuda

True

In [304]:
path = "/home/farzaneh/PycharmProjects/pretrained_nets/jester_net_on_smtsmt_20171031/model.checkpoint"

In [305]:
s = torch.load(path)

In [306]:
s

OrderedDict([('logsoftmax.linear.weight', 
              -5.9498e+00 -1.6424e-01 -4.5323e+00  ...  -4.1166e-02 -2.7420e+00 -4.3519e+00
              -1.0498e+00 -1.2226e-01 -6.0664e+00  ...  -2.9235e-02 -2.8050e-01 -9.1082e-01
              -1.5031e+00 -1.6010e-02 -7.6805e+00  ...  -1.7197e-02 -1.0701e+00 -1.5469e+00
                              ...                   ⋱                   ...                
              -1.2371e+00 -1.6942e-01 -4.2709e+00  ...  -6.0752e-04 -9.3195e-01 -1.6218e+00
              -2.7794e+00 -1.6826e-01 -1.7114e+00  ...  -4.4442e-02 -8.1058e-01 -3.2101e-01
              -1.6837e+00 -1.9701e-01 -8.8867e-01  ...   2.2288e-03 -1.4974e+00 -6.3800e-01
              [torch.cuda.FloatTensor of size 178x1024 (GPU 0)]),
             ('logsoftmax.linear.bias', 
              -0.1786
              -0.1345
               0.1546
               0.1494
              -0.0258
              -0.2088
               0.2472
              -0.1185
              -0.2913
        