In [31]:
import json
import os
import argparse
import torch
from torch import nn
from torch.autograd import Variable
from torch.utils.data import DataLoader
from models import EncoderRNN, DecoderRNN, S2VTAttModel, S2VTModel
from dataloader import VideoDataset
import misc.utils as utils
from misc.cocoeval import suppress_stdout_stderr, COCOScorer

from pandas.io.json import json_normalize
import pandas as pd

In [32]:
# Notebook-level settings; a later cell copies every entry into `opt`,
# overriding any value recovered from the options file.
args = dict(
    recover_opt='data/save/opt_info.json',
    saved_model='data/save/model_50.pth',
    dump_json=1,
    results_path='results/',
    dump_path=0,
    gpu='0',
    batch_size=25,
    sample_max=1,
    temperature=1.0,
    beam_size=1,
)


In [33]:
# Recover the options stored in the JSON file named by args["recover_opt"].
# The context manager closes the file handle, which the previous bare
# `open(...)` call left open.
with open(args["recover_opt"]) as opt_file:
    opt = json.load(opt_file)

# Notebook settings take precedence over recovered values with the same key.
opt.update(args)

# Select the GPU(s) visible to CUDA for this process.
os.environ['CUDA_VISIBLE_DEVICES'] = opt["gpu"]

In [34]:
# Build the dataset from the merged options; the second argument is "test"
# (presumably the split selector — confirm against VideoDataset).
dataset = VideoDataset(opt, "test")

vocab size is  16860
number of train videos:  6501
number of val videos:  500
number of test videos:  2999
load feats from ['data/feats/resnet152']
max sequence length in data is 28


In [35]:
# Record the dataset's vocabulary size; the decoder constructor reads it.
opt["vocab_size"] = dataset.get_vocab_size()

In [36]:
# Record the dataset's maximum sequence length.
# NOTE(review): the decoder construction cell reads opt["max_len"], not
# opt["seq_length"] — confirm both keys are intended.
opt["seq_length"] = dataset.max_len

In [37]:
# Display the model name stored in the options.
# NOTE(review): the construction cell instantiates S2VTAttModel directly and
# does not consult this value.
opt["model"]

'S2VTAttModel'

In [38]:
# Encoder and decoder are both configured from the merged options.
encoder = EncoderRNN(
    opt["dim_vid"],
    opt["dim_hidden"],
    bidirectional=opt["bidirectional"],
    input_dropout_p=opt["input_dropout_p"],
    rnn_dropout_p=opt["rnn_dropout_p"],
)
decoder = DecoderRNN(
    opt["vocab_size"],
    opt["max_len"],
    opt["dim_hidden"],
    opt["dim_word"],
    input_dropout_p=opt["input_dropout_p"],
    rnn_dropout_p=opt["rnn_dropout_p"],
    bidirectional=opt["bidirectional"],
)
# The attention model is instantiated directly (opt["model"] is not consulted
# here) and moved to the GPU.
model = S2VTAttModel(encoder, decoder).cuda()

  "num_layers={}".format(dropout, num_layers))


In [39]:
# Restore the weights from the checkpoint at opt["saved_model"].
# NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
model.load_state_dict(torch.load(opt["saved_model"]))

<All keys matched successfully>

In [40]:
# Instantiate the language-model criterion.
# NOTE(review): `crit` is not referenced by any later cell visible here.
crit = utils.LanguageModelCriterion()



In [41]:
# Coerce the encoder RNN's `bidirectional` attribute to a real bool
# (presumably it arrives as an int from the JSON options — TODO confirm).
model.encoder.rnn.bidirectional = bool(model.encoder.rnn.bidirectional)

In [42]:
# Fetch the vocabulary; a later cell looks words up via vocab.get(<str id>).
vocab = dataset.get_vocab()

In [43]:
# Switch the model to evaluation mode before running inference.
model.eval()

S2VTAttModel(
  (encoder): EncoderRNN(
    (vid2hid): Linear(in_features=2048, out_features=512, bias=True)
    (input_dropout): Dropout(p=0.2, inplace=False)
    (rnn): GRU(512, 512, batch_first=True, dropout=0.5)
  )
  (decoder): DecoderRNN(
    (input_dropout): Dropout(p=0.2, inplace=False)
    (embedding): Embedding(16860, 512)
    (attention): Attention(
      (linear1): Linear(in_features=1024, out_features=512, bias=True)
      (linear2): Linear(in_features=512, out_features=1, bias=False)
    )
    (rnn): GRU(1024, 512, batch_first=True, dropout=0.5)
    (out): Linear(in_features=512, out_features=16860, bias=True)
  )
)

In [44]:
# Batch the dataset using opt["batch_size"].
# NOTE(review): shuffle=True with no seed set in the visible cells means the
# batch sampled below differs between runs.
loader = DataLoader(dataset, batch_size=opt["batch_size"], shuffle=True)

In [45]:
# Instantiate the COCO scorer.
# NOTE(review): `scorer` is not referenced by any later cell visible here.
scorer = COCOScorer()

init COCO-EVAL scorer


In [46]:
# Flatten the 'sentences' records of the annotation file at opt["input_json"]
# into a DataFrame. The context manager closes the file handle, which the
# previous bare `open(...)` call left open.
# NOTE(review): `pandas.io.json.json_normalize` is deprecated since pandas 1.0
# in favour of `pandas.json_normalize`; the imported name is kept here so the
# cell stays consistent with the notebook's import cell.
with open(opt["input_json"]) as annotation_file:
    gt_dataframe = json_normalize(json.load(annotation_file)['sentences'])

In [47]:
def convert_data_to_coco_scorer_format(data_frame):
    """Group captions by video id.

    Args:
        data_frame: Object indexable by "caption" and "video_id", each
            yielding an iterable (e.g. a pandas DataFrame). The two
            iterables are paired element by element.

    Returns:
        dict mapping each video id to a list of
        ``{'image_id': video_id, 'cap_id': n, 'caption': caption}`` dicts,
        where ``n`` counts from 0 within each video in input order.
    """
    gts = {}
    for caption, video_id in zip(data_frame["caption"], data_frame["video_id"]):
        entries = gts.setdefault(video_id, [])
        # cap_id is the number of captions already collected for this video.
        entries.append(
            {'image_id': video_id, 'cap_id': len(entries), 'caption': caption})
    return gts

In [48]:
# Group the ground-truth captions by video id.
gts = convert_data_to_coco_scorer_format(gt_dataframe)

In [49]:
# Draw a single batch; later cells read its 'fc_feats', 'labels', 'masks'
# and 'video_ids' entries.
data = next(iter(loader))

In [50]:
# Inspect the shape of the label tensor for the sampled batch.
data['labels'].shape

torch.Size([25, 28])

In [51]:
# Turn each row of label ids into a sentence: every id is looked up in
# `vocab` by its string form, and '<eos>' / '<sos>' entries are dropped.
# Rows are iterated directly rather than via a hard-coded range(25), so the
# cell works for any number of rows in the batch.
sents_label = [
    ' '.join(
        word
        for word in (vocab.get(str(token_id)) for token_id in label_row.tolist())
        if word not in ('<eos>', '<sos>')
    )
    for label_row in data['labels']
]

In [52]:
# Move the batch tensors to the GPU; the video ids are taken as loaded.
# No forward pass or loss computation happens in this cell.
# NOTE(review): `labels` and `masks` are not used by any later cell visible here.
fc_feats = data['fc_feats'].cuda()
labels = data['labels'].cuda()
masks = data['masks'].cuda()
video_ids = data['video_ids']

In [53]:
# Run the model in 'inference' mode on the batch features with gradient
# tracking disabled. `seq_preds` is decoded into sentences in the next cell;
# `seq_probs` is not used by any later cell visible here.
with torch.no_grad():
    seq_probs, seq_preds = model(
        fc_feats, mode='inference', opt=opt)



In [54]:
# Decode the predicted sequences with the vocabulary.
sents = utils.decode_sequence(vocab, seq_preds)

In [69]:
# Collect ids, predicted sentences and reference sentences, in that order;
# the next cell transposes this list into DataFrame columns.
sample_result = [video_ids, sents, sents_label]

In [77]:
# Widen column display so captions are shown up to 150 characters.
pd.set_option("display.max_colwidth", 150)

# sample_result holds one list per field; transpose so each field becomes a column.
df_result = pd.DataFrame(sample_result).transpose()

In [78]:
# Name the columns in the order the fields were placed in sample_result.
df_result.columns = ['id', 'inference', 'label']

In [79]:

# Display predicted vs. reference captions for the sampled batch.
df_result

Unnamed: 0,id,inference,label
0,video7869,a man is showing how to use a rubicks cube,french man is adjusting a phone situated on a selfie stick
1,video7342,a hamster is playing with a hamster,the black mice is trying to exit from his cage
2,video8797,a man is playing a video game,a person shoots an arrow in a video game
3,video7709,a man is cutting a piece of food,a men is cooking food he is adding some ingredients in untensil which is kept on fire
4,video9905,a minecraft character is running,a minecraft demonstration video
5,video9903,a man is playing a video game,the cat jump on the dining table and spill all the food items
6,video8143,a man is playing a game,a girl is performing a gymnastics routine
7,video7505,a man is swimming in the water,a surfer describing what was dragging him under water
8,video8552,a man is talking about a woman s face,a man pouring bottles of water on himself
9,video7416,a man is playing a video game,a game of soccer


In [235]:
# Show every ground-truth caption recorded for one video.
is_target_video = gt_dataframe['video_id'] == 'video7764'
gt_dataframe.loc[is_target_video]

Unnamed: 0,caption,sen_id,video_id
153760,special effects make it look like a man s face...,153760,video7764
153761,there is a man standing with firing body,153761,video7764
153762,a man is starring on the screen,153762,video7764
153763,opening credits play to the television show tr...,153763,video7764
153764,black backgrounds with images of men s faces h...,153764,video7764
153765,a man s face with a worried look is consumed b...,153765,video7764
153766,a house with angled roofing is over a man s no...,153766,video7764
153767,a person is standing still and some fire are b...,153767,video7764
153768,the introduction video for a tv show is played,153768,video7764
153769,a mans face with fire across it and another ma...,153769,video7764
