In [1]:
import json
import os
import argparse
import torch
from torch import nn
from torch.autograd import Variable
from torch.utils.data import DataLoader
from models import EncoderRNN, DecoderRNN, S2VTAttModel, S2VTModel
from dataloader import VideoDataset
import misc.utils as utils
from misc.cocoeval import suppress_stdout_stderr, COCOScorer
import numpy as np

from pandas.io.json import json_normalize
import pandas as pd

In [2]:
# Notebook-level settings. The next cell copies every key here over the
# options recovered from `recover_opt`, so these values take precedence.
args = dict(
    recover_opt='data/save/opt_info.json',
    saved_model='data/save/model_50.pth',
    dump_json=1,
    results_path='results/',
    dump_path=0,
    gpu='0',
    batch_size=25,
    sample_max=1,
    temperature=1.0,
    beam_size=1,
)


In [3]:
# Load the saved options JSON, then let the notebook-level `args` win on any
# key present in both.
with open(args["recover_opt"]) as opt_file:  # context manager closes the handle
    opt = json.load(opt_file)
opt.update(args)
# Limit which CUDA devices this process can see; environment values must be
# strings, which is why `gpu` is stored as '0' rather than 0.
os.environ['CUDA_VISIBLE_DEVICES'] = opt["gpu"]

In [4]:
# Build the dataset from the merged options; "test" is the mode argument
# (presumably selecting the test split — confirm in dataloader.VideoDataset).
dataset = VideoDataset(opt, "test")

vocab size is  16860
number of train videos:  6501
number of val videos:  500
number of test videos:  2999
load feats from ['data/feats/resnet152']
max sequence length in data is 28


In [5]:
# Record the dataset's vocabulary size in the options; the decoder
# constructor reads opt["vocab_size"].
opt["vocab_size"] = dataset.get_vocab_size()

In [6]:
# Store the dataset's max_len in the options under "seq_length".
opt["seq_length"] = dataset.max_len

In [7]:
# Show which model name the merged options specify.
opt["model"]

'S2VTAttModel'

In [8]:
# Encoder and decoder take the same dropout / directionality settings, so
# gather them once and hand them to both constructors.
rnn_settings = dict(
    bidirectional=opt["bidirectional"],
    input_dropout_p=opt["input_dropout_p"],
    rnn_dropout_p=opt["rnn_dropout_p"],
)
encoder = EncoderRNN(opt["dim_vid"], opt["dim_hidden"], **rnn_settings)
decoder = DecoderRNN(
    opt["vocab_size"], opt["max_len"], opt["dim_hidden"], opt["dim_word"],
    **rnn_settings)
# Combine the two halves and move the resulting model to the GPU.
model = S2VTAttModel(encoder, decoder).cuda()

  "num_layers={}".format(dropout, num_layers))


In [9]:
# Restore the saved weights from the checkpoint path in the options; the
# value returned by load_state_dict is what the cell displays.
# NOTE(review): torch.load unpickles the file, so only load checkpoints that
# come from a trusted source.
model.load_state_dict(torch.load(opt["saved_model"]))

<All keys matched successfully>

In [10]:
# Instantiate the language-model criterion from misc.utils. It is not
# referenced again in the cells visible here.
crit = utils.LanguageModelCriterion()



In [11]:
# Coerce the encoder RNN's `bidirectional` attribute to a plain bool
# (presumably it arrives as an int from the options JSON — TODO confirm).
model.encoder.rnn.bidirectional = bool(model.encoder.rnn.bidirectional)

In [12]:
# Fetch the vocabulary used later to turn label / predicted indices into
# words (it is queried with string keys via vocab.get(str(...))).
vocab = dataset.get_vocab()

In [13]:
# Put the model in evaluation mode before inference; the returned module's
# repr is what the cell displays.
model.eval()

S2VTAttModel(
  (encoder): EncoderRNN(
    (vid2hid): Linear(in_features=2048, out_features=512, bias=True)
    (input_dropout): Dropout(p=0.2, inplace=False)
    (rnn): GRU(512, 512, batch_first=True, dropout=0.5)
  )
  (decoder): DecoderRNN(
    (input_dropout): Dropout(p=0.2, inplace=False)
    (embedding): Embedding(16860, 512)
    (attention): Attention(
      (linear1): Linear(in_features=1024, out_features=512, bias=True)
      (linear2): Linear(in_features=512, out_features=1, bias=False)
    )
    (rnn): GRU(1024, 512, batch_first=True, dropout=0.5)
    (out): Linear(in_features=512, out_features=16860, bias=True)
  )
)

In [14]:
# Batched alternative, left disabled: loader = DataLoader(dataset, batch_size=opt["batch_size"], shuffle=True)

In [15]:
# Single-sample batches. shuffle=True makes the sample drawn from this loader
# depend on the RNG state, so it varies between runs unless a seed is set.
loader = DataLoader(dataset, batch_size=1, shuffle=True)

In [16]:
# Create the caption scorer from misc.cocoeval (it prints an init message on
# construction).
scorer = COCOScorer()

init COCO-EVAL scorer


In [17]:
# Read the annotation JSON named in the options and flatten its 'sentences'
# records into a DataFrame.
# NOTE(review): the `pandas.io.json.json_normalize` import path is deprecated
# since pandas 1.0 in favour of `pandas.json_normalize`; switch the import
# once the minimum pandas version allows it.
with open(opt["input_json"]) as input_json_file:  # closes the handle promptly
    gt_dataframe = json_normalize(json.load(input_json_file)['sentences'])

In [18]:
def convert_data_to_coco_scorer_format(data_frame):
    """Group captions by video id into per-video lists of caption records.

    Args:
        data_frame: Any object supporting ``data_frame["caption"]`` and
            ``data_frame["video_id"]`` that yield parallel iterables
            (for example a pandas DataFrame or a dict of lists).

    Returns:
        dict: Maps each video id to a list of dicts with keys ``'image_id'``
        (the video id), ``'cap_id'`` (0-based position of the caption within
        that video's list) and ``'caption'``. Videos appear in first-seen
        order and captions keep their row order.
    """
    gts = {}
    for caption, video_id in zip(data_frame["caption"], data_frame["video_id"]):
        # setdefault creates the list the first time an id is seen, so new and
        # already-known ids share a single append path.
        captions = gts.setdefault(video_id, [])
        captions.append(
            {'image_id': video_id, 'cap_id': len(captions), 'caption': caption})
    return gts

In [19]:
# Group the loaded captions by video id.
gts = convert_data_to_coco_scorer_format(gt_dataframe)

In [20]:
# Draw the first batch from a fresh iterator over the loader.
data = next(iter(loader))

In [21]:
# Inspect the shape of the drawn batch's label tensor.
data['labels'].shape

torch.Size([1, 28])

In [22]:
# Convert every row of label indices into a space-joined sentence: look each
# index up in the vocabulary (string keys) and drop the '<eos>' / '<sos>'
# marker tokens.
sents_label = [
    ' '.join(
        word
        for word in (vocab.get(str(token.data.tolist())) for token in label_row)
        if word not in ('<eos>', '<sos>')
    )
    for label_row in data['labels']
]

In [23]:
# Directory holding the per-video .npy feature files used for the demo.
feats_dir = 'data/feats/demo'

In [24]:
# Load the stored features for one video and add a leading axis of size 1.
# The single-array concatenate along axis 1 is kept from the original flow.
ix = 2211
video_feat = np.load(os.path.join(feats_dir, 'video%i.npy' % ix))
fc_feat = np.concatenate([video_feat], axis=1)[None, :, :]

In [25]:
# Convert the NumPy features to a float32 CPU tensor. Because `fc_feat` is
# rebound, this cell cannot be re-run without first re-running the load cell
# (from_numpy only accepts ndarrays).
fc_feat = torch.from_numpy(fc_feat).float()

In [26]:
# Check the feature tensor's shape.
fc_feat.shape

torch.Size([1, 40, 2048])

In [27]:
# Move the features to the GPU; the model was placed there with .cuda().
fc_feat = fc_feat.cuda()

In [28]:
# Run the model in 'inference' mode on the single video's features to get the
# predicted sequence; gradient tracking is switched off for this forward pass.
with torch.no_grad():
    seq_probs, seq_preds = model(
        fc_feat, mode='inference', opt=opt)



In [29]:
# Decode the predicted sequences into sentences using the vocabulary.
sents = utils.decode_sequence(vocab, seq_preds)

In [30]:
# Pair the video's identifier (a single string, despite the plural name) with
# its decoded caption list.
video_ids = 'video{}'.format(ix)
sample_result = [video_ids, sents]

In [31]:
# Widen pandas' column display to 150 characters, then lay the
# [id, captions] pair out as a single row.
pd.set_option("display.max_colwidth", 150)
df_result = pd.DataFrame(sample_result).transpose()

In [32]:
# Label the two columns: the video identifier and the model's caption output.
df_result.columns = ['id', 'inference']

In [33]:

# Display the result table.
df_result

Unnamed: 0,id,inference
0,video2211,[a man is talking about a movie]


In [34]:
# Show the annotated captions for the same video, for comparison with the
# generated one.
gt_dataframe.loc[gt_dataframe['video_id'] == video_ids]

Unnamed: 0,caption,video_id,sen_id
155800,a man in suit is seated and talking a teacher is teaching in a classroom and students are taking down notes along with their laptops,video2211,155800
155801,a man with black suit talking in front of the camera,video2211,155801
155802,a man is answering to the questions related the work,video2211,155802
155803,there is a women in pink interiew a well designated man,video2211,155803
155804,two people in suit dress talking each other very seriously,video2211,155804
155805,a man in a suit discusses work within the cyber security and software engineering fields,video2211,155805
155806,woman interview the man and employees working in the office,video2211,155806
155807,a bald man wearing a suit speaks while sitting in an armchair,video2211,155807
155808,a man looking and talking with black color,video2211,155808
155809,guy dressed in formal wear answering to another guy,video2211,155809
