In [1]:
import json
import os
import argparse
import torch
from torch import nn
from torch.autograd import Variable
from torch.utils.data import DataLoader
from models import EncoderRNN, DecoderRNN, S2VTAttModel, S2VTModel
from dataloader import VideoDataset
import misc.utils as utils
from misc.cocoeval import suppress_stdout_stderr, COCOScorer

from pandas.io.json import json_normalize

In [2]:
# Evaluation-time settings; these override the options saved at training time.
args = dict(
    recover_opt='data/save/opt_info.json',
    saved_model='data/save/model_50.pth',
    dump_json=1,
    results_path='results/',
    dump_path=0,
    gpu='0',
    batch_size=25,
    sample_max=1,
    temperature=1.0,
    beam_size=1,
)


In [3]:
# Load the options saved alongside the checkpoint; the context manager makes
# sure the file handle is closed once the JSON has been parsed.
with open(args["recover_opt"]) as opt_file:
    opt = json.load(opt_file)
# Evaluation-time settings take precedence over the recovered ones.
opt.update(args)
# Select the GPU(s) visible to this process. This only takes effect if it is
# set before CUDA is first initialised.
os.environ['CUDA_VISIBLE_DEVICES'] = opt["gpu"]

In [4]:
# Build the dataset from the merged options, requesting the "test" mode.
dataset = VideoDataset(opt, "test")

vocab size is  16860
number of train videos:  6501
number of val videos:  500
number of test videos:  2999
load feats from ['data/feats/resnet152']
max sequence length in data is 28


In [5]:
# Store the dataset's vocabulary size in the options.
opt["vocab_size"] = dataset.get_vocab_size()

In [6]:
# Store the dataset's `max_len` in the options as the sequence length.
opt["seq_length"] = dataset.max_len

In [7]:
# Show which model name the recovered options carry.
opt["model"]

'S2VTAttModel'

In [8]:
# Rebuild the encoder/decoder pair from the recovered options, then wrap them
# in the attention model and move it to the GPU.
encoder = EncoderRNN(
    opt["dim_vid"],
    opt["dim_hidden"],
    bidirectional=opt["bidirectional"],
    input_dropout_p=opt["input_dropout_p"],
    rnn_dropout_p=opt["rnn_dropout_p"],
)
decoder = DecoderRNN(
    opt["vocab_size"],
    opt["max_len"],
    opt["dim_hidden"],
    opt["dim_word"],
    input_dropout_p=opt["input_dropout_p"],
    rnn_dropout_p=opt["rnn_dropout_p"],
    bidirectional=opt["bidirectional"],
)
model = S2VTAttModel(encoder, decoder).cuda()

  "num_layers={}".format(dropout, num_layers))


In [9]:
# Restore the weights from the checkpoint path given in the options.
model.load_state_dict(torch.load(opt["saved_model"]))

<All keys matched successfully>

In [10]:
# Instantiate the language-model criterion from misc.utils.
# NOTE(review): `crit` is not referenced by any later cell shown here.
crit = utils.LanguageModelCriterion()



In [11]:
# Coerce the encoder RNN's `bidirectional` attribute to a real bool.
# NOTE(review): presumably needed because the option came back from JSON as
# an int rather than a bool — confirm against opt_info.json.
model.encoder.rnn.bidirectional = bool(model.encoder.rnn.bidirectional)

In [12]:
# Fetch the dataset's vocabulary; it is passed to utils.decode_sequence later.
vocab = dataset.get_vocab()

In [14]:
# Put the model in evaluation mode before running inference.
model.eval()

S2VTAttModel(
  (encoder): EncoderRNN(
    (vid2hid): Linear(in_features=2048, out_features=512, bias=True)
    (input_dropout): Dropout(p=0.2, inplace=False)
    (rnn): GRU(512, 512, batch_first=True, dropout=0.5)
  )
  (decoder): DecoderRNN(
    (input_dropout): Dropout(p=0.2, inplace=False)
    (embedding): Embedding(16860, 512)
    (attention): Attention(
      (linear1): Linear(in_features=1024, out_features=512, bias=True)
      (linear2): Linear(in_features=512, out_features=1, bias=False)
    )
    (rnn): GRU(1024, 512, batch_first=True, dropout=0.5)
    (out): Linear(in_features=512, out_features=16860, bias=True)
  )
)

In [15]:
# Batch the dataset using the configured batch size.
# NOTE(review): shuffle=True means the batch drawn from this loader differs
# between runs, so the recorded outputs are not reproducible — consider
# shuffle=False for evaluation.
loader = DataLoader(dataset, batch_size=opt["batch_size"], shuffle=True)

In [17]:
# Create the COCO scorer from misc.cocoeval.
scorer = COCOScorer()

init COCO-EVAL scorer


In [18]:
# Read the annotation file and flatten its 'sentences' records into a
# DataFrame. The context manager closes the file once it has been parsed.
# NOTE(review): `json_normalize` is imported from pandas.io.json at the top of
# the notebook; newer pandas exposes it as pandas.json_normalize — confirm
# against the installed version.
with open(opt["input_json"]) as annotations_file:
    gt_dataframe = json_normalize(json.load(annotations_file)['sentences'])

In [21]:
def convert_data_to_coco_scorer_format(data_frame):
    """Group captions by video id into per-video lists of caption records.

    Args:
        data_frame: table-like object exposing parallel ``"caption"`` and
            ``"video_id"`` columns (for example a pandas DataFrame).

    Returns:
        dict mapping each video id to a list of dicts with the keys
        ``image_id`` (the video id), ``cap_id`` (0-based position of the
        caption within that video's list) and ``caption``.
    """
    gts = {}
    for caption, video_id in zip(data_frame["caption"], data_frame["video_id"]):
        # The first caption seen for a video creates its (empty) list.
        entries = gts.setdefault(video_id, [])
        entries.append(
            {'image_id': video_id, 'cap_id': len(entries), 'caption': caption})
    return gts

In [22]:
# Group the ground-truth captions by video id.
gts = convert_data_to_coco_scorer_format(gt_dataframe)

In [27]:
# Sanity check: print the loader object's repr.
print(loader)

<torch.utils.data.dataloader.DataLoader object at 0x7f6d6ef8e908>


In [56]:
# Pull a single batch from the loader for inspection.
data = next(iter(loader))

In [57]:
# Shape of the label tensor in the sampled batch.
data['labels'].shape

torch.Size([25, 28])

In [58]:
# forward the model to get loss
fc_feats = data['fc_feats'].cuda()
labels = data['labels'].cuda()
masks = data['masks'].cuda()
video_ids = data['video_ids']

In [66]:
# Show the shape of the label tensor for this batch.
labels.shape

torch.Size([25, 28])

In [60]:
# forward the model to also get generated samples for each image
with torch.no_grad():
    seq_probs, seq_preds = model(
        fc_feats, mode='inference', opt=opt)



In [61]:
# Decode the predicted sequences into sentences using the vocabulary.
sents = utils.decode_sequence(vocab, seq_preds)

In [71]:
# Show the video ids of the sampled batch.
video_ids

['video8680',
 'video8334',
 'video9525',
 'video8070',
 'video7328',
 'video8052',
 'video8675',
 'video9433',
 'video7620',
 'video7963',
 'video7715',
 'video8565',
 'video8844',
 'video9121',
 'video7742',
 'video7974',
 'video9780',
 'video9555',
 'video7514',
 'video8163',
 'video8846',
 'video9756',
 'video7708',
 'video7153',
 'video8125']

In [69]:
# Show the decoded sentences for the sampled batch.
sents

['a woman is talking about a movie',
 'a cartoon character is flying',
 'a man is talking about a fish',
 'a man is driving a car in a video game',
 'a cartoon character is flying a sword',
 'a man is dancing',
 'a soccer player celebrates',
 'a woman is talking about a movie',
 'a woman is talking about a movie',
 'a man is working with a machine',
 'a woman is singing',
 'a man is talking about a video game',
 'a woman is dancing',
 'a man is jumping on a trampoline',
 'a man is showing how to use a computer',
 'a woman is talking about a movie',
 'a woman is talking about a movie',
 'a man is singing a song',
 'a man is talking about a video game',
 'a man is talking about a movie',
 'a man is swimming in a pool',
 'a woman is talking about a movie',
 'a cartoon of a boy and a girl are talking',
 'a man is playing basketball',
 'a man is talking about a video game']

In [82]:
# Look up the ground-truth rows whose video_id is 'video8125'.
# NOTE(review): the id is hard-coded; with a shuffled loader it is not
# guaranteed to be part of the batch sampled on a re-run.
gt_dataframe.loc[gt_dataframe.index[gt_dataframe['video_id'] == 'video8125']]

Unnamed: 0,caption,sen_id,video_id
119100,a drone is crashed in the bushes,119100,video8125
119101,a man crashed on a human sized paper airplane,119101,video8125
119102,a man crashes a personal aircraft,119102,video8125
119103,a man crashes into the ground,119103,video8125
119104,a man crashes when he flies a remote flyer,119104,video8125
119105,a man hanging from his flying drone,119105,video8125
119106,a man in a flying contraption crashes in a field,119106,video8125
119107,a man is crashing,119107,video8125
119108,a man is getting carried away by something,119108,video8125
119109,a man is landing with a radio,119109,video8125
