#  Captioning Demo

This notebook shows how to load and use the **Fully Convolutional Net** from **rtorchn**.

The network implementation can be found [here](https://github.com/TwentyBN/20bn-rtorchn/blob/master/src/main/python/rtorchn/core/networks/smtsmt.py).
    

When trained on something-something (version: 27/06/2017), this network achieves **~30% test accuracy**.

### Imports

In [387]:
import io
import base64
import glob
import os

import gzip
import json
import numpy as np
import torch
from IPython.display import HTML
from skvideo.io import FFmpegReader, ffprobe, vwrite
from torch.autograd import Variable

### Tools to deal with MPEG videos

In [435]:
def show_video(video_filenames):
    """
    Build an inline HTML player for one or more video files.

    Each file is read in full, base64-encoded and embedded as a
    ``data:video/mp4`` URI inside its own ``<video>`` tag, so the resulting
    HTML does not reference the files on disk.

    Parameters
    ----------
    video_filenames : filename, or list/tuple of filenames
        A single filename is treated as a one-element list.

    Returns
    -------
    IPython.display.HTML
        The concatenated ``<video>`` tags, in the order the files were given.
    """
    # Accept a single filename as well as a sequence of them.
    if not isinstance(video_filenames, (list, tuple)):
        video_filenames = [video_filenames]

    html_parts = []
    for filename in video_filenames:
        # Read-only binary mode: the file is never modified, so write access
        # must not be required. The context manager releases the handle.
        with io.open(filename, 'rb') as video_file:
            encoded = base64.b64encode(video_file.read())
        html_parts.append('''
        <video alt="test" width="640" height="480" controls>
        <source src="data:video/mp4;base64,{0}" type="video/mp4" />
        </video>
        '''.format(encoded.decode('ascii')))

    return HTML(data=''.join(html_parts))


def open_mpeg_video(fname, framerate, size):
    """
    Decode a video file through FFmpeg and return its frames as a numpy array.

    The duration is read from the ``ffprobe`` metadata and FFmpeg is asked for
    ``int(duration * framerate)`` frames at the requested frame rate and size.

    Parameters
    ----------
    fname : str
        Path of the video file to decode.
    framerate : number
        Output frame rate; formatted with ``%d`` for FFmpeg's ``-r`` option,
        so a fractional value is truncated there.
    size : sequence of two ints
        Output frame size, passed to FFmpeg's ``-s`` option as
        ``"<size[0]>x<size[1]>"``.

    Returns
    -------
    numpy.ndarray
        All decoded frames stacked along the first axis, in decoding order.
    """

    metadata = ffprobe(fname)
    duration = float(metadata['video']['@duration'])
    # Compute corresponding nb of frames
    nframes = int(duration * framerate)
    oargs = {
        "-r": "%d" % framerate,
        "-vframes": "%d" % nframes,
        "-s": "%dx%d" % (size[0], size[1])
    }
    # Open file
    reader = FFmpegReader(fname, inputdict={}, outputdict=oargs)
    try:
        # Get frames until there is no more
        frames = list(reader.nextFrame())
    finally:
        # Always release the FFmpeg subprocess and its pipes, even if
        # decoding fails part-way through.
        reader.close()
    # Return as a numpy array
    return np.array(frames)

### Mapping Class index <-> Class string

## A) Model Instantiation

All **rtorchn** models can be imported from **rtorchn.core.networks**; the captioning model used in this notebook is assembled from the **ptcap.model** modules:

In [436]:
from ptcap.model.captioners import EncoderDecoder
from ptcap.model.encoders import CNN3dLSTMEncoder
from ptcap.model.pretrained_encoders import FCEncoder, JesterEncoder, BIJesterEncoder
from ptcap.model.decoders import LSTMDecoder

# Captioning model: a BIJesterEncoder feeding an LSTMDecoder, built with
# gpus=[0] and moved to CUDA. The encoder is created with freeze=False; a
# "pretrained_path" entry for encoder_kwargs was supplied here previously and
# is currently disabled.
net = EncoderDecoder(
    encoder=BIJesterEncoder,
    decoder=LSTMDecoder,
    encoder_kwargs=dict(freeze=False),
    decoder_kwargs=dict(
        embedding_size=256,
        go_token=0,
        gpus=[0],
        hidden_size=1024,
        num_lstm_layers=1,
        vocab_size=2100,
    ),
    gpus=[0],
).cuda()

gpus: [0]
****************************************************************************************************
()
{'freeze': False}
()
{'gpus': [0], 'go_token': 0, 'hidden_size': 1024, 'vocab_size': 2100, 'embedding_size': 256, 'num_lstm_layers': 1}
&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&


### Crash test

In [437]:
# print('Default input size:', net.default_input_size())
# # Create a dummy video
# video = np.zeros(net.default_input_size(), "float32")
# # Get prediction
# pred = net.forward(Variable(torch.from_numpy(video)))
# print('Prediction shape:', pred.size())

### Total number of parameters

In [438]:
# Each parameter tensor contributes the product of its dimensions; summing
# those products gives the total number of scalar weights in the network.
per_tensor_counts = (np.prod(weights.size()) for weights in net.parameters())
num_params = sum(per_tensor_counts)
print('Total number of parameters: {}'.format(num_params))

Total number of parameters: 15979202


## B) Loading weights

Send me a message if you want these weights.

In [456]:
# Directory of the training run; the same `path` is later handed to the
# tokenizer to load its dictionaries.
path = '/home/farzaneh/PycharmProjects/pytorch-captioning/results/jester1024_cutoff_10_unfrozen_lbl/'
# os.path.join avoids the doubled separator that `path + '/model.best'`
# produced when `path` already ends with a slash.
checkpoint = torch.load(os.path.join(path, 'model.best'))
# The network weights are read from the checkpoint's "model" entry.
net.load_state_dict(checkpoint["model"])

## C) Load Tokenizer

In [457]:
from ptcap.data.tokenizer import Tokenizer

# Build the tokenizer used to decode predicted token indices into words.
# NOTE(review): user_maxlen=13 presumably caps the caption length; it matches
# the 13-token captions requested in `demo` — confirm against Tokenizer.
tokenizer = Tokenizer(user_maxlen=13)
# Load the tokenizer dictionaries from the same directory as the checkpoint.
tokenizer.load_dictionaries(path)

## C) Testing 

In [458]:
# Scratch file (relative to the working directory) that `demo` overwrites with
# the network's input clip so it can be displayed next to the original video.
TMP_VIDEO_FILENAME = 'tmp.mp4'

def unpreprocess(video, scale=64.):
    """
    Turn a preprocessed video batch back into displayable uint8 frames.

    Only the first element of the batch is used. Its leading axis is moved to
    the end (``transpose(1, 2, 3, 0)``), the values are multiplied by `scale`,
    clipped to ``[0, 255]`` and truncated to ``uint8``.

    Parameters
    ----------
    video : object exposing ``.data.numpy()``
        A batch held on the CPU whose first element is a 4-D array.
    scale : float, optional
        Factor applied before the cast. The default of 64 is the value that
        was previously hard-coded here.
        NOTE(review): presumably this must equal the scale given to the
        preprocessor — confirm.

    Returns
    -------
    numpy.ndarray
        ``uint8`` array with the first clip's axes reordered as described.
    """
    clip = video.data.numpy()[0]
    frames = scale * clip.transpose(1, 2, 3, 0)
    # Clip before casting: converting out-of-range floats to uint8 wraps
    # around (or is platform-dependent) and corrupts the displayed video.
    return np.clip(frames, 0, 255).astype('uint8')

def demo(net, preprocessor, filename, top_n=5):
    """
    Caption one video file, print the caption, and show the videos inline.

    The clip is decoded at 12 fps and 128x128, preprocessed, and pushed
    through `net` with an all-zero 13-token caption and teacher forcing
    disabled. The arg-max token at each position is decoded with the
    module-level `tokenizer` and printed. The network input is then written to
    `TMP_VIDEO_FILENAME` and shown alongside the original file.

    Parameters
    ----------
    net : captioning network, called as ``net.forward((video, caption), ...)``.
    preprocessor : callable applied to the decoded uint8 frames.
    filename : path of the video to caption.
    top_n : accepted for interface compatibility; not used by this function.

    Returns
    -------
    The HTML object produced by `show_video` for the two videos.
    """
    # Decode the file into a numpy array of frames, then preprocess it.
    raw_frames = open_mpeg_video(filename, 12, [128, 128])
    clip = preprocessor(raw_frames)

    # Add a batch axis and move the inputs to the GPU (inference only).
    batch = torch.from_numpy(clip[None])
    video = Variable(batch, volatile=True).cuda()
    empty_caption = Variable(torch.zeros([1, 13]), volatile=True).long().cuda()

    # Forward pass without teacher forcing.
    scores = net.forward((video, empty_caption), use_teacher_forcing=False)
    # NOTE(review): np.exp presumably maps log-probabilities to probabilities
    # — confirm against the decoder's output layer.
    probabilities = np.exp(scores.cpu().data.numpy())[0]

    # Most likely token at every caption position, decoded to words.
    token_ids = np.argmax(probabilities, axis=1)
    print(tokenizer.decode_caption(token_ids))

    # Save the network's input clip so it can be viewed next to the original.
    vwrite(TMP_VIDEO_FILENAME, unpreprocess(video.cpu()))
    return show_video([TMP_VIDEO_FILENAME, filename])

def path_generator(annotation_path, root_path):
    """
    Yield ``(video_path, label)`` pairs from a gzipped JSON annotation file.

    Parameters
    ----------
    annotation_path : str
        Path of a gzip-compressed JSON file containing a list of objects,
        each with a ``'file'`` and a ``'label'`` entry.
    root_path : str
        Directory that every ``'file'`` entry is joined onto.

    Returns
    -------
    generator of (str, label)
        One pair per annotation, in file order. The annotation file is read
        and its keys are extracted when this function is called; only the
        path joining is deferred to iteration.

    Raises
    ------
    KeyError
        If an annotation lacks the ``'file'`` or ``'label'`` entry.
    """
    # JSON text is UTF-8; decode explicitly instead of relying on the locale's
    # default encoding, which can mis-decode non-ASCII labels.
    with gzip.open(annotation_path, "rt", encoding="utf-8") as f:
        annotations = json.load(f)
    # Extract eagerly so malformed annotations fail here, at call time.
    entries = [(elem['file'], elem['label']) for elem in annotations]
    return ((os.path.join(root_path, f), label) for f, label in entries)

In [459]:
# Path generator
# Lazy iterator over (video path, label) pairs from the test annotation file.
path_gen = path_generator(
    annotation_path='/data/20bn-somethingsomething/json/test_20170929.json.gz',
    root_path='/data/20bn-somethingsomething/videos',
)
# Switch the network to evaluation mode; binding the result to `_` keeps the
# cell from echoing the returned object.
_ = net.eval()

#### Preprocessor

In [460]:
from rtorchn.data.preprocessing import default_evaluation_preprocesser

# Evaluation-time preprocessing applied to the decoded frames in `demo`.
# NOTE(review): [48, 96, 96] is presumably the clip size expected by the
# network and 64. a scaling constant (`unpreprocess` multiplies by 64.) —
# confirm against rtorchn's default_evaluation_preprocesser.
preprocessor = default_evaluation_preprocesser([48, 96, 96], 64.)

#### Ctrl + Enter

In [461]:
# Caption a clip and print its reference label first for comparison.
path_to_video = '/data/20bn-somethingsomething/videos/1280777/large_784c5d1a26c362a6d62e.mp4'
label = 'pretending to pick pencil up'
print('Label: {}\n{}'.format(label, '-' * 65))
demo(net, preprocessor, path_to_video)

Label: pretending to pick pencil up
-----------------------------------------------------------------
['PRETENDING', 'TO', 'PICK', 'A', 'PLASTIC', 'BOTTLE', 'UP', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>']


In [462]:
# Caption a clip and print its reference label first for comparison.
path_to_video = '/data/20bn-somethingsomething/videos/68388/large_70e5a95d0cdb621ecae9.mp4'
label = 'pouring water into cup'
print('Label: {}\n{}'.format(label, '-' * 65))
demo(net, preprocessor, path_to_video)

Label: pouring water into cup
-----------------------------------------------------------------
['POURING', 'WATER', 'INTO', 'GLASS', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>']


In [463]:
# Caption a clip and print its reference label first for comparison.
path_to_video = '/data/20bn-somethingsomething/videos/1152240/large_e5f44f4114aeb04e5b31.mp4'
label = 'moving a pencil and a pencil closer to each other'
print('Label: {}\n{}'.format(label, '-' * 65))
demo(net, preprocessor, path_to_video)

Label: moving a pencil and a pencil closer to each other
-----------------------------------------------------------------
['MOVING', 'A', 'PEN', 'AND', 'A', 'PEN', 'CLOSER', 'TO', 'EACH', 'OTHER', '<END>', '<END>', '<END>']


In [464]:
# Caption a clip and print its reference label first for comparison.
path_to_video = '/data/20bn-somethingsomething/videos/668717/large_eca1e613437ef0ba7aab.mp4'
label = 'taking the phone out of the box'
print('Label: {}\n{}'.format(label, '-' * 65))
demo(net, preprocessor, path_to_video)

Label: taking the phone out of the box
-----------------------------------------------------------------
['UNFOLDING', 'PAPER', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>']


In [465]:
# Echo the path of the clip that was captioned most recently.
path_to_video

'/data/20bn-somethingsomething/videos/668717/large_eca1e613437ef0ba7aab.mp4'

In [402]:
# Caption a clip and print its reference label first for comparison.
path_to_video = '/data/20bn-somethingsomething/videos/36742/large_123fc4c216d498b661ec.mp4'
label = 'piling books up'
print('Label: {}\n{}'.format(label, '-' * 65))
demo(net, preprocessor, path_to_video)

Label: piling books up
-----------------------------------------------------------------
['TWISTING', 'A', 'BOTTLE', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>']


In [403]:
# Caption a clip and print its reference label first for comparison.
path_to_video = '/data/20bn-somethingsomething/videos/43495/large_28270fd5892ec7024093.mp4'
label = 'piling books up'
print('Label: {}\n{}'.format(label, '-' * 65))
demo(net, preprocessor, path_to_video)

Label: piling books up
-----------------------------------------------------------------
['PULLING', 'TWO', 'ENDS', 'OF', 'A', 'RUBBER', 'SO', 'THAT', 'IT', 'GETS', 'STRETCHED', '<END>', '<END>']


In [472]:
# Pull the next (path, label) pair from the annotation generator and caption
# it; re-running this cell advances to the following clip.
path_to_video, label = next(path_gen)
print('Label: {}\n{}'.format(label, '-' * 65))
demo(net, preprocessor, path_to_video)

Label: pretending to pour Cereal  out of Box, but Box is empty
-----------------------------------------------------------------
['PUTTING', '<UNK>', 'INTO', 'PLASTIC', 'BASKET', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>']


## D) Feature Extraction

You can use the `extract_features` method to extract the activation values just before the output layers:

In [42]:
# Create a dummy video
video = np.zeros(net.default_input_size(), "float32")
# Get prediction
features = net.extract_features(Variable(torch.from_numpy(video)))
print('Prediction shape:', features.size())

AttributeError: 'EncoderDecoder' object has no attribute 'default_input_size'

## E) Finetuning

In [24]:
# Coming soon