#  Evaluation Demo

### Imports

In [88]:
import io
import base64
import glob
import os

import gzip
import json
import numpy as np
import torch
from IPython.display import HTML
from skvideo.io import FFmpegReader, ffprobe, vwrite
from torch.autograd import Variable
from ptcap.trainers import DataParallelWrapper
from ptcap.scores import ( caption_accuracy, first_token_accuracy, token_accuracy)
from ptcap.data.annotation_parser import JsonParser
from collections import OrderedDict
from collections import Counter, namedtuple



In [89]:
stop_words = ['A', 'AN', 'THE', '<END>']

def safe_div(x,y):
    if y == 0:
        return 0
    return x / y
def fscore(precision, recall, beta=1):
    numerator = (1.0 + (beta ** 2)) * precision * recall
    denominator = ((beta ** 2) * precision) + recall
    return {"fscore": safe_div(numerator, denominator)}
class LCS(object):
    """
    The main functionality of this class is to compute the LCS (Lowest Common
    Subsequence) between a caption and prediction. By default, it returns the
    precision and recall values calculated based on the LCS between a prediction
    and a caption.
    """
    def __init__(self, functions_list, tokenizer):
        """
        Initializes functions_list and tokenizer.
        Args:
        functions_list: A list of the functions that will be applied on the
        precision and recall values calculated based on the LCS between a
        prediction and a caption.
        """

        self.functions_list = functions_list
        self.scores_container = OrderedDict()
        self.scores_dict = OrderedDict()
        self.tokenizer = tokenizer

    def __call__(self, outputs):
        string_predictions = [self.tokenizer.get_string(str_pred.data.numpy())
                              for str_pred in outputs.predictions]
        return self.score_batch(string_predictions, outputs.string_captions)

    def collect_scores(self, batch_scores_dict, scores_dict):
        for metric, metric_value in scores_dict.items():
            if metric not in batch_scores_dict:
                batch_scores_dict[metric] = [metric_value]
            else:
                batch_scores_dict[metric].append(metric_value)
        return batch_scores_dict

    @classmethod
    def compute_lcs(cls, prediction, caption):
        num_rows = len(prediction)
        num_cols = len(caption)

        table = [[0] * (num_cols + 1) for _ in range(num_rows + 1)]
        for i in range(1, num_rows + 1):
            for j in range(1, num_cols + 1):
                if prediction[i - 1] == caption[j - 1]:
                    table[i][j] = table[i - 1][j - 1] + 1
                else:
                    table[i][j] = max(table[i][j - 1], table[i - 1][j])
        return table, table[num_rows][num_cols]

    def mean_scores(self, batch_scores_dict):
        for metric, metric_value in batch_scores_dict.items():
            batch_scores_dict[metric] = np.mean(metric_value)
        return batch_scores_dict

    def score_batch(self, predictions, captions):
        assert len(predictions) == len(captions)

        batch_scores_dict = OrderedDict()
        for count, (prediction, caption) in enumerate(zip(predictions,
                                                          captions)):
            scores_dict = self.score_sample(prediction.split(), caption.split())
            batch_scores_dict = self.collect_scores(batch_scores_dict,
                                                    scores_dict)

        batch_scores_dict = self.mean_scores(batch_scores_dict)
        return batch_scores_dict

    def score_sample(self, prediction, caption):
        scores_dict = OrderedDict()
        _, lcs_score = self.compute_lcs(prediction, caption)
        scores_dict["precision"] = safe_div(lcs_score, len(prediction))
        scores_dict["recall"] = safe_div(lcs_score, len(caption))

        for score_function in self.functions_list:
            scores_dict.update(score_function(scores_dict["precision"],
                                              scores_dict["recall"]))

        return scores_dict

### Tool to deal with mpeg videos

In [90]:
def show_video(video_filenames):
    """
    Tool to display videos inside the notebook.
    """
    
    if type(video_filenames) is not list:
        video_filenames = [video_filenames]
    
    html_code = ''
    for filename in video_filenames:
        video = io.open(filename, 'r+b').read()
        encoded = base64.b64encode(video)
        html_code += '''
        <video alt="test" width="640" height="480" controls>
        <source src="data:video/mp4;base64,{0}" type="video/mp4" />
        </video>
        '''.format(encoded.decode('ascii'))
        
    return HTML(data= html_code)


def open_mpeg_video(fname, framerate, size):
    """
    Open an mpeg video, and return it as a numpy array.
    """
    
    metadata = ffprobe(fname)
    duration = float(metadata['video']['@duration'])
    # Compute corresponding nb of frames
    nframes = int(duration * framerate)
    oargs = {
        "-r": "%d" % framerate,
        "-vframes": "%d" % nframes,
        "-s": "%dx%d" % (size[0], size[1])
    }
    # Open file
    reader = FFmpegReader(fname, inputdict={}, outputdict=oargs)
    video = []
    # Get frames until there is no more
    for frame in reader.nextFrame():
        video.append(frame)
    # Return as a numpy array
    return np.array(video)

## A) Model Instantiation

In [91]:
from ptcap.model.captioners import EncoderDecoder
from ptcap.model.encoders import CNN3dLSTMEncoder
from ptcap.model.pretrained_encoders import FCEncoder, JesterEncoder, BIJesterEncoder
from ptcap.model.decoders import LSTMDecoder, CoupledLSTMDecoder

#net = FullyConvolutionalNet(num_classes=178)jester1024_cutoff_300_ssssssss/

net = EncoderDecoder(
        encoder=BIJesterEncoder,
        decoder=CoupledLSTMDecoder,
        encoder_kwargs={"freeze": False},#, "pretrained_path": "/home/farzaneh/PycharmProjects/pretrained_nets/fully_conv_net_on_smtsmt_20170627/model.checkpoint"},
        decoder_kwargs={"embedding_size": 256, "hidden_size": 1024, "num_lstm_layers": 1, 
        "vocab_size": 2986, "num_step" :17}, 
        gpus=[0]).cuda()
net = DataParallelWrapper(net, device_ids=[0]).cuda(0)

gpus: [0]


## B) Loading weights

In [92]:
path = '/home/farzaneh/PycharmProjects/pytorch-captioning/results/clapnet_lbl_step2_lr0001_w1'
# path = '/home/farzaneh/PycharmProjects/pytorch-captioning/results/clapnet_captioning_only_f0.1'

checkpoint = torch.load(path + '/model.best')


net.load_state_dict(checkpoint["model"])


## C) Load Tokenizer

In [93]:
from ptcap.data.tokenizer import Tokenizer

USER_MAXLEN=17
tokenizer = Tokenizer(user_maxlen=USER_MAXLEN)
tokenizer.load_dictionaries(path)

Counter({'COFFEEMIX': 1,
         'RAJAH': 1,
         'BUBBLES': 1,
         'GARLIC': 1,
         'JOY': 1,
         'COVERING': 1,
         'TOTE': 1,
         'FORKS': 1,
         'CUPCAKE': 1,
         'PACK': 1,
         'RASAM': 1,
         'GEOMETRY': 1,
         'STRAP': 1,
         'COLORS': 1,
         'COLOURPENCILS': 1,
         'SHOT': 1,
         'FOLDING': 1,
         'CASE': 1,
         'ORANGES': 1,
         'SYRINGE': 1,
         'JEANS': 1,
         'SUPPLY': 1,
         'STEERING': 1,
         'CHEAPEL': 1,
         'HOLD': 1,
         'JACKET': 1,
         'LIPBALM': 1,
         'PEARLS': 1,
         'HANDLE': 1,
         'MIXI': 1,
         'WHITENER': 1,
         'LEGS': 1,
         'DOSA': 1,
         'SHOOTS': 1,
         'SKATE': 1,
         'TAIL': 1,
         'SERVING': 1,
         'JACK': 1,
         'PUSHPIN': 1,
         'FEVER': 1,
         'MACHINE': 1,
         'QUETIP': 1,
         'VOICE': 1,
         'PAMPER': 1,
         'SWAB': 1,
         'CAMPH

## D) Testing 

In [166]:
TMP_VIDEO_FILENAME = 'tmp.mp4'

def unpreprocess(video):
    video = video.data.numpy()[0]
    video = 64. * video.transpose(1, 2, 3, 0)
    return np.array(video, 'uint8')

def demo(net, preprocessor, filename, top_n=5):
    # Open mpeg file and get a numpy array
    video_uint8 = open_mpeg_video(filename, 12, [128, 128])
    # Preprocessing
    video = preprocessor(video_uint8)
    # Convert to torch variable
    video = Variable(torch.from_numpy(video[None]), volatile=True).cuda()
    empty_caption = Variable(torch.zeros([1, 1]), volatile=True).long().cuda()
    
    # Compute predictions
    pred, class_pred = net.forward((video, empty_caption), use_teacher_forcing=False)
    # Convert to numpy 
    pred = np.exp(pred.cpu().data.numpy())[0]
        
    pred_argmax = np.argmax(pred, axis=1)
    decoded_pred = tokenizer.decode_caption(pred_argmax)
    beautiful_caption = ''.join(str(e+" ") for e in decoded_pred if "<END>" not in e)
    print('__CAPTION__: {}'.format(beautiful_caption))
    
    actions = get_template(decoded_pred, templates, tokenizer)
    # print(actions)
    objects = get_object_tokens(decoded_pred, actions[0][0])

    
    extract_objects(objects)
    # Print class name with proba
    # Save input video in tmp file
    vwrite(TMP_VIDEO_FILENAME, unpreprocess(video.cpu()))
    return show_video([TMP_VIDEO_FILENAME, filename])


def path_generator(annotation_path, root_path):
    with gzip.open(annotation_path, "rt") as f:
        annotations = json.load(f)
    files = [elem['file'] for elem in annotations]
    labels = [elem['label'] for elem in annotations]
    placeholders = [elem['placeholders'] for elem in annotations]
    return ((os.path.join(root_path, f), label, p) for f,label,p in zip(files, labels, placeholders))

In [167]:
# Path generator
path_gen = path_generator('/data/20bn-somethingsomething/json/test_20170929.json.gz', 
                          '/data/20bn-somethingsomething/videos')
# Put the netwoark in evaluation mode
_ = net.eval()

#### Preprocessor

In [168]:
from rtorchn.data.preprocessing import default_evaluation_preprocesser

preprocessor = default_evaluation_preprocesser([48, 96, 96], 64.)

In [169]:
for i in range(5000):
    path_to_video, label, _ = next(path_gen)

## Longest Common Subsequence

In [184]:
def get_template(candidates, templates, tokenizer):
   
    lcs = LCS([fscore], tokenizer)
    
    max_templates = []
    #print("There are {} templates".format(len(templates)))

    candidates = [" ".join(candidates)]
    for candidate in candidates:
        
        max_lcs_template = ""
        max_lcs_value = -1
        for template in templates:
            lcs_value = compute_LCS(lcs, candidate, template, tokenizer)
            if lcs_value > max_lcs_value:
                max_lcs_template = template
                max_lcs_value = lcs_value
        max_templates.append((max_lcs_template, max_lcs_value))
#         print("Candidate: {}".format(candidate))
        print("MATCHED ACTION : {}".format(max_lcs_template))

    return max_templates


def compute_LCS(lcs, candidate, template, tokenizer):
    encoded_caption = Variable(
        torch.LongTensor([tokenizer.encode_caption(candidate)]))
    encoded_prediction = Variable(
        torch.LongTensor([tokenizer.encode_caption(template)]))
    score_attr = namedtuple("ScoresAttr", "string_captions captions predictions")
    in_tuple = score_attr([candidate], encoded_caption, encoded_prediction)
    lcs_output = lcs(in_tuple)
    return lcs_output['fscore']

def extract_objects(object_tokens_list):
    
    objects_list = []
    if len(object_tokens_list) == 0:
        return objects_list
    
    next_token_ind =  object_tokens_list[0][0]
    current_object = ""
    for  (ind, token) in object_tokens_list:
        if  next_token_ind == ind:
            current_object += token+" "
        else:
            objects_list.append(current_object+" ")
            current_object = token
            next_token_ind = ind
        next_token_ind += 1
        
        
    if len(current_object)>0:
        objects_list.append(current_object)
               
    print("PREDICTED OBJECTS: {}".format(objects_list))
    return objects_list



def get_object_tokens(caption, template):
    return [(i,token) for (i,token) in enumerate(caption) if token not in template and token not in stop_words]

In [185]:
annotations = JsonParser.open_annotation("/data/20bn-somethingsomething/json/train_20171031.json.gz")
templates = np.unique(annotations["template"]) # A list of templates
templates = [" ".join(tokenizer.tokenize(t)) for t in templates]


# get_objects(sentences1[0].split(), sentence1_templates[0].split())



In [186]:
list1 =  ['HOLDING', 'A', 'DECK', 'OF', 'CARDS', 'IN', 'FRONT', 'OF', 'SCISSORS', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>']
beautiful_caption = ''.join(str(e+" ") for e in list1 if "END" not in e)


In [187]:
beautiful_caption

'HOLDING A DECK OF CARDS IN FRONT OF SCISSORS '

In [188]:
for i in range(100):
    path_to_video, label, placeholders = next(path_gen)

    print('TARGET: {}\n'.format(label))
    demo(net, preprocessor, path_to_video)
    print('PLACEHOLDERS: {}'.format(placeholders))
    print('{}\n'.format('-'*65))
    

TARGET: poking Speaker so lightly that it doesn’t or almost doesn’t move

__CAPTION__: PRETENDING TO PUT A PAPER CLIP UNDERNEATH A BOX 
MATCHED ACTION : PRETENDING TO PUT SOMETHING UNDERNEATH SOMETHING
PREDICTED OBJECTS: ['PAPER CLIP  ', 'BOX']
PLACEHOLDERS: ['Speaker']
-----------------------------------------------------------------

TARGET: poking Bottle so that it falls over

__CAPTION__: POKING A STACK OF PLASTIC CUPS SO THE STACK COLLAPSES 
MATCHED ACTION : POKING A STACK OF SOMETHING SO THE STACK COLLAPSES
PREDICTED OBJECTS: ['PLASTIC CUPS ']
PLACEHOLDERS: ['Bottle']
-----------------------------------------------------------------

TARGET: showing that cube is inside plastic container

__CAPTION__: SHOWING THAT BOWL IS EMPTY 
MATCHED ACTION : SHOWING THAT SOMETHING IS EMPTY
PREDICTED OBJECTS: ['BOWL ']
PLACEHOLDERS: ['cube', 'plastic container']
-----------------------------------------------------------------

TARGET: pretending to take mobile from floor

__CAPTION__: TOUCHING

PLACEHOLDERS: ['stick']
-----------------------------------------------------------------

TARGET: holding onion in front of grater

__CAPTION__: PUTTING A CAN NEXT TO A BOTTLE 
MATCHED ACTION : PUTTING SOMETHING NEXT TO SOMETHING
PREDICTED OBJECTS: ['CAN  ', 'BOTTLE']
PLACEHOLDERS: ['onion', 'grater']
-----------------------------------------------------------------

TARGET: taking chalk out of box of chalk

__CAPTION__: TAKING A PENCIL OUT OF A BOX 
MATCHED ACTION : TAKING SOMETHING OUT OF SOMETHING
PREDICTED OBJECTS: ['PENCIL  ', 'BOX']
PLACEHOLDERS: ['chalk', 'box of chalk']
-----------------------------------------------------------------

TARGET: putting teddy bear that can’t roll onto a slanted surface, so it slides down

__CAPTION__: PUTTING A LOCK THAT CANT ROLL ONTO A SLANTED SURFACE SO IT 
MATCHED ACTION : PUTTING SOMETHING THAT CANT ROLL ONTO A SLANTED SURFACE SO IT SLIDES DOWN
PREDICTED OBJECTS: ['LOCK ']
PLACEHOLDERS: ['teddy bear']
---------------------------------------

__CAPTION__: TAKING A KNIFE FROM TOP OF THE WATER JUG 
MATCHED ACTION : SHOWING A PHOTO OF SOMETHING TO THE CAMERA
PREDICTED OBJECTS: ['TAKING  ', 'KNIFEFROM TOP  ', 'WATERJUG ']
PLACEHOLDERS: ['scissor', 'box']
-----------------------------------------------------------------

TARGET: lifting book with bat on it

__CAPTION__: PULLING A PIECE OF PAPER OUT OF THE BOOK 
MATCHED ACTION : PULLING SOMETHING OUT OF SOMETHING
PREDICTED OBJECTS: ['PIECE  ', 'PAPER ', 'BOOK']
PLACEHOLDERS: ['book', 'bat']
-----------------------------------------------------------------

TARGET: spinning deodrant that quickly stops spinning

__CAPTION__: SPINNING A BALL THAT QUICKLY STOPS SPINNING 
MATCHED ACTION : SPINNING SOMETHING THAT QUICKLY STOPS SPINNING
PREDICTED OBJECTS: ['BALL ']
PLACEHOLDERS: ['deodrant']
-----------------------------------------------------------------

TARGET: pretending to pick towel up

__CAPTION__: PICKING PILLOW UP 
MATCHED ACTION : PICKING SOMETHING UP
PREDICTED OBJECTS: ['PIL

PLACEHOLDERS: ['tablet strip']
-----------------------------------------------------------------

TARGET: taking one of many steel cups

__CAPTION__: TAKING A GLASS BOTTLE 
MATCHED ACTION : TAKING SOMETHING FROM SOMEWHERE
PREDICTED OBJECTS: ['GLASS BOTTLE ']
PLACEHOLDERS: ['one of many steel cups']
-----------------------------------------------------------------

TARGET: pouring vodka into cup

__CAPTION__: POURING WATER INTO CUP 
MATCHED ACTION : POURING SOMETHING INTO SOMETHING
PREDICTED OBJECTS: ['WATER  ', 'CUP']
PLACEHOLDERS: ['vodka', 'cup']
-----------------------------------------------------------------

TARGET: deodorant falling like a rock

__CAPTION__: A PEN FALLING LIKE A ROCK 
MATCHED ACTION : SOMETHING FALLING LIKE A ROCK
PREDICTED OBJECTS: ['PEN ']
PLACEHOLDERS: ['deodorant']
-----------------------------------------------------------------

TARGET: putting an apple onto a plate 

__CAPTION__: PUTTING ORANGE ONTO ORANGE BOWL 
MATCHED ACTION : PUTTING SOMETHING ONTO SOM