In [None]:
import base64
from collections import namedtuple
import io
from pathlib import Path
import random
import re

import pandas as pd
import numpy as np
from scipy import stats
import tensorflow as tf

from tqdm import tqdm_notebook as tqdm

from IPython.display import HTML

In [None]:
# please check if anyone is using the GPU first before running experiments
!nvidia-smi

In [None]:
# Check if GPU working with tensorflow
with tf.Session() as sess:
    devices = sess.list_devices()
devices

# Data Structure

In [None]:
# data structure
!tree -L 4 data/

# Key Information

From http://multimediaeval.org/mediaeval2018/memorability/index.html

## Schedule:
+ Data Release: 25 June 2018
+ Runs Due: 1 October 2018
+ Working Paper Notes Due: 17 October

## Task Description
+ Automatically predict memorability scores for videos, which reflect the probability of a video being remembered.
+ videos with memorability annotations, and pre-extracted state-of-the-art visual features
+ The ground truth has been collected through recognition tests
+ ‘short-term’ and ‘long-term’ memorability annotations
+ Optionally, descriptive titles attached to the videos may be used
+ allowed to use external data.

## Data
+ 10,000 short (soundless) videos extracted from raw footage used by professionals when creating content.

Pre extracted features
+ HoG descriptors
+ LBP
+ GIST
+ Color Histogram
+ Fc7 layer from Inception
+ C3D features
+ etc

## Evaluation
+ The outputs of the prediction models – i.e., the predicted memorability scores for the videos – will be compared with ground truth memorability scores using classic evaluation metrics (e.g., Spearman’s rank correlation).

# Examine Provided Features

In [None]:
dev_set = Path('data/raw/Memorability 2018/dev-set')
video_dir = dev_set/'sources'
inception_features = dev_set / 'features/InceptionV3/'

In [None]:
# readme <corrupted somehow>
# Path.read_text opens, reads and closes the file, so no handle is left open.
print((dev_set/'README.txt').read_text())

# Videos

In [None]:
# videos: embed one clip inline as a base64 data URI
video_n = 5

videos = sorted((dev_set / 'sources').iterdir())
# read_bytes opens the file read-only and closes it afterwards; the clip is
# only read here, so write access ('r+b') is not needed.
video = videos[video_n].read_bytes()
encoded = base64.b64encode(video)
HTML(data='''<video alt="test" controls width=300>
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii')))

# Captions

In [None]:
# video captions: one "<video file>\t<caption>" record per line
captions_file = dev_set / 'dev-set_video-captions.txt'
with captions_file.open('r') as f:
    # Split on the first tab only so a caption containing a tab stays intact,
    # and skip blank lines, which cannot be unpacked into two fields.
    video_captions = [line.split('\t', 1) for line in f if line.strip()]
video_captions = [[a, b.strip()] for a, b in video_captions]
video_captions = pd.DataFrame(video_captions, columns=['video_path', 'caption'])
# Series of captions indexed by video file name
video_captions = video_captions.set_index('video_path')['caption']
video_captions.head()

# Ground Truth

In [None]:
# load the ground-truth table and make its column names attribute-friendly
ground_truth_file = dev_set / 'ground-truth/ground-truth_dev-set.csv'
ground_truth = pd.read_csv(ground_truth_file)
# swap hyphens for underscores in every column name
ground_truth = ground_truth.rename(
    columns={name: name.replace('-', '_') for name in ground_truth.columns})
ground_truth.head()

# Inception Features

In [None]:
!ls "{inception_features}" | head -10

There seem to be three images per video

In [None]:
!cat "{inception_features}"/video10-0.txt

Set of key-value pairs, with index between 0-999

Values should sum up to 1

In [None]:
def parse_inception_feature(s):
    """Parse the text of one Inception feature file into a sparse dict.

    Parameters
    ----------
    s : str
        Whitespace-separated ``index:value`` tokens, e.g. ``"0:0.5 3:0.25"``.

    Returns
    -------
    dict
        Maps each integer index to its float value.  Empty or
        whitespace-only input gives an empty dict.

    Raises
    ------
    ValueError
        If a token is not of the form ``int:float``.
    """
    features = {}
    # str.split() with no argument splits on any run of whitespace, so
    # repeated spaces, tabs and newlines between tokens are all tolerated.
    for token in s.split():
        index, value = token.split(':')
        features[int(index)] = float(value)
    return features

In [None]:
# Text of the first feature file found; read_text closes the file after reading.
inception_feature_file = next(inception_features.iterdir()).read_text()
sample_inception_feature = parse_inception_feature(inception_feature_file)
sample_inception_feature

Let's assume that if a key is not present, then its value was 0

In [None]:
sum(sample_inception_feature.values())

Sums to *almost* 1

Some rounding errors present

Finally, we need a way to convert this to a 1000-vector

In [None]:
def expand_inception_feature(d):
    """Turn a sparse ``{index: value}`` dict into a dense vector of length 1000.

    Positions that do not appear as a key in ``d`` are left at 0.
    """
    dense = np.zeros(1000)
    if d:
        # scatter all values in a single indexed assignment
        positions, values = zip(*d.items())
        dense[list(positions)] = values
    return dense

In [None]:
sample_inception_feature2 = expand_inception_feature(sample_inception_feature)
sample_inception_feature2.sum()

In [None]:
# now combine two
def parse_and_expand_inception_feature(path):
    """Read one Inception feature file and return its dense feature vector.

    ``path`` must provide ``open('r')`` (e.g. a ``pathlib.Path``).  The text is
    parsed with ``parse_inception_feature`` and expanded with
    ``expand_inception_feature``.
    """
    # The context manager closes the file even if read() raises.
    with path.open('r') as feature_file:
        s = feature_file.read()
    feature = parse_inception_feature(s)
    return expand_inception_feature(feature)

# Dummy Base line

What evaluation metric would we get if we just reported the average memorability, or randomly shuffled them

In fact, the organisers weren't very clear about what the actual evaluation criteria would be, other than that it has to do with ranking...

In [None]:
# mean short-term and long-term memorability over the whole dev set
avg_short_term = ground_truth['short_term_memorability'].mean()
avg_long_term = ground_truth['long_term_memorability'].mean()
# label spelling fixed ("Avarage" -> "Average") so both lines read the same
print("Average short term: {:.4f}".format(avg_short_term))
print("Average long term:  {:.4f}".format(avg_long_term))

In [None]:
# Baseline: correlate uniformly random "predictions" with n sampled labels,
# repeated `trials` times.
n = 1000
trials = 100

short_term_scores = ground_truth['short_term_memorability']

dummy = []
for _ in range(trials):
    random_guess = np.random.rand(n)
    sampled_truth = short_term_scores.sample(n, replace=False).values
    # pearsonr returns a (correlation, p-value) pair
    dummy.append(stats.pearsonr(random_guess, sampled_truth))

dummy = pd.DataFrame(dummy, columns=['pearson', 'p-value'])
dummy.mean()

Our pearson score should be above 0, and consequently have a p-value lower than 0.48

Note that there is a high variation, and sometimes a trial gives a high Pearson correlation and a low p-value purely by chance.
See next 2 cells.

Taking an average over 100 trials seems to give an accurate result

In [None]:
# standard deviation

dummy.std()

In [None]:
# best value in trials

dummy.sort_values('pearson', ascending=False).iloc[0]

# First Attempt: Feed Forward Network from images

Strategy: Single dense layer, input image, output is memorability

Final output is logistic to force it to be probability

Loss function is square loss

Metric is 100 examples of 10 videos, and we order them and calculate pearson rank.  
We may need to implement this manually.

In [None]:
def parse_inception_fname(fname):
    """Return the integer between the last '-' and the following '.' in ``fname``.

    For example ``'video10-56.txt'`` gives ``56``.  ``fname`` may be a string or
    any object whose ``str()`` is the file name or path.
    """
    after_last_dash = str(fname).rpartition('-')[2]
    number_text = after_last_dash.partition('.')[0]
    return int(number_text)

In [None]:
# Per-video record keyed by video file name: the ground-truth columns plus
# the source path, the Inception features of one frame, and the caption.
dataset = ground_truth.set_index('video').to_dict(orient='index')
for video, data in tqdm(dataset.items()):
    data['source'] = str(video_dir / video)
    glob_string = '{}-*.txt'.format(video.split('.')[0])
    inception_files = list(inception_features.glob(glob_string))
    if not inception_files:
        raise FileNotFoundError(
            'no Inception feature file matching {!r} in {}'.format(
                glob_string, inception_features))
    # Pick the frame with the lowest numeric suffix.  A plain filename sort is
    # lexicographic ('112' sorts before '56'), so compare the parsed numbers.
    first_frame = min(inception_files, key=parse_inception_fname)
    data['inception_features'] = parse_and_expand_inception_feature(first_frame)
    data['description'] = video_captions.loc[video]

In [None]:
def choose_eval_status(train_ratio=0.5, val_ratio=0.25,
                       test_ratio=0.25):
    """Randomly draw one of 'train', 'val' or 'test'.

    Each split is drawn with the probability given by the matching ratio;
    the three ratios must sum to 1 (checked with an assertion).
    """
    split_names = ['train', 'val', 'test']
    split_probs = [train_ratio, val_ratio, test_ratio]
    assert np.isclose(sum(split_probs), 1)
    return np.random.choice(split_names, p=split_probs)

# one row per video (sorted by file name), each randomly assigned to a split
videos = sorted(dataset)
annotations = pd.DataFrame({'video': videos})
annotations['eval_status'] = [choose_eval_status() for _ in videos]
annotations.head()

In [None]:
# Fields of one example, in the order the generator emits them.
feature_names = [
    "short_term_memorability",
    "long_term_memorability",
    "inception_features",
    "description",
]
Feature = namedtuple("Feature", feature_names)


def tf_generator(eval_status, shuffle=True):
    """Build a zero-argument generator function over one split.

    ``eval_status`` selects the split ('train', 'val' or 'test').  Each call
    of the returned function yields one ``Feature`` per video of that split;
    when ``shuffle`` is true the video order is reshuffled in place at the
    start of every call.
    """
    assert eval_status in ['train', 'val', 'test']

    in_split = annotations['eval_status'] == eval_status
    videos = annotations['video'][in_split].values

    def f():
        if shuffle:
            random.shuffle(videos)
        for video in videos:
            record = dataset[video]
            # Feature's fields are declared from feature_names, so positional
            # construction in that order fills every field correctly.
            yield Feature(*(record[name] for name in feature_names))
    return f

In [None]:
def process_features(short_term_memorability, long_term_memorability,
                     inception_features, description):
    """Reduce a full example to the ``(label, input)`` pair the model uses.

    Only the short-term memorability and the Inception features are kept;
    the long-term score and the description are dropped.
    """
    label, model_input = short_term_memorability, inception_features
    return label, model_input

In [None]:
def create_data_init_op(eval_mode, batch_size, iterator, shuffle=True):
    """Return the op that points ``iterator`` at one split's batched data.

    The split is streamed from ``tf_generator(eval_mode, shuffle)``, reduced
    with ``process_features``, grouped into batches of ``batch_size`` and
    prefetched one batch ahead.
    """
    example_types = (tf.float32, tf.float32, tf.float32, tf.string)
    examples = tf.data.Dataset.from_generator(
        tf_generator(eval_mode, shuffle), output_types=example_types)
    batches = examples.map(process_features).batch(batch_size).prefetch(1)
    return iterator.make_initializer(batches)

In [None]:
def create_pipeline(batch_size):
    """Create one reinitializable iterator shared by the three splits.

    Returns
    -------
    tuple
        ``((memorability, inception_features),
        (init_train_op, init_val_op, init_test_op))``: the iterator's two
        output tensors, and one initializer op per split.  Running an
        initializer op points the iterator at the start of that split.
    """
    input_types = (tf.float32, tf.float32)
    input_shapes = (tf.TensorShape(None),
                    tf.TensorShape([None, tf.Dimension(1000)]))
    iterator = tf.data.Iterator.from_structure(input_types,
                                               input_shapes)
    memorability, inception_features = iterator.get_next()
    init_train_pipeline_op = create_data_init_op('train', batch_size,
                                                 iterator)
    # Only the training split is shuffled.  Validation and test are read in
    # annotation order so that collected predictions can be matched to their
    # videos; order does not affect the evaluation metrics themselves.
    init_val_pipeline_op = create_data_init_op('val', batch_size,
                                               iterator, shuffle=False)
    init_test_pipeline_op = create_data_init_op('test', batch_size,
                                                iterator, shuffle=False)
    return ((memorability, inception_features),
            (init_train_pipeline_op, init_val_pipeline_op,
             init_test_pipeline_op))

((memorability, inception_features),
 (train_pipeline, val_pipeline, test_pipeline)) = create_pipeline(32)

In [None]:
sess = tf.Session()

In [None]:
# Single linear unit.  The layer output has shape (batch, 1); take column 0 so
# predictions are (batch,) like the labels.  Without this the squared
# difference broadcasts (batch, 1) against (batch,) into a batch x batch
# matrix and the loss compares every prediction with every label.
dense = tf.layers.dense(inception_features, units=1)[:, 0]
loss = tf.losses.mean_squared_error(labels=memorability,
                                    predictions=dense)

# running mean of the batch losses, plus an op to reset its accumulators
mean_loss_op, mean_loss_update_op = tf.metrics.mean(loss, name='metric')
running_vars = tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES,
                                         scope="metric")
reset_metrics_op = tf.variables_initializer(var_list=running_vars)

optimizer = tf.train.GradientDescentOptimizer(1e-3)
train_op = optimizer.minimize(loss)

In [None]:
ground_truth['short_term_memorability'].std()

In [None]:
def run_until_exhausted(op):
    """Run ``op`` in ``sess`` repeatedly until the input iterator runs out."""
    while True:
        try:
            sess.run(op)
        except tf.errors.OutOfRangeError:
            break


sess.run(tf.global_variables_initializer())

# original val loss
sess.run(reset_metrics_op)
sess.run(val_pipeline)
run_until_exhausted(mean_loss_update_op)
print("Original validation loss")
print(sess.run(mean_loss_op))

# train for one hundred epochs
for i in range(100):
    sess.run(train_pipeline)
    run_until_exhausted(train_op)

# loss after training
sess.run(reset_metrics_op)
sess.run(val_pipeline)
run_until_exhausted(mean_loss_update_op)
print("Loss after one hundred epochs")
print(sess.run(mean_loss_op))

In [None]:
# collect the model output for every batch of the test split
sess.run(test_pipeline)
predictions = []
try:
    while True:
        predictions.append(sess.run(dense))
except tf.errors.OutOfRangeError:
    # the test iterator is exhausted: every batch has been collected
    pass

In [None]:
# Flatten the per-batch outputs into one flat list of Python floats.
# np.ravel accepts (batch, 1) arrays, (batch,) arrays and bare floats, so
# re-running this cell on an already-flat list leaves it unchanged.
predictions = [value for batch in predictions
               for value in np.ravel(batch).tolist()]

In [None]:
test_idx = annotations[annotations['eval_status']=='test'].index
# `annotations` is built from the sorted video names, so its row numbers do
# not necessarily match the row numbers of `ground_truth` (CSV order).  Look
# the labels up by video name instead, keeping the annotation order.
test_videos = annotations.loc[test_idx, 'video'].values
test_ground_truth = (ground_truth.set_index('video')
                     .loc[test_videos, 'short_term_memorability'])

In [None]:
# The task is scored with Spearman's rank correlation, so report that rather
# than Pearson's r.  Returns a (correlation, p-value) pair.
stats.spearmanr(test_ground_truth, predictions)

In [None]:
pd.DataFrame(data=[test_ground_truth.values, predictions],
             index=['ground truth', 'predictions']).T

We can do smarter things than this, need to read the literature first

# Resources

http://multimediaeval.org/mediaeval2018/memorability/index.html

MediaEval Working Notes 2018 - Google Doc

Recommended Papers:
1. Aditya Khosla, Akhil S Raju, Antonio Torralba, and Aude Oliva. 2015. [Understanding and predicting image memorability at a large scale](https://www.cv-foundation.org/openaccess/content_iccv_2015/papers/Khosla_Understanding_and_Predicting_ICCV_2015_paper.pdf). In Proc. IEEE Int. Conf. on Computer Vision (ICCV). 2390–2398.
2. Phillip Isola, Jianxiong Xiao, Devi Parikh, Antonio Torralba, and Aude Oliva. 2014. [What makes a photograph memorable?](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=6629991) IEEE Transactions on Pattern Analysis and Machine Intelligence 36, 7 (2014), 1469–1482.
3. Hammad Squalli-Houssaini, Ngoc Duong, Marquant Gwenaëlle, and Claire-Hélène Demarty. 2018. [Deep learning for predicting image memorability](https://hal.archives-ouvertes.fr/hal-01629297/document). In Proc. IEEE Int. Conf. on Audio, Speech and Language Processing (ICASSP).
4. Junwei Han, Changyuan Chen, Ling Shao, Xintao Hu, Jungong Han, and Tianming Liu. 2015. [Learning computational models of video memorability from fMRI brain imaging](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=6919270). IEEE transactions on cybernetics 45, 8 (2015), 1692–1703.
5. Sumit Shekhar, Dhruv Singal, Harvineet Singh, Manav Kedia, and Akhil Shetty. 2017. [Show and Recall: Learning What Makes Videos Memorable](http://openaccess.thecvf.com/content_ICCV_2017_workshops/papers/w40/Shekhar_Show_and_Recall_ICCV_2017_paper.pdf). In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. 2730–2739.
6. Romain Cohendet, Karthik Yadati, Ngoc K.Q. Duong and Claire-Hélène Demarty. 2018. [Annotating, Understanding, and Predicting Long-term Video Memorability](https://sci-hub.tw/10.1145/3206025.3206056). In Proceedings of the ACM International Conference on Multimedia Retrieval (ICMR).

# Post Evaluation Updates

Results were pretty poor compared to other entries, even other entries doing simple analysis.

According to the [winning paper](http://ceur-ws.org/Vol-2283/MediaEval_18_paper_31.pdf), we should have been able to achieve a Spearman correlation of 0.092 using just Inception features. However, we achieved -0.017.

I suspect that the logistic regression was a poor choice.

Let's try again, with a few more things

## Iteration 1: Improve code

Here we fix the softmax code I was using, to make sure it's working correctly.
Note that the labels are not 1 or 0 but probabilities. Hence we need to add 1-memorability as a second label column to ensure cross entropy works correctly.

Add an evaluate function for easier evaluation.

Also we forgot to use a softmax to generate predictions

Was calculating pearson r in some cases, instead of spearman correlation by mistake.

Also changed the Spearman correlation code to guarantee that predictions and labels come from the same rows. I don't think I was making this mistake, but I've changed the code to make sure it cannot happen.

In [None]:
# we will reuse the tensorflow variables created above, but create a new session
sess.close()
sess = tf.Session()

In [None]:
# reuse memorability, inception_features, train_pipeline, val_pipeline, test_pipeline variables
# note: tensorflow is probably overkill for this, but we can use the same template when we start analysing videos

# model: two logits per video, read as [not remembered, remembered]
dense = tf.layers.dense(inception_features, units=2)
# predicted memorability = softmax probability of column 1
preds = tf.nn.softmax(dense)[:, 1]
# Soft targets in the same column order as the logits: column 1 carries the
# memorability, column 0 its complement.  With the columns the other way
# round, `preds` would estimate 1 - memorability.
labels = tf.transpose(tf.convert_to_tensor([1 - memorability, memorability]))
# One cross-entropy value per example.  Summing over the batch first would
# make reduce_mean a no-op and the reported metric depend on batch size.
errors = tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels, logits=dense)
loss = tf.reduce_mean(errors)

# metrics: running mean of the per-example cross entropy
running_loss, running_loss_update = tf.metrics.mean(errors, name="metric")
running_vars = tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES, scope="metric")
reset_metrics_op = tf.variables_initializer(var_list=running_vars)

# optimizers
optimizer = tf.train.GradientDescentOptimizer(1e-3)
train_op = optimizer.minimize(loss)

In [None]:
sess.run(tf.global_variables_initializer())

examine variables from this model

In [None]:
sess.run(train_pipeline)

(sample_memorability,
 sample_labels,
 sample_prediction,
 sample_loss) = sess.run([memorability, labels, preds, loss])

# reset session for training
sess.close()
sess = tf.Session()

In [None]:
sample_memorability

In [None]:
sample_labels[:10]

In [None]:
sample_prediction[:10]

In [None]:
sample_loss

In [None]:
# method to evaluate model

def _evaluate_split(pipeline, preds, memorability, running_loss,
                    running_loss_update, reset_metrics_op):
    """Run one pass over ``pipeline``; return (mean loss, Spearman correlation)."""
    sess.run(reset_metrics_op)
    sess.run(pipeline)
    predictions, labels = [], []
    while True:
        try:
            # Fetch predictions and labels in the same run so each pair
            # comes from the same rows, whatever order the pipeline uses.
            _, batch_preds, batch_labels = sess.run(
                [running_loss_update, preds, memorability])
        except tf.errors.OutOfRangeError:
            break
        # np.append flattens, so (batch, 1) and (batch,) outputs both work
        predictions = np.append(predictions, batch_preds)
        labels = np.append(labels, batch_labels)
    mean_loss = sess.run(running_loss)
    results = pd.DataFrame([predictions, labels],
                           index=['pred', 'memorability']).T
    spearman = results.corr(method='spearman').iloc[0, 1]
    return mean_loss, spearman


def evaluate(preds, running_loss, running_loss_update, reset_metrics_op,
             memorability=memorability, train_pipeline=train_pipeline, test_pipeline=test_pipeline):
    """Evaluate the current model on the train and test splits.

    Uses the global session ``sess``.  For each split the running metric is
    reset, the split is consumed once, and the running loss and the Spearman
    correlation between ``preds`` and ``memorability`` are recorded.

    Returns
    -------
    pandas.DataFrame
        Rows ``xent`` (the running loss, whatever loss the caller's metric
        tracks) and ``spearman``; columns ``test`` and ``train``.
    """
    train_xent, train_spearman = _evaluate_split(
        train_pipeline, preds, memorability, running_loss,
        running_loss_update, reset_metrics_op)
    test_xent, test_spearman = _evaluate_split(
        test_pipeline, preds, memorability, running_loss,
        running_loss_update, reset_metrics_op)

    return pd.DataFrame({
        'xent': {'test': test_xent, 'train': train_xent},
        'spearman': {'test': test_spearman, 'train': train_spearman}
    }).T

In [None]:
sess.run(tf.global_variables_initializer())

loss before training

In [None]:
sess.run(test_pipeline)

evaluate(preds, running_loss, running_loss_update, reset_metrics_op)

In [None]:
# train for one epoch
for i in range(1):
    sess.run(train_pipeline)
    while True:
        try:
            sess.run(train_op)
        except tf.errors.OutOfRangeError:
            break

# validation loss after the first epoch
sess.run(reset_metrics_op)
sess.run(val_pipeline)
while True:
    try:
        sess.run(running_loss_update)
    except tf.errors.OutOfRangeError:
        break
# Read the metric now: evaluate() resets it, which would discard the
# validation result without it ever being looked at.
print("Validation loss after one epoch: {:.4f}".format(sess.run(running_loss)))

evaluate(preds, running_loss, running_loss_update, reset_metrics_op)

In [None]:
# 99 further epochs, bringing the total to 100
for _ in tqdm(range(99)):
    sess.run(train_pipeline)
    try:
        while True:
            sess.run(train_op)
    except tf.errors.OutOfRangeError:
        # end of this epoch's training data
        pass

evaluate(preds, running_loss, running_loss_update, reset_metrics_op)

Marginal improvement in loss, train and test loss essentially the same

In [None]:
# 900 further epochs, bringing the total to 1000
for _ in tqdm(range(900)):
    sess.run(train_pipeline)
    try:
        while True:
            sess.run(train_op)
    except tf.errors.OutOfRangeError:
        # end of this epoch's training data
        pass

evaluate(preds, running_loss, running_loss_update, reset_metrics_op)

So it seems we are not able to make the algorithm overfit

I should have spent more time here in retrospect, before moving on

Let's keep iterating until we can get this linear model to overfit

## Iteration 2: Try square loss instead of softmax

In the winning paper, they used LASSO L1 regularized regression.

Let's first use unregularised regression and demonstrate that it can overfit

In [None]:
# we will reuse the tensorflow variables created above, but create a new session
sess.close()
sess = tf.Session()

In [None]:
# reuse memorability, inception_features, train_pipeline, val_pipeline, test_pipeline variables
# note: tensorflow is probably overkill for this, but we can use the same template when we start analysing videos

# model: one linear unit squashed to (0, 1) by a sigmoid.
# Take column 0 of the (batch, 1) layer output so predictions are (batch,)
# like the labels; otherwise the squared difference broadcasts to a
# batch x batch matrix and every prediction is compared with every label.
dense = tf.layers.dense(inception_features, units=1)[:, 0]
preds = tf.nn.sigmoid(dense)
loss = tf.losses.mean_squared_error(labels=memorability, predictions=preds)

# metrics: running mean of the batch losses
running_loss, running_loss_update = tf.metrics.mean(loss, name="metric")
running_vars = tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES, scope="metric")
reset_metrics_op = tf.variables_initializer(var_list=running_vars)

# optimizers
optimizer = tf.train.GradientDescentOptimizer(1e-3)
train_op = optimizer.minimize(loss)

In [None]:
sess.run(tf.global_variables_initializer())

Metrics before training

In [None]:
sess.run(test_pipeline)

# loss before training
evaluate(preds, running_loss, running_loss_update, reset_metrics_op)

In [None]:
# train for one epoch
for i in range(1):
    sess.run(train_pipeline)
    while True:
        try:
            sess.run(train_op)
        except tf.errors.OutOfRangeError:
            break

# validation loss after the first epoch
sess.run(reset_metrics_op)
sess.run(val_pipeline)
while True:
    try:
        sess.run(running_loss_update)
    except tf.errors.OutOfRangeError:
        break
# Read the metric now: evaluate() resets it, which would discard the
# validation result without it ever being looked at.
print("Validation loss after one epoch: {:.4f}".format(sess.run(running_loss)))

evaluate(preds, running_loss, running_loss_update, reset_metrics_op)

Loss is only marginally smaller, spearman is still negative

In [None]:
# 99 further epochs, bringing the total to 100
for _ in tqdm(range(99)):
    sess.run(train_pipeline)
    try:
        while True:
            sess.run(train_op)
    except tf.errors.OutOfRangeError:
        # end of this epoch's training data
        pass

evaluate(preds, running_loss, running_loss_update, reset_metrics_op)

Loss has reduced significantly, and the Spearman correlation is steadily improving

Looks like we might finally be seeing some small overfitting

In [None]:
# 900 further epochs, bringing the total to 1,000
for _ in tqdm(range(900)):
    sess.run(train_pipeline)
    try:
        while True:
            sess.run(train_op)
    except tf.errors.OutOfRangeError:
        # end of this epoch's training data
        pass

evaluate(preds, running_loss, running_loss_update, reset_metrics_op)

Significant improvement in cross entropy, but no corresponding improvement in spearman correlation.
Seems to have some slight overfitting, which is positive, but it is too small to tell for certain.

Finally seem to be getting some reasonable results for spearman correlation.

**Todo**

This was pretty similar to one of our [submitted approaches](https://gitlab.insight-centre.org/owen.corrigan/memorability/blob/master/src/models/inception_model.py).

So why didn't the other one work? Will have to take a look at this

## Iteration 3: Z-score Label normalization

According to http://ceur-ws.org/Vol-2283/MediaEval_18_paper_31.pdf, they used z-score normalization when training regressors.

In [None]:
# mean and standard deviation of short-term memorability on the train split;
# the same two values are used to z-score the labels of every split

is_train = annotations['eval_status'] == 'train'
train_videos = annotations.loc[is_train, 'video'].values
train_memorability = (ground_truth.set_index('video')
                      .loc[train_videos, 'short_term_memorability'])
mean = train_memorability.mean()
std = train_memorability.std()

In [None]:
# we will reuse the tensorflow variables created above, but create a new session
sess.close()
sess = tf.Session()

In [None]:
# model: plain linear regression onto z-scored labels
linear_output = tf.layers.dense(inception_features, units=1)
# keep column 0 so predictions have one value per example
dense = linear_output[:, 0]
preds = dense
# z-score the labels with the train-split statistics
memorability_scaled = (memorability - mean) / std
loss = tf.losses.mean_squared_error(predictions=dense,
                                    labels=memorability_scaled)

# metrics: running mean of the batch losses and an op to reset it
running_loss, running_loss_update = tf.metrics.mean(loss, name="metric")
running_vars = tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES, scope="metric")
reset_metrics_op = tf.variables_initializer(var_list=running_vars)

# optimizers
optimizer = tf.train.GradientDescentOptimizer(1e-3)
train_op = optimizer.minimize(loss)

In [None]:
sess.run(tf.global_variables_initializer())

examine variables from this model

In [None]:
sess.run(test_pipeline)
sample_preds, sample_memorability = sess.run([preds, memorability_scaled])

In [None]:
sample_preds

In [None]:
sample_memorability

In [None]:
sess.run(test_pipeline)

# loss before training
evaluate(preds, running_loss, running_loss_update, reset_metrics_op,
         memorability=memorability_scaled)

Cross entropy / KL Divergence is 10 times higher before being trained.

Maybe this gives more opportunity to learn somehow?

In [None]:
# train for one epoch
for i in range(1):
    sess.run(train_pipeline)
    while True:
        try:
            sess.run(train_op)
        except tf.errors.OutOfRangeError:
            break

# validation loss after the first epoch
sess.run(reset_metrics_op)
sess.run(val_pipeline)
while True:
    try:
        sess.run(running_loss_update)
    except tf.errors.OutOfRangeError:
        break
# Read the metric now: evaluate() resets it, which would discard the
# validation result without it ever being looked at.
print("Validation loss after one epoch: {:.4f}".format(sess.run(running_loss)))

evaluate(preds, running_loss, running_loss_update, reset_metrics_op,
         memorability=memorability_scaled)

In [None]:
# train for 99 more epochs (100 in total)
for i in tqdm(range(99)):
    sess.run(train_pipeline)
    while True:
        try:
            sess.run(train_op)
        except tf.errors.OutOfRangeError:
            break

# Pass the z-scored labels, as the earlier evaluate() calls of this iteration
# do, so predictions are compared with the targets the model was trained on.
evaluate(preds, running_loss, running_loss_update, reset_metrics_op,
         memorability=memorability_scaled)

seeing small overfitting, plus good spearman results.

Good sign

In [None]:
# train for 900 more epochs (1,000 in total)
for i in tqdm(range(900)):
    sess.run(train_pipeline)
    while True:
        try:
            sess.run(train_op)
        except tf.errors.OutOfRangeError:
            break

# Pass the z-scored labels, as the earlier evaluate() calls of this iteration
# do, so predictions are compared with the targets the model was trained on.
evaluate(preds, running_loss, running_loss_update, reset_metrics_op,
         memorability=memorability_scaled)

Now we are starting to see some more overfitting

what is model predicting?

In [None]:
sess.run(test_pipeline)
(sample_preds,
 sample_memorability,
 sample_memorability_scaled) = sess.run([preds, memorability, memorability_scaled])

In [None]:
sample_preds

What are new scaled values?

In [None]:
sample_memorability_scaled

Notice that memorability may now be negative

## Iteration 4: Beta Distribution

[This comment](https://www.reddit.com/r/MachineLearning/comments/9tptih/d_what_loss_function_to_use_for_probability/e8yyy7d) had an interesting idea


## Iteration 5: Sampling