# Exploratory Data Analysis for C3D

Import libraries:

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

Some Matplotlib configuration:

In [2]:
# Global Matplotlib font settings; plt.rc stores them in the rc parameters
# used by subsequently drawn figures.
font = dict(family='DejaVu Sans', weight='bold', size=22)
plt.rc('font', **font)

In [15]:
import seaborn as sns

Add the parent directory to the module search path, then import the project's `config` module, which contains all the dataset paths:

In [3]:
import sys

In [4]:
# Make the parent directory (relative to the notebook's working directory)
# importable for the `import config` cell that follows.
# Guarded so that re-running this cell does not keep appending duplicates.
if '..' not in sys.path:
    sys.path.append('..')

In [5]:
import config

**C3D Features**

In [6]:
# Display the configured directory that holds the dev-set features.
config.DEV_FEATURES

'/datasets/dev-set/features'

In [7]:
# Display the feature families listed in the config; this notebook only looks at 'C3D'.
config.DEV_FEATURES_LIST

['C3D',
 'HMP',
 'InceptionV3',
 'LBP',
 'aesthetic_feat_dev-set_mean',
 'ColorHistogram',
 'HOG',
 'ORB']

In [8]:
# Display the directory the C3D feature files are read from below.
config.DEV_C3D_FEATURE

'/datasets/dev-set/features/C3D'

In [9]:
# Bare entry names (not full paths) found in the C3D feature directory.
# NOTE: os.listdir returns entries in arbitrary order, so the row order of
# everything built from this list is not guaranteed to be the same on every run/machine.
C3D_FILENAMES = os.listdir(config.DEV_C3D_FEATURE)

In [10]:
# Peek at the first few entries to check the naming scheme of the feature files.
C3D_FILENAMES[:5]

['video3094.txt',
 'video4963.txt',
 'video5381.txt',
 'video3388.txt',
 'video6587.txt']

In [11]:
def read_features(filename):
    """Parse one feature file into a dict of floats.

    The file content is split on any whitespace; the n-th token (1-based)
    is stored under the key 'C3D_<n>'.

    Args:
        filename: Path of the text file to read.

    Returns:
        dict mapping 'C3D_1', 'C3D_2', ... to the parsed float values.
        An empty file yields an empty dict.

    Raises:
        ValueError: If a token cannot be converted to float.
    """
    with open(filename) as handle:
        tokens = handle.read().split()

    parsed = {}
    for position, token in enumerate(tokens, start=1):
        parsed['C3D_{}'.format(position)] = float(token)
    return parsed

In [12]:
%%time

# Build one record per feature file: the parsed C3D values plus the name of
# the video they describe, so the rows can be joined with the ground truth.
features = []
for video in C3D_FILENAMES:
    video_features = read_features(os.path.join(config.DEV_C3D_FEATURE, video))
    # Replace only the trailing extension (splitext) instead of cutting at the
    # first '.txt' found anywhere in the name; the ground-truth table
    # identifies videos as '<stem>.webm'.
    video_features['video'] = os.path.splitext(video)[0] + '.webm'
    features.append(video_features)

CPU times: user 4.5 s, sys: 2.95 s, total: 7.45 s
Wall time: 56.4 s


In [13]:
# One row per feature file: the C3D_<n> value columns plus the `video` file name.
dataframe = pd.DataFrame(features)

In [14]:
# Inspect the first rows to sanity-check the parsed features.
dataframe.head()

Unnamed: 0,C3D_1,C3D_10,C3D_100,C3D_101,C3D_11,C3D_12,C3D_13,C3D_14,C3D_15,C3D_16,...,C3D_91,C3D_92,C3D_93,C3D_94,C3D_95,C3D_96,C3D_97,C3D_98,C3D_99,video
0,4.3e-05,5.9e-07,2.5e-05,5e-06,3e-06,1e-06,2e-05,0.000213,6e-08,3.9e-05,...,3e-06,2.9e-05,8.3e-07,1.2e-07,0.002093,8.6e-07,2e-08,6e-07,2.9e-05,video3094.webm
1,0.047568,0.00038589,0.019331,0.002406,0.000118,2.6e-05,0.004285,0.00183,8.987e-05,0.000675,...,0.000221,0.00366,0.00073874,4.22e-06,0.043334,0.00012497,3.171e-05,7.573e-05,0.010475,video4963.webm
2,0.144726,2.06e-05,0.000331,0.000445,6.4e-05,0.000282,0.000137,0.00232,8.5e-07,0.000201,...,0.000285,0.000191,2.389e-05,8e-07,0.001503,6.236e-05,3.28e-06,9.49e-06,1.9e-05,video5381.webm
3,0.221757,0.00025114,0.000719,0.005072,0.000382,0.007093,0.010733,0.012519,1.789e-05,0.000469,...,0.001697,0.001409,0.00014135,0.00026708,0.007757,9.373e-05,8.385e-05,0.00062378,8.3e-05,video3388.webm
4,0.000761,0.00163948,2.6e-05,0.000478,1.2e-05,3e-06,0.000834,0.068154,9.23e-06,2.7e-05,...,5e-06,9e-06,3.2e-07,5.04e-06,0.00022,3.13e-06,4.7e-07,6.9e-07,1.3e-05,video6587.webm


In [16]:
# True if any cell in the frame is missing. A missing value would appear here
# if, for example, the feature files did not all contain the same number of values.
dataframe.isnull().values.any()

False

**Ground truth**

In [17]:
# Load the dev-set ground-truth table from the CSV path given in the config.
dev_ground_truth = pd.read_csv(config.DEV_GROUNDTRUTH) 

In [18]:
# Inspect the first rows: one row per video with its memorability scores and annotation counts.
dev_ground_truth.head()

Unnamed: 0,video,short-term_memorability,nb_short-term_annotations,long-term_memorability,nb_long-term_annotations
0,video10.webm,0.95,34,0.9,10
1,video100.webm,0.951,33,0.889,9
2,video10000.webm,0.832,33,1.0,13
3,video10001.webm,0.865,33,0.727,11
4,video10002.webm,0.899,59,0.792,24


Merge the C3D features with the ground-truth scores on their shared `video` column:

In [19]:
# Attach the memorability scores to each feature row. The join key is named
# explicitly so the merge cannot silently start matching on any other column
# the two frames might come to share. Rows without a match on either side
# are dropped (pandas' default inner join).
dataframe = dataframe.merge(dev_ground_truth, on='video')

In [20]:
# Inspect the merged frame: feature columns followed by the ground-truth columns.
dataframe.head()

Unnamed: 0,C3D_1,C3D_10,C3D_100,C3D_101,C3D_11,C3D_12,C3D_13,C3D_14,C3D_15,C3D_16,...,C3D_95,C3D_96,C3D_97,C3D_98,C3D_99,video,short-term_memorability,nb_short-term_annotations,long-term_memorability,nb_long-term_annotations
0,4.3e-05,5.9e-07,2.5e-05,5e-06,3e-06,1e-06,2e-05,0.000213,6e-08,3.9e-05,...,0.002093,8.6e-07,2e-08,6e-07,2.9e-05,video3094.webm,0.955,37,0.917,12
1,0.047568,0.00038589,0.019331,0.002406,0.000118,2.6e-05,0.004285,0.00183,8.987e-05,0.000675,...,0.043334,0.00012497,3.171e-05,7.573e-05,0.010475,video4963.webm,0.746,33,0.7,10
2,0.144726,2.06e-05,0.000331,0.000445,6.4e-05,0.000282,0.000137,0.00232,8.5e-07,0.000201,...,0.001503,6.236e-05,3.28e-06,9.49e-06,1.9e-05,video5381.webm,0.956,34,0.75,12
3,0.221757,0.00025114,0.000719,0.005072,0.000382,0.007093,0.010733,0.012519,1.789e-05,0.000469,...,0.007757,9.373e-05,8.385e-05,0.00062378,8.3e-05,video3388.webm,0.82,37,0.818,11
4,0.000761,0.00163948,2.6e-05,0.000478,1.2e-05,3e-06,0.000834,0.068154,9.23e-06,2.7e-05,...,0.00022,3.13e-06,4.7e-07,6.9e-07,1.3e-05,video6587.webm,0.858,32,0.75,12


In [21]:
# Labels of every column whose name contains 'C3D' (the regex is unanchored,
# so it matches anywhere in the label).
C3D_COLS = dataframe.filter(regex='C3D').columns

In [22]:
from scipy.stats import spearmanr

In [23]:
# Spearman rank correlation of every C3D feature column against both
# memorability targets; one record (coefficients + p-values) per column.
short_term_target = dataframe['short-term_memorability']
long_term_target = dataframe['long-term_memorability']

correlations = []
for col in C3D_COLS:
    feature_values = dataframe[col]
    short_rho, short_p = spearmanr(feature_values, short_term_target)
    long_rho, long_p = spearmanr(feature_values, long_term_target)
    correlations.append(dict([
        ('column', col),
        ('short-term correlation coefficient', short_rho),
        ('short-term p-value', short_p),
        ('long-term correlation coefficient', long_rho),
        ('long-term p-value', long_p),
    ]))

In [27]:
# Ten features with the highest (signed) Spearman coefficient against
# short-term memorability; negative correlations rank last and are not shown.
pd.DataFrame(correlations)[
    ['column', 'short-term correlation coefficient', 'short-term p-value']
].sort_values(by='short-term correlation coefficient', ascending=False).head(10)

Unnamed: 0,column,short-term correlation coefficient,short-term p-value
0,C3D_1,0.191836,3.518843e-67
61,C3D_63,0.161048,1.264062e-47
29,C3D_34,0.159411,1.100879e-46
13,C3D_2,0.115026,5.670003e-25
6,C3D_13,0.109143,1.2457340000000001e-22
60,C3D_62,0.106035,1.917365e-21
100,C3D_99,0.097915,1.6713150000000001e-18
14,C3D_20,0.094951,1.730478e-17
56,C3D_59,0.085681,1.634866e-14
77,C3D_78,0.083836,5.885564e-14


In [28]:
# Ten features with the highest (signed) Spearman coefficient against
# long-term memorability; negative correlations rank last and are not shown.
pd.DataFrame(correlations)[
    ['column', 'long-term correlation coefficient', 'long-term p-value']
].sort_values(by='long-term correlation coefficient', ascending=False).head(10)

Unnamed: 0,column,long-term correlation coefficient,long-term p-value
0,C3D_1,0.101888,6.510083e-20
61,C3D_63,0.081693,2.517438e-13
29,C3D_34,0.079401,1.14412e-12
60,C3D_62,0.075129,1.718111e-11
56,C3D_59,0.068542,8.416292e-10
6,C3D_13,0.063464,1.33745e-08
13,C3D_2,0.061547,3.602865e-08
100,C3D_99,0.057021,3.335575e-07
63,C3D_65,0.052131,3.083902e-06
14,C3D_20,0.037099,0.0009039205
