# Evaluation of Results of "TUD-MMC at MediaEval 2016: Context of Experience task" by Wang & Liem

## Necessary imports

In [1]:
import pandas as pd
import os.path

# Root folders of the development (training) split and the test split of the dataset.
train_path = "res/coe_dataset_icpr/dev_set/"
test_path = "res/coe_dataset_icpr/test_set/"

# Per-modality sub-folders; they are appended to one of the root folders above.
# The trailing slashes matter wherever paths are built by plain string concatenation.
audio_folder = "audio_descriptors/"
text_folder = "text_descriptors/"
vis_folder = "vis_descriptors/"  # presumably the visual descriptors - not used in the cells shown here
metadata_folder = "XML/"  # presumably the XML metadata - not used in the cells shown here

# CSV listing the training movies (movie_name, file_name, goodforairplanes).
train_entries_path = "res/CoeTraining.csv"

## Datasets

Features are built as described in the paper by Wang & Liem. Where Wang & Liem do not provide any information, we follow "Right Inflight? A Dataset for Exploring the Automatic
Prediction of Movies Suitable for a Watching Situation" (https://mmsys2016.itec.aau.at/papers/MMSYS/a45-riegler.pdf).

This leads to the following set-up:

Metadata: (language, year published, genre, country, runtime and age rating) - from XML<br>
Text: tf-idf, used as is<br>
Audio: Mel-Frequency Cepstral Coefficients, averaged over all frames (NaN treated as 0)<br>
Visual: as is - Histogram of Oriented Gradients (HOG) gray, Color Moments, local binary patterns (LBP) and Gray Level Run Length Matrix

NOTE: The training data contains an invalid entry (2_States, which also appears in the test set); it is removed below. The entry Moulin_Rouge!.mp4 should be Moulin_Rouge! (fixed).

In [19]:
# Load the list of training movies, drop the invalid entry and renumber the rows.
df_base_train = (
    pd.read_csv(train_entries_path)
    .loc[lambda entries: entries['file_name'] != '2_States']  # remove invalid entry
    .reset_index(drop=True)
)
df_base_train.head(5)

Unnamed: 0,movie_name,file_name,goodforairplanes
0,Seventh Son,Seventh_Son,1
1,Welcome to Me,Welcome_to_Me,0
2,The Judge,The_Judge,0
3,Transformers Age of Extinction,Transformers__Age_of_Extinction,0
4,The Normal Heart,The_Normal_Heart,1


In [28]:
def get_audio_features(file_name, use_train=True):
    """
        Load the audio descriptor of a single movie and average it over all frames.

        file_name: name of the movie; the descriptor is read from
                   <split folder>/<audio_folder>/<file_name>.csv
        use_train: read from train_path if True, otherwise from test_path

        returns 1x14 dataframe, with averaged Mel-Frequency Cepstral Coefficients + file_name
        (14 columns assumes the csv holds 13 coefficient rows).
        If the csv does not exist, a message is printed and an empty dataframe
        with the columns 0..12 + file_name is returned instead.
    """
    base_path = train_path if use_train else test_path
    file_path = os.path.join(base_path, audio_folder, file_name + ".csv")
    if not os.path.isfile(file_path):
        print(file_name, " does not exist!")
        # Use integer labels: real rows get integer column labels (0..12) from
        # read_csv(header=None) + transpose. String labels ('0'..'12') would not
        # match them, and pd.concat would create a second, NaN-filled set of columns.
        return pd.DataFrame(columns=list(range(13)) + ['file_name'])
    df_audio = pd.read_csv(file_path, header=None).T # transpose (columns are rows)
    df_audio = df_audio.fillna(0) # nan values are treated as 0
    df_audio = pd.DataFrame(df_audio.mean(axis=0)).T # average each coefficient over all frames
    df_audio['file_name'] = file_name
    return df_audio

def get_all_audio_features(df, use_train=True):
    """
        Collect the averaged audio features of every movie listed in df['file_name'].

        use_train is passed on to get_audio_features to select the split.
        returns nx14 dataframe, containing audio features for all movies
        (the per-movie frames are stacked as they are; the row index is not reset)
    """
    per_movie = [get_audio_features(name, use_train) for name in df['file_name']]
    return pd.concat(per_movie)

def get_all_text_features(df, use_train=True):
    """
        Load the tf-idf text features for all movies listed in df['file_name'].

        df: dataframe with a 'file_name' column; must contain exactly as many
            movies as the csv yields rows, otherwise the assignment of the
            file names fails
        use_train: read from train_path if True, otherwise from test_path

        returns nx3284 dataframe, containing tf-idf features for all movies
        (shape as observed for the training data) + file_name

        Known issues of the dataset:
        - the csv contains several terms multiple times
        - the rows seem to be ordered alphabetically by file name (?);
          Live_Nude_Girls and Transformers__Age_of_Extinction were found to be
          swapped (noticed by chance)
    """
    base_path = train_path if use_train else test_path
    # NOTE(review): only "tdf_idf_dev.csv" is confirmed by the original code.
    # The test file name is assumed to follow the same pattern - TODO confirm.
    # A wrong name fails loudly (FileNotFoundError) instead of silently
    # returning the dev-set features labelled with test file names.
    csv_name = "tdf_idf_dev.csv" if use_train else "tdf_idf_test.csv"
    df_txt = pd.read_csv(os.path.join(base_path, text_folder, csv_name))
    # the csv is not laid out as a regular table - so we transpose and drop
    # empty rows to get the correct format, then restore the term labels
    cols = df_txt.columns
    df_txt = df_txt.T.dropna()
    df_txt.columns = cols
    df_txt = df_txt.reset_index(drop=True)
    df_txt['file_name'] = sorted(df['file_name']) # we assume the rows to be ordered alphabetically, as we do not have more info
    return df_txt

In [4]:
# Build the averaged audio features for every training movie and preview the first rows.
df_audio_train = get_all_audio_features(df_base_train)
df_audio_train.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,file_name
0,67.562481,-4.5252,1.6463,-0.597742,1.36281,-1.361579,0.170381,-0.500409,-0.050339,-0.269793,-0.232489,-0.197553,0.13264,Seventh_Son
0,61.548885,-7.146781,-1.103407,-1.58802,0.249743,0.003055,-2.313552,0.371521,0.516853,-1.405396,-0.951247,1.316795,-0.095459,Welcome_to_Me
0,65.038918,-4.171344,-0.455058,-0.094249,-0.365649,-0.182148,0.700715,-0.195335,-0.204333,-1.281841,0.301911,-0.197494,0.861993,The_Judge
0,64.544291,-3.661545,-0.010532,-0.802876,-0.614974,-0.255984,0.810787,0.465672,0.271618,-0.691701,0.16124,0.310825,0.21462,Transformers__Age_of_Extinction
0,60.433903,0.148386,1.713255,-0.203955,-1.187262,-2.310341,-1.726492,-0.512949,0.270257,-0.098537,0.705479,-0.059131,0.247545,The_Normal_Heart


In [29]:
# Build the tf-idf text features for every training movie and preview the first rows.
df_txt_train = get_all_text_features(df_base_train)
df_txt_train.head(5)

Unnamed: 0,24000,baby,baseball,big,doc,escort,frozen,heroes,high,huck,...,york,yorks,young,young.1,younger,youngja,zebra,zellweger,zoologists,file_name
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,A_Fish_Called_Wanda
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,A_Goofy_Movie
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,A_Million_Ways_to_Die_in_the_West
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,A_Single_Man
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.051657,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,American_Gangster
