# Evaluation of Results of "TUD-MMC at MediaEval 2016: Context of Experience task" by Wang & Liem

## Necessary imports

In [14]:
import pandas as pd
import numpy as np
import os.path
import xml.etree.ElementTree as ET
train_path = "res/coe_dataset_icpr/dev_set/"
test_path = "res/coe_dataset_icpr/test_set/"

audio_folder = "audio_descriptors/"
text_folder = "text_descriptors/"
vis_folder = "vis_descriptors/"
metadata_folder = "XML/"

train_entries_path = "res/CoeTraining.csv"
test_entries_path = "res/CoeTrainingTest.csv"

## Datasets

Features are built in the manner described in the paper of Wang & Liem or "Right Inflight? A Dataset for Exploring the Automatic
Prediction of Movies Suitable for a Watching Situation" (https://mmsys2016.itec.aau.at/papers/MMSYS/a45-riegler.pdf), if Wang & Liem do not provide any information.

This leads to following set-up:

Metadata: (language, year published, genre, country, runtime and age rating) - from XML<br>
Text: as is td-idf <br>
Audio: Averaged of all Frames (NaN to 0) - Mel-Frequency Cepstral Coefficients<br>
Visual: as is - Histogram of Oriented Gradients (HOG) gray, Color Moments, local binary patterns (LBP) and Gray Level Run Length Matrix

NOTE: Training data - invalid entry (2_states, also in test set), (Moulin_Rouge!.mp4, should be Moulin_Rouge! --> fixed)

In [15]:
df_base_train = pd.read_csv(train_entries_path)
df_base_train = df_base_train[df_base_train['file_name'] != '2_States'] # remove invalid entry
df_base_train.reset_index(inplace=True, drop=True)
df_base_train.sort_values(by='file_name', inplace=True)
df_base_train.head(5)
df_targets_train = df_base_train['goodforairplanes']

## Feature extractors
As the dataset was built in a manner that would have been considered dirty already in 2002 a lot of feature extraction is done

In [11]:
def get_audio_features(file_name, use_train=True):
    """
        returns 1x14 dataframe, with averaged Mel-Frequency Cepstral Coefficients + file_name
    """
    base_path = train_path if use_train else test_path
    file_path = os.path.join(base_path, audio_folder, file_name + ".csv")
    if not os.path.isfile(file_path):
        print(file_name, " does not exist!")
        return pd.DataFrame(columns=[str(x) for x in range(13)] + ['file_name'])
    df_audio = pd.read_csv(file_path, header=None).T # transpose (columns are rows)
    df_audio = df_audio.fillna(0) # nan values are treated as 0
    df_audio = pd.DataFrame(df_audio.mean(axis=0)).T # average accross columns
    df_audio['file_name'] = file_name
    return df_audio

def get_all_audio_features(df, use_train=True):
    """
        returns nx14 dataframe, containing audio features for all movies
    """
    dfs = []
    for file_name in df['file_name']:
        dfs.append(get_audio_features(file_name, use_train))
    
    return pd.concat(dfs)

def get_all_text_features(df, use_train=True):
    """
        returns nx3284 dataframe, containing tf-idf features for all movies
        the dataset creators messed up - contains several terms multiple times
        ordered alphabetically (?) - Live_Nude_Girls and Transformers__Age_of_Extinction where switched (detected perchance)
    """
    base_path = train_path if use_train else test_path
    file_path = os.path.join(base_path, text_folder, "tdf_idf_dev.csv") 
    df_txt = pd.read_csv(file_path)
    # the creators of the dataset missed how csv-files work - so we transpose and drop empty rows to get the correct format
    cols = df_txt.columns 
    df_txt = df_txt.T.dropna()
    df_txt.columns = cols
    df_txt.reset_index(inplace=True, drop=True)
    df_txt['file_name'] = sorted(df['file_name']) # we assume the info to be order alphabetically, as we do not have more info
    return df_txt

def get_vis_features(file_name, use_train=True):
    """
        returns 1x1653 dataframe, with unspecified visual features + file_name
        we assume that every single value in the csv is one feature
        this may be wrong, as there are two rows and no documentation (again)
    """
    base_path = train_path if use_train else test_path
    file_path = os.path.join(base_path, vis_folder, file_name + ".csv")
    if not os.path.isfile(file_path):
        print(file_name, " does not exist!")
        return pd.DataFrame(columns=[str(x) for x in range(1652)] + ['file_name'])
    df_vis = pd.read_csv(file_path, header=None)
    df_vis = pd.DataFrame(pd.concat([df_vis.loc[0,:], df_vis.loc[1,:]])).reset_index(drop=True).T # treat each value as single feature (-> no aggregation)
    df_vis['file_name'] = file_name
    return df_vis

def get_all_vis_features(df, use_train=True):
    """
        returns nx1653 dataframe, containing visual features for all movies
    """
    dfs = []
    for file_name in df['file_name']:
        dfs.append(get_vis_features(file_name, use_train))
    
    return pd.concat(dfs)

def get_meta_features(file_name, use_train=True):
    """
        returns 1x7 dataframe, with metadata features + file_name
        One Hot Encoding is not applied here, this should happen later
    """
    base_path = train_path if use_train else test_path
    file_path = os.path.join(base_path, metadata_folder, file_name + ".xml")
    if not os.path.isfile(file_path):
        print(file_name, " does not exist!")
        return pd.DataFrame(columns=['country', 'genre', 'language', 'rated', 'runtime', 'year', 'file_name'])
    etree = ET.parse(file_path)
    movie = etree.getroot().find('movie')
    mv = {}
    mv['language'] = [movie.get('language')]
    mv['year'] = [int(movie.get('year'))]
    mv['genre'] = [movie.get('genre')]
    mv['country'] = [movie.get('country')]
    mv['runtime'] = [int(movie.get('runtime')[:-4])]
    mv['rated'] = [movie.get('rated')]

    df_meta = pd.DataFrame.from_dict(mv)
    df_meta['file_name'] = file_name
    
    return df_meta

def get_all_meta_features(df, use_train=True):
    """
        returns nx7 dataframe, containing metadata features for all movies
    """
    dfs = []
    for file_name in df['file_name']:
        dfs.append(get_meta_features(file_name, use_train))
    
    return pd.concat(dfs)

In [16]:
df_audio_train = get_all_audio_features(df_base_train)
df_audio_train.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,file_name
0,66.828814,-5.674521,1.670346,1.143263,-0.635255,1.269376,0.633811,0.012407,0.164257,-1.494813,-0.292583,0.086875,-0.914582,A_Fish_Called_Wanda
0,43.908715,-6.380989,1.36179,-1.483354,-0.67004,-1.626537,0.466197,-1.888169,0.848654,-0.990286,0.673634,-0.972964,-0.141539,A_Goofy_Movie
0,3.390978,-6.725758,0.579762,-0.271885,-0.17564,-0.84569,-0.699064,-0.578434,0.537249,-1.387373,0.747223,-0.88758,-0.205273,A_Million_Ways_to_Die_in_the_West
0,57.743484,-3.722123,2.780418,0.756402,0.043743,-0.960622,-0.435575,-0.176729,1.665236,-2.068548,1.211791,-0.358194,0.738827,A_Single_Man
0,65.354709,-5.609515,-1.303409,-0.831993,-0.518848,-0.019373,-0.500203,-0.897985,0.148561,-0.666728,0.033135,0.383797,-0.209412,American_Gangster


In [17]:
df_txt_train = get_all_text_features(df_base_train)
df_txt_train.head(5)

Unnamed: 0,24000,baby,baseball,big,doc,escort,frozen,heroes,high,huck,...,york,yorks,young,young.1,younger,youngja,zebra,zellweger,zoologists,file_name
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,A_Fish_Called_Wanda
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,A_Goofy_Movie
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,A_Million_Ways_to_Die_in_the_West
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,A_Single_Man
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.051657,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,American_Gangster


In [6]:
df_vis_train = get_all_vis_features(df_base_train)
df_vis_train.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1643,1644,1645,1646,1647,1648,1649,1650,1651,file_name
0,0.43031,0.38101,0.34082,0.31642,0.41465,0.38599,0.32938,0.31212,0.35067,0.34246,...,362.83,8.5923,9.1427,8.4101,8.7924,1483.3,417.21,892.59,435.28,A_Fish_Called_Wanda
0,0.002031,0.0,0.0,0.07302,0.027533,0.005346,0.006015,0.11824,0.026991,0.005171,...,20278.0,0.97201,1.3654,1.8032,1.4634,168740.0,20896.0,34434.0,20967.0,A_Goofy_Movie
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,119950.0,1e-06,0.002466,4e-06,0.002466,729320.0,119950.0,230400.0,119950.0,A_Million_Ways_to_Die_in_the_West
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,9550.7,5.9433,6.0215,4.3839,5.4288,32242.0,10313.0,22308.0,9850.0,A_Single_Man
0,0.23164,0.28629,0.30068,0.28118,0.24489,0.27849,0.29076,0.2938,0.16104,0.16685,...,109.62,17.237,15.543,13.671,15.231,53559.0,8637.4,18597.0,8679.0,American_Gangster


In [7]:
df_meta_train = get_all_meta_features(df_base_train)
df_meta_train.head(5)

Unnamed: 0,language,year,genre,country,runtime,rated,file_name
0,"English, Italian, Russian",1988,"Comedy, Crime","USA, UK",108,R,A_Fish_Called_Wanda
0,English,1995,"Animation, Adventure, Comedy",USA,78,G,A_Goofy_Movie
0,"English, Navajo, Mandarin",2014,"Comedy, Western",USA,116,R,A_Million_Ways_to_Die_in_the_West
0,"English, Spanish",2009,Drama,USA,99,R,A_Single_Man
0,English,2007,"Biography, Crime, Drama","USA, UK",157,R,American_Gangster


In [23]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB # put NOTE into paper - not sure if correct bayes
from sklearn.model_selection import cross_validate


class BaseClassifier:
    
    def __init__(self, clf, df_features, df_targets, clf_name, modality):
        self.clf = clf
        self.df_features = df_features
        self.df_targets = df_targets
        self.clf_name = clf_name
        self.modality = modality
        self.scoring = {'f1': 'f1_micro',
                        'precision': 'precision_micro',
                        'recall': 'recall_micro'}
        
    def cv(self, cv= 10, verbose=True):
        np.random.seed(32143421)
        if verbose:
            print(f"Starting cross validation for classifier {self.clf_name} and modality {self.modality}")
        
        # TODO: this will return different results (CIs, Mean, variance,...)
        return cross_validate(self.clf, self.get_lvw_feature_df(), self.df_targets, cv=cv, scoring=self.scoring, return_train_score=False)
    
    def get_lvw_feature_df(self):
        # returns feature subset using Las Vegas Wrapper (TODO: logic + method to retrieve used features)
        df_sub = self.df_features.drop('file_name', axis=1) # never use file name
        return df_sub

    
class ClassifierFactory:
    
    @staticmethod
    def get_metadata_classifiers(df_features, df_targets):
        return [BaseClassifier(KNeighborsClassifier(), df_features, df_targets, 'k-Nearest neighbors', 'Metadata'),
                BaseClassifier(NearestCentroid(), df_features, df_targets, 'Nearest mean classifier', 'Metadata'),
                BaseClassifier(DecisionTreeClassifier(), df_features, df_targets, 'Decision tree', 'Metadata'),
                BaseClassifier(LogisticRegression(), df_features, df_targets, 'Logistic regression', 'Metadata'),
                BaseClassifier(SVC(gamma='auto'), df_features, df_targets, 'SVM (Gaussian Kernel)', 'Metadata'),
                BaseClassifier(BaggingClassifier(), df_features, df_targets, 'Bagging', 'Metadata'),
                BaseClassifier(RandomForestClassifier(n_estimators=10), df_features, df_targets, 'Random Forest', 'Metadata'),
                BaseClassifier(AdaBoostClassifier(), df_features, df_targets, 'AdaBoost', 'Metadata'),
                BaseClassifier(GradientBoostingClassifier(), df_features, df_targets, 'Gradient Boosting Tree', 'Metadata')]
    
    @staticmethod
    def get_text_classifiers(df_features, df_targets):
        return [BaseClassifier(GaussianNB(), df_features, df_targets, 'Naive Bayes', 'Textual'),
                BaseClassifier(KNeighborsClassifier(), df_features, df_targets, 'k-Nearest neighbors', 'Textual'),
                BaseClassifier(SVC(gamma='auto'), df_features, df_targets, 'SVM (Gaussian Kernel)', 'Textual')]
    
    @staticmethod
    def get_visual_classifiers(df_features, df_targets):
        return [BaseClassifier(KNeighborsClassifier(), df_features, df_targets, 'k-Nearest neighbors', 'Visual'),
                BaseClassifier(DecisionTreeClassifier(), df_features, df_targets, 'Decision tree', 'Visual'),
                BaseClassifier(LogisticRegression(), df_features, df_targets, 'Logistic regression', 'Visual'),
                BaseClassifier(SVC(gamma='auto'), df_features, df_targets, 'SVM (Gaussian Kernel)', 'Visual'),
                BaseClassifier(RandomForestClassifier(n_estimators=10), df_features, df_targets, 'Random Forest', 'Visual'),
                BaseClassifier(AdaBoostClassifier(), df_features, df_targets, 'AdaBoost', 'Visual'),
                BaseClassifier(GradientBoostingClassifier(), df_features, df_targets, 'Gradient Boosting Tree', 'Visual')]
    
    @staticmethod
    def get_audio_classifiers(df_features, df_targets):
        return [BaseClassifier(LogisticRegression(), df_features, df_targets, 'Logistic regression', 'Audio'),
                BaseClassifier(GradientBoostingClassifier(), df_features, df_targets, 'Gradient Boosting Tree', 'Audio')]                
                

In [24]:
for clf in ClassifierFactory.get_text_classifiers(df_txt_train, df_targets_train):
    print(clf.cv())

Starting cross validation for classifier Naive Bayes and modality Textual
{'fit_time': array([0.05270934, 0.00700045, 0.00799942, 0.00900102, 0.00900102,
       0.00699902, 0.00799966, 0.0079987 , 0.00699949, 0.00800085]), 'score_time': array([0.02099776, 0.00400043, 0.00299883, 0.00399876, 0.00400043,
       0.00400043, 0.006001  , 0.00400162, 0.00300002, 0.00399852]), 'test_f1': array([0.27272727, 0.45454545, 0.3       , 0.66666667, 0.44444444,
       0.44444444, 0.66666667, 0.55555556, 0.22222222, 0.66666667]), 'test_precision': array([0.27272727, 0.45454545, 0.3       , 0.66666667, 0.44444444,
       0.44444444, 0.66666667, 0.55555556, 0.22222222, 0.66666667]), 'test_recall': array([0.27272727, 0.45454545, 0.3       , 0.66666667, 0.44444444,
       0.44444444, 0.66666667, 0.55555556, 0.22222222, 0.66666667])}
Starting cross validation for classifier k-Nearest neighbors and modality Textual
{'fit_time': array([0.00700235, 0.00499797, 0.00500011, 0.00400043, 0.00500202,
       0.0059

In [79]:
def majorityVoting(resMat):
    resMat=resMat.astype(int)
    res=[]
    for x in range(0,len(resMat)):
        res.append(np.bincount(resMat[x]).argmax())
        
    return res
    
def labelStacking(resMat):
    resMat=resMat.astype(int)
    res=[]
    for x in range(0,len(resMat)):
        res.append(np.bincount(resMat[x]).argmax())
        
    return res
    
    
d = np.zeros((3,4))    
d[0,0]=1
d[0,1]=1
d[0,2]=1
d[0,3]=0
d[2,2]=1

print(majorityVoting(d))


[1, 0, 0]
