# Evaluation of Results of "TUD-MMC at MediaEval 2016: Context of Experience task" by Wang & Liem

## Necessary imports

In [1]:
import pandas as pd
import numpy as np
import os.path
import xml.etree.ElementTree as ET

# Root folders of the development (training) split and the test split of the dataset.
train_path = "res/coe_dataset_icpr/dev_set/"
test_path = "res/coe_dataset_icpr/test_set/"

# Per-modality sub-folders; the feature extractors join them onto train_path / test_path.
audio_folder = "audio_descriptors/"
text_folder = "text_descriptors/"
vis_folder = "vis_descriptors/"
metadata_folder = "XML/"

# CSV file listing the training movies (read into df_base_train).
train_entries_path = "res/CoeTraining.csv"

## Datasets

Features are built in the manner described in the paper by Wang & Liem or, where Wang & Liem do not provide the necessary information, as described in "Right Inflight? A Dataset for Exploring the Automatic
Prediction of Movies Suitable for a Watching Situation" (https://mmsys2016.itec.aau.at/papers/MMSYS/a45-riegler.pdf).

This leads to the following set-up:

Metadata: (language, year published, genre, country, runtime and age rating) - from XML -- 1-Hot Encoding for all categorical values<br>
Text: tf-idf, used as is <br>
Audio: average over all frames (NaN treated as 0) - Mel-Frequency Cepstral Coefficients<br>
Visual: as is - Histogram of Oriented Gradients (HOG) gray, Color Moments, local binary patterns (LBP) and Gray Level Run Length Matrix

NOTE: Training data - one invalid entry (2_states, which is also in the test set) and one wrong file name (Moulin_Rouge!.mp4, should be Moulin_Rouge! --> fixed)

In [2]:
# Load the list of training movies, drop the known-invalid '2_States' entry and
# order the remaining entries alphabetically by file name with a fresh 0..n-1 index.
df_base_train = pd.read_csv(train_entries_path)
df_base_train = (
    df_base_train[df_base_train['file_name'] != '2_States']  # remove invalid entry
    .sort_values(by='file_name')
    .reset_index(drop=True)
)
# Labels as integers, row-aligned with df_base_train.
df_targets_train = df_base_train['goodforairplanes'].astype(int)
df_base_train.head(5)

Unnamed: 0,movie_name,file_name,goodforairplanes
0,A Fish Called Wanda,A_Fish_Called_Wanda,1
1,A Goofy Movie,A_Goofy_Movie,0
2,A Million Ways to Die in the West,A_Million_Ways_to_Die_in_the_West,1
3,A Single Man,A_Single_Man,1
4,American Gangster,American_Gangster,1


## Feature extractors
As the dataset was built in a manner that would already have been considered dirty in 2002, a lot of feature extraction work is needed.

In [None]:
def get_audio_features(file_name, use_train=True):
    """
        Load the audio descriptor csv of one movie and average it over all frames.

        file_name: movie identifier, i.e. the name of the csv file without extension
        use_train: read below train_path if True, otherwise below test_path

        returns 1x14 dataframe, with averaged Mel-Frequency Cepstral Coefficients + file_name
        (an empty dataframe with the same column labels if the csv file does not exist)
    """
    base_path = train_path if use_train else test_path
    file_path = os.path.join(base_path, audio_folder, file_name + ".csv")
    if not os.path.isfile(file_path):
        print(file_name, " does not exist!")
        # Integer labels 0..12, exactly like the frames built below. String labels
        # would turn into 13 additional all-NaN columns once the frames are concatenated.
        return pd.DataFrame(columns=list(range(13)) + ['file_name'])
    df_audio = pd.read_csv(file_path, header=None).T # transpose (columns are rows)
    df_audio = df_audio.fillna(0) # nan values are treated as 0
    df_audio = pd.DataFrame(df_audio.mean(axis=0)).T # average across all frames
    df_audio['file_name'] = file_name
    return df_audio

def get_all_audio_features(df, use_train=True):
    """
        Collect the averaged audio features of every movie listed in df['file_name'].

        returns nx14 dataframe (one row per movie for which a descriptor was found),
        re-indexed from 0
    """
    per_movie = [get_audio_features(name, use_train) for name in df['file_name']]
    stacked = pd.concat(per_movie)
    return stacked.reset_index(drop=True)

def get_all_text_features(df, use_train=True):
    """
        Read the tf-idf matrix of all movies from "tdf_idf_dev.csv".

        returns nx3284 dataframe, containing tf-idf features for all movies + file_name
        Known quirks of the file: several terms occur more than once, and the rows carry
        no movie identifier. The rows are assumed to be ordered alphabetically by file name
        (Live_Nude_Girls and Transformers__Age_of_Extinction were found switched by chance).
        NOTE(review): the same file name is used for the test set as well - confirm that
        this is intended.
    """
    root = train_path if use_train else test_path
    csv_path = os.path.join(root, text_folder, "tdf_idf_dev.csv")
    raw = pd.read_csv(csv_path)
    # the file is stored the wrong way round: transpose it, drop the empty rows and
    # put the original header back as column labels
    header = raw.columns
    df_txt = raw.T.dropna()
    df_txt.columns = header
    df_txt = df_txt.reset_index(drop=True)
    # no identifier in the file, so alphabetical order is assumed
    df_txt['file_name'] = sorted(df['file_name'])
    return df_txt.reset_index(drop=True)

def get_vis_features(file_name, use_train=True):
    """
        Load the visual descriptor csv of one movie as a single feature row.

        file_name: movie identifier, i.e. the name of the csv file without extension
        use_train: read below train_path if True, otherwise below test_path

        returns 1x1653 dataframe, with unspecified visual features + file_name
        (an empty dataframe with the same column labels if the csv file does not exist)
        we assume that every single value in the csv is one feature
        this may be wrong, as there are two rows and no documentation
    """
    base_path = train_path if use_train else test_path
    file_path = os.path.join(base_path, vis_folder, file_name + ".csv")
    if not os.path.isfile(file_path):
        print(file_name, " does not exist!")
        # Integer labels 0..1651, exactly like the frames built below. String labels
        # would turn into additional all-NaN columns once the frames are concatenated.
        return pd.DataFrame(columns=list(range(1652)) + ['file_name'])
    df_vis = pd.read_csv(file_path, header=None)
    # append the second row to the first one: each value is one feature (-> no aggregation)
    both_rows = pd.concat([df_vis.loc[0,:], df_vis.loc[1,:]])
    df_vis = pd.DataFrame(both_rows).reset_index(drop=True).T
    df_vis['file_name'] = file_name
    return df_vis

def get_all_vis_features(df, use_train=True):
    """
        Collect the visual features of every movie listed in df['file_name'].

        returns nx1653 dataframe (one row per movie for which a descriptor was found),
        re-indexed from 0
    """
    per_movie = [get_vis_features(name, use_train) for name in df['file_name']]
    stacked = pd.concat(per_movie)
    return stacked.reset_index(drop=True)

def get_meta_features(file_name, use_train=True):
    """
        Read the metadata of one movie from the attributes of the <movie> element of its XML file.

        returns 1x7 dataframe, with metadata features + file_name
        (an empty dataframe with these columns if the XML file does not exist)
        The categorical values are returned raw; one-hot encoding has to happen later.
    """
    root_dir = train_path if use_train else test_path
    xml_path = os.path.join(root_dir, metadata_folder, file_name + ".xml")
    if not os.path.isfile(xml_path):
        print(file_name, " does not exist!")
        return pd.DataFrame(columns=['country', 'genre', 'language', 'rated', 'runtime', 'year', 'file_name'])

    movie = ET.parse(xml_path).getroot().find('movie')
    record = {
        'language': [movie.get('language')],
        'year': [int(movie.get('year'))],
        'genre': [movie.get('genre')],
        'country': [movie.get('country')],
        # the last four characters of the runtime attribute are cut off before parsing the number
        'runtime': [int(movie.get('runtime')[:-4])],
        'rated': [movie.get('rated')],
    }

    df_meta = pd.DataFrame.from_dict(record)
    df_meta['file_name'] = file_name
    return df_meta

def get_all_meta_features(df, use_train=True):
    """
        Collect the metadata of every movie listed in df['file_name'] and one-hot encode
        the comma separated categorical fields (country, genre, language, rated).

        returns dataframe with one row per movie: the '<field>_<value>' indicator columns,
        followed by runtime, year and file_name
    """
    df_meta = pd.concat([get_meta_features(name, use_train) for name in df['file_name']])

    # (field, strip blanks before splitting) - 'rated' is split as is
    categorical_fields = (('country', True), ('genre', True), ('language', True), ('rated', False))

    parts = []
    for field, strip_blanks in categorical_fields:
        values = df_meta[field]
        if strip_blanks:
            values = values.str.replace(' ','')
        indicators = values.str.get_dummies(sep=',')
        indicators.columns = [field + '_' + x for x in indicators.columns]
        parts.append(indicators)

    parts.append(df_meta[['runtime', 'year', 'file_name']])
    return pd.concat(parts, axis=1).reset_index(drop=True)

In [None]:
# Averaged audio features of all training movies (use_train defaults to True).
df_audio_train = get_all_audio_features(df_base_train)
df_audio_train.head(5)

In [None]:
# tf-idf features of all training movies (use_train defaults to True).
df_txt_train = get_all_text_features(df_base_train)
df_txt_train.head(5)

In [None]:
# Visual features of all training movies (use_train defaults to True).
df_vis_train = get_all_vis_features(df_base_train)
df_vis_train.head(5)

In [None]:
# One-hot encoded metadata features of all training movies (use_train defaults to True).
df_meta_train = get_all_meta_features(df_base_train)
df_meta_train.head(5)

## Implementation fun

In [None]:
import random
def sample_gen(n, forbid):
    """
        Generator that yields n - len(forbid) integers, one random draw per step.

        NOTE(review): this looks like a lazy, dictionary-backed Fisher-Yates shuffle over
        range(n) in which the values of `forbid` are first moved to the tail so that they
        are never drawn - confirm. Presumably `forbid` has to hold distinct values from
        range(n); nothing in here checks that.

        n: size of the range to draw from
        forbid: sequence of values to exclude (needs len() and iteration)
    """
    # state: sparse "position -> value" map; a position that is not stored holds its own index
    state = dict()
    # track: reverse map "value -> position", only needed while the forbidden values are moved
    track = dict()
    for (i, o) in enumerate(forbid):
        # swap the i-th forbidden value o with whatever currently sits at position n-i-1
        x = track.get(o, o)
        t = state.get(n-i-1, n-i-1)
        state[x] = t
        track[t] = x
        state.pop(n-i-1, None)
        track.pop(o, None)
    del track
    # only the first n-len(forbid) positions are drawn from
    for remaining in range(n-len(forbid), 0, -1):
        i = random.randrange(remaining)
        yield state.get(i, i)
        # fill the drawn slot with the value of the last active position, then shrink the active part
        state[i] = state.get(remaining - 1, remaining - 1)
        state.pop(remaining - 1, None)

def randomSet(S,n=0):
    """
        Draw a random subset of S without replacement.

        S: sequence of candidates (anything np.random.choice accepts)
        n: upper bound for the subset size; 0 (default) or a value above len(S)
           means "up to len(S)"

        returns numpy array holding between 1 and n distinct elements of S
        raises ValueError if S is empty
    """
    available = len(S)
    if available == 0:
        raise ValueError("cannot draw a random subset from an empty collection")
    # Clamp the bound: asking np.random.choice for more elements than S holds
    # fails when replace=False.
    if n == 0 or n > available:
        n = available
    rand = random.randint(1, n)
    return np.random.choice(S, rand, replace=False)

def InconCheck(S,D,T):
    """
        Inconsistency rate of the feature subset S.

        Rows of D are grouped by their value pattern on the columns in S. Within every
        pattern the smaller of the two class counts (target == 0 vs. target == 1) is the
        number of inconsistent rows. The sum over all patterns is divided by the number
        of rows of D.

        S: list of column names of D
        D: dataframe with the features
        T: targets, assigned as a column of the selected features (a Series is aligned
           on the index of D)

        returns float in [0, 1]; 0.0 if D has no rows
    """
    if len(D) == 0:
        return 0.0

    cols = list(S)
    data = D[cols].copy()  # work on a copy, D itself is never modified
    data['target'] = T
    # one indicator column per class, so that a single groupby counts both classes
    data['target_no'] = (data['target'] == 0).astype(int)
    data['target_yes'] = (data['target'] == 1).astype(int)

    per_pattern = data.groupby(cols)[['target_no', 'target_yes']].sum()
    # the minority class of every pattern is what cannot be told apart by S
    incon_count = per_pattern.min(axis=1).sum()

    return float(incon_count) / len(D)

def lvf(MAX_TRIES, D, TARGET_COLUMN, gamma):
    """
        Randomised search for the smallest feature subsets that are still consistent
        (LVF-style filter).

        MAX_TRIES: number of random subsets to try
        D: dataframe with the candidate features (one column per feature)
        TARGET_COLUMN: targets, handed to InconCheck unchanged
        gamma: a subset is accepted if InconCheck(subset, D, TARGET_COLUMN) < gamma

        returns list of all accepted subsets (sorted lists of column names) of the
        smallest size found; empty if no subset was accepted
    """
    features = D.columns.values
    # the size bound counts features (columns), not rows
    C_best = len(features)
    M_sets = []

    for _ in range(MAX_TRIES):
        S = sorted(randomSet(features, C_best))
        C = len(S)
        # randomSet never returns more than C_best elements, so C <= C_best here
        if InconCheck(S, D, TARGET_COLUMN) >= gamma:
            continue
        if C < C_best:
            # strictly smaller consistent subset: restart the collection
            C_best = C
            M_sets = [S]
            print("")
            print("current best", S)
        elif C == C_best:
            M_sets.append(S)
            print("also current best", S)
    return M_sets

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB # put NOTE into paper - not sure if correct bayes
from sklearn.model_selection import StratifiedKFold # put NOTE into paper - better as common kfold sampling
from sklearn.base import clone
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from functools import reduce

CI_95_FACTOR = 1.96

class BaseClassifier:
    """
        Thin wrapper that pairs an estimator with a display name and the modality it is used for.
    """

    def __init__(self, clf, clf_name, modality):
        self.clf, self.clf_name, self.modality = clf, clf_name, modality

    def fit(self, df_features, df_targets, verbose=True):
        """
            Train a fresh copy of the wrapped estimator on the given features and targets.
        """
        # start from an unfitted clone, so nothing of a previous training survives
        self.clf = clone(self.clf)
        # fixed numpy seed before every training
        np.random.seed(32143421)
        if verbose:
            print(f"Starting training for classifier {self.clf_name} and modality {self.modality}")
        self.clf.fit(df_features, df_targets)

    def predict(self, df_features, verbose=True):
        """
            returns the predictions of the wrapped estimator for df_features
        """
        if verbose:
            print(f"Starting prediction for classifier {self.clf_name} and modality {self.modality}")
        predictions = self.clf.predict(df_features)
        return predictions
        
class ClassifierFactory:
    """
        Builds the list of wrapped classifiers that is evaluated for each modality.
        Every call creates new, unfitted estimators.
    """

    @staticmethod
    def _wrap(modality, estimators):
        # estimators: (estimator, display name) pairs, order is kept
        return [BaseClassifier(estimator, name, modality) for estimator, name in estimators]

    @staticmethod
    def get_metadata_classifiers():
        return ClassifierFactory._wrap('Metadata', [
            (KNeighborsClassifier(), 'k-Nearest neighbors'),
            (NearestCentroid(), 'Nearest mean classifier'),
            (DecisionTreeClassifier(), 'Decision tree'),
            (LogisticRegression(), 'Logistic regression'),
            (SVC(gamma='auto'), 'SVM (Gaussian Kernel)'),
            (BaggingClassifier(), 'Bagging'),
            (RandomForestClassifier(n_estimators=10), 'Random Forest'),
            (AdaBoostClassifier(), 'AdaBoost'),
            (GradientBoostingClassifier(), 'Gradient Boosting Tree'),
        ])

    @staticmethod
    def get_text_classifiers():
        return ClassifierFactory._wrap('Textual', [
            (GaussianNB(), 'Naive Bayes'),
            (KNeighborsClassifier(), 'k-Nearest neighbors'),
            (SVC(gamma='auto'), 'SVM (Gaussian Kernel)'),
        ])

    @staticmethod
    def get_visual_classifiers():
        return ClassifierFactory._wrap('Visual', [
            (KNeighborsClassifier(), 'k-Nearest neighbors'),
            (DecisionTreeClassifier(), 'Decision tree'),
            (LogisticRegression(), 'Logistic regression'),
            (SVC(gamma='auto'), 'SVM (Gaussian Kernel)'),
            (RandomForestClassifier(n_estimators=10), 'Random Forest'),
            (AdaBoostClassifier(), 'AdaBoost'),
            (GradientBoostingClassifier(), 'Gradient Boosting Tree'),
        ])

    @staticmethod
    def get_audio_classifiers():
        return ClassifierFactory._wrap('Audio', [
            (LogisticRegression(), 'Logistic regression'),
            (GradientBoostingClassifier(), 'Gradient Boosting Tree'),
        ])
                
class DataWrapper:
    """
        Holds the feature dataframes of the four modalities together with the targets.
    """

    def __init__(self, df_audio, df_vis, df_txt, df_meta, df_targets):
        self.df_audio = df_audio
        self.df_vis = df_vis
        self.df_txt = df_txt
        self.df_meta = df_meta
        self.df_targets = df_targets

    def generate_subspace(self):
        """
            Reduce every modality to its feature columns by removing 'file_name'.

            The frames handed to the constructor are not modified (drop returns new
            frames). Calling the method again is a no-op.
        """
        # TODO insert LVW functionality, sketch:
        #   N = len(df_meta)
        #   MAX_TRIES = 77*N^5   (a power is meant; in Python that is **, ^ is XOR)
        #   print(lvf(MAX_TRIES,df_meta, df_targets_train,5))

        # errors='ignore': the column is already gone when the method runs a second time
        self.df_audio = self.df_audio.drop('file_name', axis=1, errors='ignore')
        self.df_vis = self.df_vis.drop('file_name', axis=1, errors='ignore')
        self.df_txt = self.df_txt.drop('file_name', axis=1, errors='ignore')
        self.df_meta = self.df_meta.drop('file_name', axis=1, errors='ignore')
        
        
class OutputWrapper:
    """
        Plain container for the results of a cross-validation run:
        df_res_test    - per fold predictions on the held-out part
        df_score_test  - per fold scores
        df_score_stats - scores aggregated over all folds
        df_res_all     - per fold predictions on all entries
    """

    def __init__(self, df_res_test, df_score_test, df_score_stats, df_res_all):
        (self.df_res_test,
         self.df_score_test,
         self.df_score_stats,
         self.df_res_all) = (df_res_test, df_score_test, df_score_stats, df_res_all)
        
class Evaluator:
    """
        Cross-validates the classifiers of every enabled modality and scores them.

        data_wrapper: object providing df_audio, df_vis, df_txt, df_meta and df_targets
        use_audio / use_visual / use_text / use_meta: switch single modalities on or off
    """
    # TODO add clf stacking method

    _METRICS = ['precision', 'recall', 'f1']

    def __init__(self, data_wrapper, use_audio=True, use_visual=True, use_text=True, use_meta=True):
        self.data_wrapper = data_wrapper
        self.use_audio = use_audio
        self.use_visual = use_visual
        self.use_text = use_text
        self.use_meta = use_meta

    @staticmethod
    def _take(values, positions):
        # Positional selection: .iloc for pandas objects (plain [] would look the
        # positions up as index labels), ordinary indexing for everything else.
        if hasattr(values, 'iloc'):
            return values.iloc[positions]
        return values[positions]

    @staticmethod
    def _merge_fold(per_modality, fold):
        # Put the frames of one fold side by side; every modality brings its own
        # TARGET column, only the first one is kept.
        merged = pd.concat([folds[fold] for folds in per_modality], axis=1)
        return merged.loc[:, ~merged.columns.duplicated()]

    def cv_modality(self, df_features, df_targets, clfs, cv=10, verbose=True, predict_all = False):
        """
            Stratified k-fold cross-validation of the given classifiers on one modality.

            df_features: feature dataframe of the modality
            df_targets: targets, row-aligned with df_features
            clfs: wrapped classifiers (fit / predict / clf_name / modality)
            cv: number of folds
            predict_all: additionally predict all entries with every fold's model

            returns (list with one dataframe per fold holding TARGET and one prediction
            column per classifier for the held-out entries, list with one dataframe per
            fold for all entries - these only hold TARGET unless predict_all is set)
        """
        # No random_state: without shuffle=True it has no influence on the splits, and
        # current scikit-learn versions reject the combination with a ValueError.
        kf = StratifiedKFold(n_splits=cv)
        df_cvs = []
        df_all_cvs = []
        for fold, (train_index, test_index) in enumerate(kf.split(df_features, df_targets), start=1):
            if verbose:
                print(f"Performing CV fold {fold}..")
            X_train, X_test = df_features.iloc[train_index,:], df_features.iloc[test_index,:]
            y_train = self._take(df_targets, train_index)
            y_test = self._take(df_targets, test_index)

            df_res = pd.DataFrame(y_test)
            df_res.columns = ['TARGET']

            # init df for prediction of all entries
            df_res_all = pd.DataFrame(df_targets)
            df_res_all.columns = ['TARGET']

            for clf in clfs:
                column = clf.clf_name+"_"+clf.modality
                clf.fit(X_train, y_train, verbose)
                df_res[column] = clf.predict(X_test, verbose)

                if predict_all:
                    df_res_all[column] = clf.predict(df_features, verbose)

            df_cvs.append(df_res)
            df_all_cvs.append(df_res_all)
        return df_cvs, df_all_cvs

    def cv(self, cv=10, verbose=True, predict_all=False):
        """
            Cross-validate all enabled modalities and evaluate the predictions.

            set predict_all to True to also include predictions for all data

            returns OutputWrapper(per fold predictions, per fold scores,
            scores aggregated over the folds, per fold predictions for all data)
            raises ValueError if every modality is switched off
        """
        if not (self.use_audio or self.use_visual or self.use_text or self.use_meta):
            raise ValueError("at least one modality has to be enabled")

        df_targets = self.data_wrapper.df_targets
        # (enabled, attribute of the data wrapper, factory for the classifiers)
        modalities = (
            (self.use_audio, 'df_audio', ClassifierFactory.get_audio_classifiers),
            (self.use_visual, 'df_vis', ClassifierFactory.get_visual_classifiers),
            (self.use_text, 'df_txt', ClassifierFactory.get_text_classifiers),
            (self.use_meta, 'df_meta', ClassifierFactory.get_metadata_classifiers),
        )

        df_cvs = []
        df_all_cvs = []
        for enabled, attribute, make_classifiers in modalities:
            if not enabled:
                continue
            df_test, df_all = self.cv_modality(
                getattr(self.data_wrapper, attribute),
                df_targets,
                make_classifiers(),
                cv,
                verbose,
                predict_all)
            df_cvs.append(df_test)
            df_all_cvs.append(df_all)

        n_folds = len(df_cvs[0])
        df_c = [self._merge_fold(df_cvs, fold) for fold in range(n_folds)]

        df_all_c = []
        if predict_all:
            df_all_c = [self._merge_fold(df_all_cvs, fold) for fold in range(n_folds)]

        df_res = self.evaluate(df_c)

        return OutputWrapper(df_c, df_res, self.overall_eval(df_res), df_all_c)

    def evaluate(self, df_res):
        """
        returns precision, recall and F1 in a DF
        returns list of DFs if df_res is list of DFs
        """
        if isinstance(df_res, list):
            return [self.evaluate_single(df_x) for df_x in df_res]
        return self.evaluate_single(df_res)

    def evaluate_single(self, df_res):
        """
            Score every prediction column of df_res against its TARGET column.

            returns dataframe with the columns clf, precision, recall, f1
            (one row per prediction column)
        """
        target = df_res['TARGET']
        # rows are collected first; DataFrame.append no longer exists in pandas 2
        rows = [
            {
                'clf': col,
                'precision': precision_score(target, df_res[col]),
                'recall': recall_score(target, df_res[col]),
                'f1': f1_score(target, df_res[col]),
            }
            for col in df_res.columns if col != 'TARGET'
        ]
        return pd.DataFrame(rows, columns=['clf', 'precision', 'recall', 'f1'])

    def overall_eval(self, df_results):
        """
            Aggregate the per fold scores.

            df_results: list with one score dataframe per fold (as built by evaluate_single)

            returns dataframe with clf, the mean, variance (divided by the number of folds)
            and standard deviation of precision / recall / f1 over the folds, and the
            bounds of the 95% confidence interval of each mean
        """
        metrics = self._METRICS
        n_folds = len(df_results)
        # only the numeric columns take part in the arithmetic
        per_fold = [df[metrics].astype(float) for df in df_results]

        df_score = pd.DataFrame({'clf': df_results[0]['clf']})

        # mean
        df_mean = reduce(lambda x, y: x.add(y, fill_value=0), per_fold).div(n_folds)
        for metric in metrics:
            df_score['mean_'+metric] = df_mean[metric]

        # var
        df_dev = [np.square(df.subtract(df_mean)) for df in per_fold]
        df_var = reduce(lambda x, y: x.add(y, fill_value=0), df_dev).div(n_folds)
        for metric in metrics:
            df_score['var_'+metric] = df_var[metric]

        # std
        for metric in metrics:
            df_score['std_'+metric] = np.sqrt(df_var[metric])

        # 95-ci: mean +- 1.96 * standard error, the standard error being std / sqrt(n)
        for metric in metrics:
            half_width = df_score['std_'+metric]*CI_95_FACTOR/np.sqrt(n_folds)
            df_score['l_95ci_'+metric] = df_score['mean_'+metric] - half_width
            df_score['u_95ci_'+metric] = df_score['mean_'+metric] + half_width

        return df_score

## Call Example

In [None]:
# Bundle the training features of all four modalities with the training targets.
dw = DataWrapper(df_audio_train, df_vis_train, df_txt_train, df_meta_train, df_targets_train)
# Removes the 'file_name' column from every modality frame.
dw.generate_subspace()
ev = Evaluator(dw) # TODO preprocessing for meta needs to be done
# Cross-validation with the default number of folds (cv=10); predict_all=True also
# collects the predictions for all entries in every fold.
ow = ev.cv(verbose=False, predict_all=True)

In [None]:
# Predictions of the first fold for its held-out entries.
ow.df_res_test[0]

In [None]:
# Precision / recall / f1 of every classifier in the first fold.
ow.df_score_test[0]        

In [None]:
# Scores aggregated over all folds.
ow.df_score_stats

In [None]:
# Predictions of the first fold's models for all entries.
ow.df_res_all[0]