# Evaluation of Results of "TUD-MMC at MediaEval 2016: Context of Experience task" by Wang & Liem

## Necessary imports

In [1]:
import pandas as pd
import numpy as np
import os.path
import xml.etree.ElementTree as ET

train_path = "res/coe_dataset_icpr/dev_set/"
test_path = "res/coe_dataset_icpr/test_set/"

audio_folder = "audio_descriptors/"
text_folder = "text_descriptors/"
vis_folder = "vis_descriptors/"
metadata_folder = "XML/"

train_entries_path = "res/CoeTraining.csv"

## Datasets

Features are built in the manner described in the paper of Wang & Liem or "Right Inflight? A Dataset for Exploring the Automatic
Prediction of Movies Suitable for a Watching Situation" (https://mmsys2016.itec.aau.at/papers/MMSYS/a45-riegler.pdf), if Wang & Liem do not provide any information.

This leads to following set-up:

Metadata: (language, year published, genre, country, runtime and age rating) - from XML -- 1-Hot Encoding for all categorical values<br>
Text: as is td-idf <br>
Audio: Averaged of all Frames (NaN to 0) - Mel-Frequency Cepstral Coefficients<br>
Visual: as is - Histogram of Oriented Gradients (HOG) gray, Color Moments, local binary patterns (LBP) and Gray Level Run Length Matrix

NOTE: Training data - invalid entry (2_states, also in test set), (Moulin_Rouge!.mp4, should be Moulin_Rouge! --> fixed)

In [2]:
df_base_train = pd.read_csv(train_entries_path)
df_base_train = df_base_train[df_base_train['file_name'] != '2_States'] # remove invalid entry
df_base_train.sort_values(by='file_name', inplace=True)
df_base_train.reset_index(inplace=True, drop=True)
df_targets_train = df_base_train['goodforairplanes'].astype(int)
df_base_train.head(5)

Unnamed: 0,movie_name,file_name,goodforairplanes
0,A Fish Called Wanda,A_Fish_Called_Wanda,1
1,A Goofy Movie,A_Goofy_Movie,0
2,A Million Ways to Die in the West,A_Million_Ways_to_Die_in_the_West,1
3,A Single Man,A_Single_Man,1
4,American Gangster,American_Gangster,1


## Feature extractors
As the dataset was built in a manner that would have been considered dirty already in 2002 a lot of feature extraction is done

In [15]:
def get_audio_features(file_name, use_train=True):
    """
        returns 1x14 dataframe, with averaged Mel-Frequency Cepstral Coefficients + file_name
    """
    base_path = train_path if use_train else test_path
    file_path = os.path.join(base_path, audio_folder, file_name + ".csv")
    if not os.path.isfile(file_path):
        print(file_name, " does not exist!")
        return pd.DataFrame(columns=[str(x) for x in range(13)] + ['file_name'])
    df_audio = pd.read_csv(file_path, header=None).T # transpose (columns are rows)
    df_audio = df_audio.fillna(0) # nan values are treated as 0
    df_audio = pd.DataFrame(df_audio.mean(axis=0)).T # average accross columns
    df_audio['file_name'] = file_name
    return df_audio

def get_all_audio_features(df, use_train=True):
    """
        returns nx14 dataframe, containing audio features for all movies
    """
    dfs = []
    for file_name in df['file_name']:
        dfs.append(get_audio_features(file_name, use_train))
    
    return pd.concat(dfs).reset_index(drop=True)

def get_all_text_features(df, use_train=True):
    """
        returns nx3284 dataframe, containing tf-idf features for all movies
        the dataset creators messed up - contains several terms multiple times
        ordered alphabetically (?) - Live_Nude_Girls and Transformers__Age_of_Extinction where switched (detected perchance)
    """
    base_path = train_path if use_train else test_path
    file_path = os.path.join(base_path, text_folder, "tdf_idf_dev.csv")
    df_txt = pd.read_csv(file_path)
    # the creators of the dataset missed how csv-files work - so we transpose and drop empty rows to get the correct format
    cols = df_txt.columns 
    df_txt = df_txt.T.dropna()
    df_txt.columns = cols
    df_txt.reset_index(inplace=True, drop=True)
    df_txt['file_name'] = sorted(df['file_name']) # we assume the info to be order alphabetically, as we do not have more info
    return df_txt

def get_vis_features(file_name, use_train=True):
    """
        returns 1x1653 dataframe, with unspecified visual features + file_name
        we assume that every single value in the csv is one feature
        this may be wrong, as there are two rows and no documentation (again)
    """
    base_path = train_path if use_train else test_path
    file_path = os.path.join(base_path, vis_folder, file_name + ".csv")
    if not os.path.isfile(file_path):
        print(file_name, " does not exist!")
        return pd.DataFrame(columns=[str(x) for x in range(1652)] + ['file_name'])
    df_vis = pd.read_csv(file_path, header=None)
    df_vis = pd.DataFrame(pd.concat([df_vis.loc[0,:], df_vis.loc[1,:]])).reset_index(drop=True).T # treat each value as single feature (-> no aggregation)
    df_vis['file_name'] = file_name
    return df_vis

def get_all_vis_features(df, use_train=True):
    """
        returns nx1653 dataframe, containing visual features for all movies
    """
    dfs = []
    for file_name in df['file_name']:
        dfs.append(get_vis_features(file_name, use_train))
    
    return pd.concat(dfs).reset_index(drop=True)

def get_meta_features(file_name, use_train=True):
    """
        returns 1x7 dataframe, with metadata features + file_name
        One Hot Encoding is not applied here, this should happen later
    """
    base_path = train_path if use_train else test_path
    file_path = os.path.join(base_path, metadata_folder, file_name + ".xml")
    if not os.path.isfile(file_path):
        print(file_name, " does not exist!")
        return pd.DataFrame(columns=['country', 'genre', 'language', 'rated', 'runtime', 'year', 'file_name'])
    etree = ET.parse(file_path)
    movie = etree.getroot().find('movie')
    mv = {}
    mv['language'] = [movie.get('language')]
    mv['year'] = [int(movie.get('year'))]
    mv['genre'] = [movie.get('genre')]
    mv['country'] = [movie.get('country')]
    mv['runtime'] = [int(movie.get('runtime')[:-4])]
    mv['rated'] = [movie.get('rated')]

    df_meta = pd.DataFrame.from_dict(mv)
    df_meta['file_name'] = file_name
    
    return df_meta

def get_all_meta_features(df, use_train=True):
    """
        returns nx7 dataframe, containing metadata features for all movies
    """
    dfs = []
    for file_name in df['file_name']:
        dfs.append(get_meta_features(file_name, use_train))
    
    df_meta = pd.concat(dfs)
    
    df_country = df_meta.country.str.replace(' ','').str.get_dummies(sep=',')
    df_country.columns = ['country_' + x for x in df_country.columns]

    df_genre = df_meta.genre.str.replace(' ','').str.get_dummies(sep=',')
    df_genre.columns = ['genre_' + x for x in df_genre.columns]

    df_language = df_meta.language.str.replace(' ','').str.get_dummies(sep=',')
    df_language.columns = ['language_' + x for x in df_language.columns]

    df_rated = df_meta.rated.str.get_dummies(sep=',')
    df_rated.columns = ['rated_' + x for x in df_rated.columns]
    
    return pd.concat([df_country, df_genre, df_language, df_rated, df_meta[['runtime', 'year', 'file_name']]], axis=1).reset_index(drop=True)

In [4]:
df_audio_train = get_all_audio_features(df_base_train)
df_audio_train.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,file_name
0,66.828814,-5.674521,1.670346,1.143263,-0.635255,1.269376,0.633811,0.012407,0.164257,-1.494813,-0.292583,0.086875,-0.914582,A_Fish_Called_Wanda
1,43.908715,-6.380989,1.36179,-1.483354,-0.67004,-1.626537,0.466197,-1.888169,0.848654,-0.990286,0.673634,-0.972964,-0.141539,A_Goofy_Movie
2,3.390978,-6.725758,0.579762,-0.271885,-0.17564,-0.84569,-0.699064,-0.578434,0.537249,-1.387373,0.747223,-0.88758,-0.205273,A_Million_Ways_to_Die_in_the_West
3,57.743484,-3.722123,2.780418,0.756402,0.043743,-0.960622,-0.435575,-0.176729,1.665236,-2.068548,1.211791,-0.358194,0.738827,A_Single_Man
4,65.354709,-5.609515,-1.303409,-0.831993,-0.518848,-0.019373,-0.500203,-0.897985,0.148561,-0.666728,0.033135,0.383797,-0.209412,American_Gangster


In [166]:
df_txt_train = get_all_text_features(df_base_train)
df_txt_train.head()

Unnamed: 0,24000,baby,baseball,big,doc,escort,frozen,heroes,high,huck,...,york,yorks,young,young.1,younger,youngja,zebra,zellweger,zoologists,file_name
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,A_Fish_Called_Wanda
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,A_Goofy_Movie
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,A_Million_Ways_to_Die_in_the_West
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,A_Single_Man
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.051657,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,American_Gangster


In [140]:
df_vis_train = get_all_vis_features(df_base_train)
df_vis_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1643,1644,1645,1646,1647,1648,1649,1650,1651,file_name
0,0.430310,0.381010,0.340820,0.316420,0.414650,0.385990,0.329380,0.312120,0.350670,0.342460,...,362.8300,8.592300,9.142700,8.410100,8.792400,1483.3,417.21,892.59,435.28,A_Fish_Called_Wanda
1,0.002031,0.000000,0.000000,0.073020,0.027533,0.005346,0.006015,0.118240,0.026991,0.005171,...,20278.0000,0.972010,1.365400,1.803200,1.463400,168740.0,20896.00,34434.00,20967.00,A_Goofy_Movie
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,119950.0000,0.000001,0.002466,0.000004,0.002466,729320.0,119950.00,230400.00,119950.00,A_Million_Ways_to_Die_in_the_West
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,9550.7000,5.943300,6.021500,4.383900,5.428800,32242.0,10313.00,22308.00,9850.00,A_Single_Man
4,0.231640,0.286290,0.300680,0.281180,0.244890,0.278490,0.290760,0.293800,0.161040,0.166850,...,109.6200,17.237000,15.543000,13.671000,15.231000,53559.0,8637.40,18597.00,8679.00,American_Gangster
5,0.237360,0.250190,0.255130,0.260250,0.193180,0.208400,0.295470,0.363410,0.241420,0.258170,...,112.0300,1.481500,2.082400,1.675600,2.140800,2639.4,1377.00,4321.30,1299.10,American_Pie
6,0.000000,0.294160,0.290070,0.011351,0.100930,0.355760,0.364370,0.236320,0.100780,0.347580,...,20076.0000,9.715800,10.036000,7.630700,11.156000,66445.0,27016.00,60427.00,20459.00,Andaz_Apna_Apna
7,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,119950.0000,0.000001,0.002466,0.000004,0.002466,729320.0,119950.00,230400.00,119950.00,Anna_Karenina
8,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,55609.0000,0.000004,0.003916,0.000008,0.003916,230400.0,55609.00,129600.00,55609.00,Babar__The_Movie
9,0.229340,0.136600,0.156360,0.361430,0.220950,0.237900,0.268820,0.360830,0.146440,0.232170,...,3027.9000,9.214400,10.289000,9.330100,9.628400,38404.0,10548.00,24415.00,10098.00,Bhoothnath_Returns


In [18]:
df_meta_train = get_all_meta_features(df_base_train)
df_meta_train.head(5)

Unnamed: 0,country_Argentina,country_Australia,country_Bahamas,country_Canada,country_China,country_CzechRepublic,country_Egypt,country_France,country_Germany,country_India,...,rated_G,rated_N/A,rated_NOT RATED,rated_PG,rated_PG-13,rated_R,rated_TV-MA,runtime,year,file_name
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,108,1988,A_Fish_Called_Wanda
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,78,1995,A_Goofy_Movie
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,116,2014,A_Million_Ways_to_Die_in_the_West
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,99,2009,A_Single_Man
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,157,2007,American_Gangster


## Implementation fun

In [167]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB # put NOTE into paper - not sure if correct bayes
from sklearn.model_selection import StratifiedKFold # put NOTE into paper - better as common kfold sampling
from sklearn.base import clone
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from functools import reduce

CI_95_FACTOR = 1.96

class BaseClassifier:
    
    def __init__(self, clf, clf_name, modality):
        self.clf = clf
        self.clf_name = clf_name
        self.modality = modality
        
    def fit(self, df_features, df_targets, verbose=True):
        self.clf = clone(self.clf) # reset any previously trained model
        np.random.seed(32143421)
        if verbose:
            print(f"Starting training for classifier {self.clf_name} and modality {self.modality}")
        self.clf.fit(df_features, df_targets)
        
    def predict(self, df_features, verbose=True):
        if verbose:
            print(f"Starting prediction for classifier {self.clf_name} and modality {self.modality}")
        return self.clf.predict(df_features)    
        
class ClassifierFactory:
    
    @staticmethod
    def get_metadata_classifiers():
        return [BaseClassifier(KNeighborsClassifier(), 'k-Nearest neighbors', 'Metadata'),
                BaseClassifier(NearestCentroid(), 'Nearest mean classifier', 'Metadata'),
                BaseClassifier(DecisionTreeClassifier(), 'Decision tree', 'Metadata'),
                BaseClassifier(LogisticRegression(), 'Logistic regression', 'Metadata'),
                BaseClassifier(SVC(gamma='auto'), 'SVM (Gaussian Kernel)', 'Metadata'),
                BaseClassifier(BaggingClassifier(),'Bagging', 'Metadata'),
                BaseClassifier(RandomForestClassifier(n_estimators=10),'Random Forest', 'Metadata'),
                BaseClassifier(AdaBoostClassifier(), 'AdaBoost', 'Metadata'),
                BaseClassifier(GradientBoostingClassifier(), 'Gradient Boosting Tree', 'Metadata')]
    
    @staticmethod
    def get_text_classifiers():
        return [BaseClassifier(GaussianNB(), 'Naive Bayes', 'Textual'),
                BaseClassifier(KNeighborsClassifier(), 'k-Nearest neighbors', 'Textual'),
                BaseClassifier(SVC(gamma='auto'), 'SVM (Gaussian Kernel)', 'Textual')]
    
    @staticmethod
    def get_visual_classifiers():
        return [BaseClassifier(KNeighborsClassifier(), 'k-Nearest neighbors', 'Visual'),
                BaseClassifier(DecisionTreeClassifier(), 'Decision tree', 'Visual'),
                BaseClassifier(LogisticRegression(), 'Logistic regression', 'Visual'),
                BaseClassifier(SVC(gamma='auto'), 'SVM (Gaussian Kernel)', 'Visual'),
                BaseClassifier(RandomForestClassifier(n_estimators=10), 'Random Forest', 'Visual'),
                BaseClassifier(AdaBoostClassifier(), 'AdaBoost', 'Visual'),
                BaseClassifier(GradientBoostingClassifier(), 'Gradient Boosting Tree', 'Visual')]
    
    @staticmethod
    def get_audio_classifiers():
        return [BaseClassifier(LogisticRegression(), 'Logistic regression', 'Audio'),
                BaseClassifier(GradientBoostingClassifier(), 'Gradient Boosting Tree', 'Audio')]                
                
class DataWrapper:
    
    def __init__(self, df_audio, df_vis, df_txt, df_meta, df_targets):
        self.df_audio = df_audio
        self.df_vis = df_vis
        self.df_txt = df_txt
        self.df_meta = df_meta
        self.df_targets = df_targets
        
    def generate_subspace(self):
        # TODO insert LVW functionality
        self.df_audio = self.df_audio.drop('file_name', axis=1)
        self.df_vis = self.df_vis.drop('file_name', axis=1)
        self.df_txt = self.df_txt.drop('file_name', axis=1)
        self.df_meta = self.df_meta.drop('file_name', axis=1)
        
        
class OutputWrapper:
    
    def __init__(self, df_res_test, df_score_test, df_score_stats, df_res_all):
        self.df_res_test = df_res_test
        self.df_score_test = df_score_test
        self.df_score_stats = df_score_stats
        self.df_res_all = df_res_all
        
class Evaluator:
    # TODO add clf stacking method
    
    def __init__(self, data_wrapper, use_audio=True, use_visual=True, use_text=True, use_meta=True):
        self.data_wrapper = data_wrapper
        self.use_audio = use_audio
        self.use_visual = use_visual
        self.use_text = use_text
        self.use_meta = use_meta            
            
    def cv_modality(self, df_features, df_targets, clfs, cv=10, verbose=True, predict_all = False):
        kf = StratifiedKFold(n_splits=cv, random_state=9832432)
        df_cvs = []
        df_all_cvs = []
        i=1
        for train_index, test_index in kf.split(df_features, df_targets):
            if verbose:
                print(f"Performing CV fold {i}..")
            i += 1
            X_train, X_test = df_features.iloc[train_index,:], df_features.iloc[test_index,:]
            y_train, y_test = df_targets[train_index], df_targets[test_index]
            
            df_res = pd.DataFrame(y_test)
            df_res.columns = ['TARGET']
            
            # init df for prediction of all entries
            df_res_all = pd.DataFrame(df_targets)
            df_res_all.columns = ['TARGET']
            
            for clf in clfs:
                clf.fit(X_train, y_train, verbose)
                y_pred = clf.predict(X_test, verbose)
                df_res[clf.clf_name+"_"+clf.modality] = y_pred
                
                if predict_all:
                    y_pred_all = clf.predict(df_features, verbose)
                    df_res_all[clf.clf_name+"_"+clf.modality] = y_pred_all
                
            df_cvs.append(df_res)
            df_all_cvs.append(df_res_all)
        return df_cvs, df_all_cvs
            
    def cv(self, cv=10, verbose=True, predict_all=False):
        """
            set predict_all to True to also include predictions for all data
        """
        df_cvs = []
        df_all_cvs = []
        df_targets = self.data_wrapper.df_targets
        if (self.use_audio):
            df_test, df_all = self.cv_modality(
                self.data_wrapper.df_audio,
                df_targets,
                ClassifierFactory.get_audio_classifiers(),
                cv,
                verbose,
                predict_all)
            df_cvs.append(df_test)
            df_all_cvs.append(df_all)
        if (self.use_visual):
            df_test, df_all = self.cv_modality(
                self.data_wrapper.df_vis,
                df_targets,
                ClassifierFactory.get_visual_classifiers(),
                cv,
                verbose,
                predict_all)
            df_cvs.append(df_test)
            df_all_cvs.append(df_all)
        if (self.use_text):
            df_test, df_all = self.cv_modality(
                self.data_wrapper.df_txt,
                df_targets,
                ClassifierFactory.get_text_classifiers(),
                cv,
                verbose,
                predict_all)
            df_cvs.append(df_test)
            df_all_cvs.append(df_all)
        if (self.use_meta):
            df_test, df_all = self.cv_modality(
                self.data_wrapper.df_meta,
                df_targets,
                ClassifierFactory.get_metadata_classifiers(),
                cv,
                verbose,
                predict_all)
            df_cvs.append(df_test)
            df_all_cvs.append(df_all)
            
        df_c = []
        for i in range(len(df_cvs[0])):
            df_i = pd.concat([df_cvs[x][i] for x in range(len(df_cvs))], axis=1)
            df_i = df_i.loc[:,~df_i.columns.duplicated()]
            df_c.append(df_i)
            
        df_all_c = []
        if predict_all:
            for i in range(len(df_all_cvs[0])):
                df_i = pd.concat([df_all_cvs[x][i] for x in range(len(df_all_cvs))], axis=1)
                df_i = df_i.loc[:,~df_i.columns.duplicated()]
                df_all_c.append(df_i)
            
        df_res = self.evaluate(df_c)
        
        return OutputWrapper(df_c, df_res, self.overall_eval(df_res), df_all_c)

    def evaluate(self, df_res):
        """
        returns precision, recall and F1 in a DF
        returns list of DFs if df_res is list of DFs
        """
        if type(df_res) == type([]):
            return [self.evaluate_single(df_x) for df_x in df_res]
        else:
            return self.evaluate_single(df_res)

    def evaluate_single(self, df_res):
        cols = list(df_res.columns)
        cols.remove('TARGET')
        df_ev = pd.DataFrame(columns=['clf', 'precision', 'recall', 'f1'])
        for col in cols:
            prec = precision_score(df_res['TARGET'], df_res[col])
            recall = recall_score(df_res['TARGET'], df_res[col])
            f1 = f1_score(df_res['TARGET'], df_res[col])
            df_ev = df_ev.append({'clf':col, 'precision': prec, 'recall':recall, 'f1':f1}, ignore_index=True)
        return df_ev
    
    def overall_eval(self, df_results):
        df_score = pd.DataFrame(columns =['clf', 'mean_precision', 'mean_recall', 'mean_f1', 'var_precision', 'var_recall', 'var_f1', 'std_precision', 'std_recall', 'std_f1'])
        df_score['clf'] = df_results[0]['clf']
        # mean 
        df_res = reduce(lambda x, y: x.add(y, fill_value=0), df_results)
        df_score[['mean_precision', 'mean_recall', 'mean_f1']] = df_res[['precision', 'recall', 'f1']].div(len(df_results))

        # var
        df_mean = df_score[['mean_precision', 'mean_recall', 'mean_f1']]
        df_mean.columns = ['precision', 'recall', 'f1']
        df_dev = []
        for df in df_results:
            df_dev.append(np.square(df[['precision', 'recall', 'f1']].subtract(df_mean)))
        df_dev = reduce(lambda x, y: x.add(y, fill_value=0), df_dev)
        df_score[['var_precision', 'var_recall', 'var_f1']] = df_dev[['precision', 'recall', 'f1']].div(len(df_results))

        # std 
        df_score[['std_precision', 'std_recall', 'std_f1']] = np.power(df_score[['var_precision', 'var_recall', 'var_f1']], 0.5)
        
        # 95-ci
        for metric in ['precision', 'recall', 'f1']:
            l_ci = df_score['mean_'+metric] - df_score['std_'+metric]*CI_95_FACTOR/len(df_results)
            u_ci = df_score['mean_'+metric] + df_score['std_'+metric]*CI_95_FACTOR/len(df_results)
            df_score['l_95ci_'+metric] = l_ci
            df_score['u_95ci_'+metric] = u_ci

        return df_score

## Call Example

In [175]:
dw = DataWrapper(df_audio_train, df_vis_train, df_txt_train, df_meta_train, df_targets_train)
dw.generate_subspace()
ev = Evaluator(dw) # TODO preprocessing for meta needs to be done
ow = ev.cv(verbose=False, predict_all=True)



In [21]:
ow.df_res_test[0]

Unnamed: 0,TARGET,Logistic regression_Audio,Gradient Boosting Tree_Audio,k-Nearest neighbors_Visual,Decision tree_Visual,Logistic regression_Visual,SVM (Gaussian Kernel)_Visual,Random Forest_Visual,AdaBoost_Visual,Gradient Boosting Tree_Visual,...,SVM (Gaussian Kernel)_Textual,k-Nearest neighbors_Metadata,Nearest mean classifier_Metadata,Decision tree_Metadata,Logistic regression_Metadata,SVM (Gaussian Kernel)_Metadata,Bagging_Metadata,Random Forest_Metadata,AdaBoost_Metadata,Gradient Boosting Tree_Metadata
0,1,0,0,1,0,1,1,1,1,1,...,1,1,1,0,0,1,0,0,0,0
1,0,1,0,1,0,0,1,1,0,0,...,1,0,1,0,1,1,0,0,0,1
2,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
3,1,1,0,0,0,0,1,1,1,1,...,1,0,1,0,1,0,0,0,0,0
4,1,1,0,0,0,1,1,0,1,1,...,1,0,0,0,0,0,0,0,0,0
5,1,1,0,1,1,1,1,1,0,1,...,1,0,1,0,1,0,0,0,0,1
6,1,1,1,1,1,0,1,1,1,1,...,1,0,0,0,0,0,0,0,0,0
8,0,1,0,1,0,1,1,0,0,0,...,1,0,1,1,1,1,0,0,0,0
9,0,1,1,1,1,1,1,1,1,1,...,1,0,0,1,0,0,0,0,0,0
13,0,0,0,1,0,0,1,0,1,1,...,1,0,0,1,0,0,0,1,0,0


In [11]:
ow.df_score_test[0]        

Unnamed: 0,clf,precision,recall,f1
0,Logistic regression_Audio,0.555556,0.833333,0.666667
1,Gradient Boosting Tree_Audio,0.5,0.333333,0.4
2,k-Nearest neighbors_Visual,0.444444,0.666667,0.533333
3,Decision tree_Visual,0.6,0.5,0.545455
4,Logistic regression_Visual,0.571429,0.666667,0.615385
5,SVM (Gaussian Kernel)_Visual,0.545455,1.0,0.705882
6,Random Forest_Visual,0.714286,0.833333,0.769231
7,AdaBoost_Visual,0.625,0.833333,0.714286
8,Gradient Boosting Tree_Visual,0.666667,1.0,0.8
9,Naive Bayes_Textual,0.333333,0.333333,0.333333


In [12]:
ow.df_score_stats

Unnamed: 0,clf,mean_precision,mean_recall,mean_f1,var_precision,var_recall,var_f1,std_precision,std_recall,std_f1,l_95ci_precision,u_95ci_precision,l_95ci_recall,u_95ci_recall,l_95ci_f1,u_95ci_f1
0,Logistic regression_Audio,0.566508,0.57,0.547244,0.047184,0.048989,0.03009,0.217218,0.221334,0.173464,0.523933,0.609083,0.526618,0.613382,0.513245,0.581243
1,Gradient Boosting Tree_Audio,0.486429,0.416667,0.438236,0.050547,0.054944,0.045175,0.224826,0.234402,0.212543,0.442363,0.530494,0.370724,0.46261,0.396578,0.479895
2,k-Nearest neighbors_Visual,0.568492,0.633333,0.588811,0.015035,0.035778,0.017594,0.122618,0.18915,0.132641,0.544459,0.592525,0.59626,0.670407,0.562814,0.614809
3,Decision tree_Visual,0.550476,0.623333,0.581818,0.023185,0.043567,0.0302,0.152265,0.208726,0.173782,0.520632,0.58032,0.582423,0.664244,0.547757,0.615879
4,Logistic regression_Visual,0.613571,0.636667,0.608761,0.033807,0.0481,0.027895,0.183868,0.219317,0.167017,0.577533,0.649609,0.593681,0.679653,0.576025,0.641496
5,SVM (Gaussian Kernel)_Visual,0.543434,0.983333,0.699755,0.000481,0.0025,0.00082,0.021922,0.05,0.028633,0.539138,0.547731,0.973533,0.993133,0.694143,0.705367
6,Random Forest_Visual,0.619762,0.653333,0.629573,0.006054,0.0196,0.008414,0.077811,0.14,0.09173,0.604511,0.635013,0.625893,0.680773,0.611594,0.647552
7,AdaBoost_Visual,0.584643,0.61,0.593169,0.018714,0.025789,0.019354,0.136799,0.160589,0.139118,0.55783,0.611455,0.578525,0.641475,0.565902,0.620436
8,Gradient Boosting Tree_Visual,0.594524,0.726667,0.64641,0.009215,0.0324,0.012671,0.095997,0.18,0.112566,0.575708,0.613339,0.691387,0.761947,0.624347,0.668473
9,Naive Bayes_Textual,0.535,0.62,0.550276,0.036233,0.055156,0.022024,0.190351,0.234852,0.148407,0.497691,0.572309,0.573969,0.666031,0.521189,0.579364


In [13]:
ow.df_res_all[0]

Unnamed: 0,TARGET,Logistic regression_Audio,Gradient Boosting Tree_Audio,k-Nearest neighbors_Visual,Decision tree_Visual,Logistic regression_Visual,SVM (Gaussian Kernel)_Visual,Random Forest_Visual,AdaBoost_Visual,Gradient Boosting Tree_Visual,...,SVM (Gaussian Kernel)_Textual,k-Nearest neighbors_Metadata,Nearest mean classifier_Metadata,Decision tree_Metadata,Logistic regression_Metadata,SVM (Gaussian Kernel)_Metadata,Bagging_Metadata,Random Forest_Metadata,AdaBoost_Metadata,Gradient Boosting Tree_Metadata
0,1,0,0,1,0,1,1,1,1,1,...,1,1,1,0,0,1,0,0,0,0
1,0,1,0,1,0,0,1,1,0,0,...,1,0,1,0,1,1,0,0,0,1
2,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
3,1,1,0,0,0,0,1,1,1,1,...,1,0,1,0,1,0,0,0,0,0
4,1,1,0,0,0,1,1,0,1,1,...,1,0,0,0,0,0,0,0,0,0
5,1,1,0,1,1,1,1,1,0,1,...,1,0,1,0,1,0,0,0,0,1
6,1,1,1,1,1,0,1,1,1,1,...,1,0,0,0,0,0,0,0,0,0
7,1,0,1,1,1,1,1,1,1,1,...,1,0,0,1,1,1,1,1,1,1
8,0,1,0,1,0,1,1,0,0,0,...,1,0,1,1,1,1,0,0,0,0
9,0,1,1,1,1,1,1,1,1,1,...,1,0,0,1,0,0,0,0,0,0


In [192]:
def majorityVoting2(cvs_results):
    results=[]
    for fold in range(0,len(cvs_results[0])):
        df_vars=cvs_results[0][fold]
        df_labels=cvs_results[0][fold]['TARGET']
        df_labels=df_labels.reset_index(drop=True)
        for x in range(1,len(cvs_results)):
            tmp=cvs_results[x][fold]
            tmp=tmp.drop(['TARGET'],axis = 1)
            df_vars=pd.concat([df_vars,tmp],axis = 1)
            
        res=df_vars.mode(axis=1)[0]
        res=res.reset_index(drop=True)
        res=pd.to_numeric(res)
        df_labels=pd.to_numeric(df_labels)   
        
        #results.append(res)
        precision = precision_score(df_labels, res)
        recall = recall_score(df_labels,res)
        f1 = f1_score(df_labels,res)
        results.append([precision,recall,f1])
        
        #print(precision,recall,f1)
        
    return results
   
def majorityVoting(cvs_results,droppedClfs):
    results=[]
    for fold in range(0,len(cvs_results)):
        df_vars=cvs_results[fold]
        df_vars=df_vars.drop(['TARGET'],axis=1)
        
        for clf in droppedClfs:
            df_vars= df_vars.drop([clf],axis = 1)
        
        df_labels=cvs_results[fold]['TARGET']
        
        res=df_vars.mode(axis=1)[0]
        res=pd.to_numeric(res)
        df_labels=pd.to_numeric(df_labels)   
        
        precision = precision_score(df_labels, res)
        recall = recall_score(df_labels,res)
        f1 = f1_score(df_labels,res)
        results.append([precision,recall,f1])
        
    return results
   
    
    
def labelStacking(df_all_folds,df_test_folds,droppedClfs):
    results=[]
    for fold in range(0,len(df_all_folds)):
        df_all = df_all_folds[fold]
        df_test = df_test_folds[fold]
        df_new=df_all.merge(df_test, how='left', indicator=True)
        df_train = df_new[df_new['_merge'] == 'left_only']
        #print(df_train)
        df_train = df_train.drop(['_merge'],axis = 1)
        #clf=SVC(gamma='auto')
        #clf=LogisticRegression()
        #clf=DecisionTreeClassifier()
        #clf=NearestCentroid()
        #clf=RandomForestClassifier(n_estimators=100)
        clf=KNeighborsClassifier()
        #clf=AdaBoostClassifier()
        #clf=GradientBoostingClassifier()
        X_train = df_train.loc[:, df_train.columns != 'TARGET']
        y_train = df_train['TARGET']
        
        X_test = df_test.loc[:, df_test.columns != 'TARGET']
        y_test = df_test['TARGET']
        
        for c in droppedClfs:
            X_train= X_train.drop([c],axis = 1)
            X_test= X_test.drop([c],axis = 1)
            
        model = clf.fit(X_train, y_train)
        precision = precision_score(y_test,model.predict(X_test))
        recall = recall_score(y_test,model.predict(X_test))
        f1 = f1_score(y_test,model.predict(X_test))
        results.append([precision,recall,f1])
    
    return results

def labelFeatureStacking(df_all_folds,df_test_folds,droppedClfs):
    results=[]
    for fold in range(0,len(df_all_folds)):
        df_all = df_all_folds[fold]
        df_test = df_test_folds[fold]
        df_new=df_all.merge(df_test, how='left', indicator=True)
        df_train = df_new[df_new['_merge'] == 'left_only']
        df_train = df_train.drop(['_merge'],axis = 1)
        
        df_combined_train=pd.merge(df_train, df_vis_train, left_index=True, right_index=True)
        df_combined_train = df_combined_train.drop(['file_name'],axis = 1)
        df_combined_test=pd.merge(df_test, df_vis_train, left_index=True, right_index=True)
        df_combined_test = df_combined_test.drop(['file_name'],axis = 1)
        
        X_train = df_combined_train.loc[:, df_combined_train.columns != 'TARGET']
        y_train = df_combined_train['TARGET']
        
        X_test = df_combined_test.loc[:, df_combined_test.columns != 'TARGET']
        y_test = df_combined_test['TARGET']
        for c in droppedClfs:
            X_train= X_train.drop([c],axis = 1)
            X_test= X_test.drop([c],axis = 1)
        
                                  
        clf=SVC(gamma='auto')
        #clf=LogisticRegression()
        #clf=DecisionTreeClassifier()
        #clf=NearestCentroid()
        #clf=RandomForestClassifier(n_estimators=100)
        #clf=KNeighborsClassifier()
        #clf=AdaBoostClassifier()
        #clf=GradientBoostingClassifier()
                                  
        model = clf.fit(X_train, y_train)
        precision = precision_score(y_test,model.predict(X_test))
        recall = recall_score(y_test,model.predict(X_test))
        f1 = f1_score(y_test,model.predict(X_test))
        results.append([precision,recall,f1])
    

    return results
    
def calculateAverage(resMat):
    n=len(resMat)
    prec=0
    rec=0
    f1=0
    for i in range(0,n):
        prec=prec+resMat[i][0]
        rec=rec+resMat[i][1]
        f1=f1+resMat[i][2]
    prec=prec/n
    rec=rec/n
    f1=f1/n
    return [prec,rec,f1]


filtered_p = ow.df_score_stats['clf'][ow.df_score_stats['mean_precision']<0.5]
filtered_r = ow.df_score_stats['clf'][ow.df_score_stats['mean_recall']<0.5]
filtered_f = ow.df_score_stats['clf'][ow.df_score_stats['mean_f1']<0.5]
df_filtered =pd.concat([filtered_p,filtered_r],axis = 0)
df_filtered =pd.concat([df_filtered,filtered_f],axis=0)
droppedClfs=df_filtered.drop_duplicates()

res1=majorityVoting(ow.df_res_test,droppedClfs)
averages1=calculateAverage(res1)
print("-"*30)
print("Majority Voting:")
print("CV Scores: ",res1)
print("AVG: ",averages1)

res2=labelStacking(ow.df_res_all,ow.df_res_test,droppedClfs)
averages2=calculateAverage(res2)

print("-"*30)
print("Label Stacking:")
print("CV Scores: ",res2)
print("AVG: ",averages2)

res3=labelFeatureStacking(ow.df_res_all,ow.df_res_test,droppedClfs)
averages3=calculateAverage(res3)
                                  
print("-"*30)
print("Label-Feature Stacking:")
print("CV Scores: ",res3)
print("AVG: ",averages3)

ow.df_res_test[5]

------------------------------
Majority Voting:
CV Scores:  [[0.5555555555555556, 0.8333333333333334, 0.6666666666666667], [0.5555555555555556, 0.8333333333333334, 0.6666666666666667], [0.625, 1.0, 0.7692307692307693], [0.5, 0.8, 0.6153846153846154], [0.5714285714285714, 0.8, 0.6666666666666666], [0.5714285714285714, 0.8, 0.6666666666666666], [0.5555555555555556, 1.0, 0.7142857142857143], [0.625, 1.0, 0.7692307692307693], [0.42857142857142855, 0.6, 0.5], [0.5714285714285714, 0.8, 0.6666666666666666]]
AVG:  [0.555952380952381, 0.8466666666666667, 0.6701465201465202]
------------------------------
Label Stacking:
CV Scores:  [[0.5714285714285714, 0.6666666666666666, 0.6153846153846153], [0.5, 0.6666666666666666, 0.5714285714285715], [0.5714285714285714, 0.8, 0.6666666666666666], [0.625, 1.0, 0.7692307692307693], [0.6666666666666666, 0.8, 0.7272727272727272], [0.5, 0.6, 0.5454545454545454], [0.7142857142857143, 1.0, 0.8333333333333333], [0.625, 1.0, 0.7692307692307693], [0.6, 0.6, 0.6], [

Unnamed: 0,TARGET,Logistic regression_Audio,Gradient Boosting Tree_Audio,k-Nearest neighbors_Visual,Decision tree_Visual,Logistic regression_Visual,SVM (Gaussian Kernel)_Visual,Random Forest_Visual,AdaBoost_Visual,Gradient Boosting Tree_Visual,...,SVM (Gaussian Kernel)_Textual,k-Nearest neighbors_Metadata,Nearest mean classifier_Metadata,Decision tree_Metadata,Logistic regression_Metadata,SVM (Gaussian Kernel)_Metadata,Bagging_Metadata,Random Forest_Metadata,AdaBoost_Metadata,Gradient Boosting Tree_Metadata
49,1,0,0,0,1,1,1,1,1,1,...,1,0,0,0,0,0,0,0,0,0
50,1,1,0,1,0,0,1,0,0,0,...,1,0,1,0,1,0,1,1,1,1
52,1,1,1,1,0,1,1,1,1,1,...,1,0,1,0,1,1,1,1,0,1
53,0,1,0,0,1,0,1,0,0,1,...,1,1,1,0,1,1,1,1,1,1
54,1,1,0,1,1,1,1,1,0,1,...,1,1,0,0,0,1,0,0,0,0
55,0,1,0,1,0,0,1,1,1,1,...,1,1,1,1,0,1,0,0,0,0
56,0,1,0,1,1,0,1,1,1,0,...,1,0,1,0,1,1,1,1,1,1
57,0,0,1,0,1,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
59,1,1,1,1,0,0,1,0,1,0,...,1,0,0,0,0,0,0,0,0,0
