## Imports and Setup

In [1]:
%load_ext autoreload
%autoreload 2
import os
import pandas as pd
import sklearn
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
%aimport mfc_video_utils
from mfc_video_utils import MfcVideoProcessor, BasicTransformer, compute_roc, save_object, load_object, grid_search_forest, grid_search_svm

# Raise pandas' display limits so wide / long DataFrames are not truncated in cell output.
pd.set_option("display.max_columns", 500)
pd.set_option("display.max_rows", 2000)
sns.set(rc={"figure.figsize": (12, 8)})
%matplotlib inline
%config InlineBackend.figure_format='retina'

## Load data

In [2]:
# `at_polimi` selects which of the two hard-coded dataset roots is used.
at_polimi = False
datasets_location = (
    Path("/nas/public/dataset/medifor/") if at_polimi else Path("/data1/dgueraco/datasets/")
)

In [3]:
# Load the six video datasets, building them only when no cached copy exists.
# `datasets` is bound on BOTH paths (cache hit and cache miss) and the individual
# names are unpacked from it afterwards, so later cells see the same variables
# regardless of whether "datasets.pkl" was already present.
# NOTE(review): `load_object` presumably unpickles the file — only load a cache
# you created yourself; confirm against mfc_video_utils.
if os.path.isfile("datasets.pkl"):
    datasets = load_object("datasets.pkl")
else:
    datasets = [
        MfcVideoProcessor("nc2017_dev", datasets_location / "NC2017_Dev_Ver1", ref_avail=True),  # nc2017 - dev (training)
        MfcVideoProcessor("mfc18_dev1", datasets_location / "MFC18_Dev1_Video_Ver2", ref_avail=True),  # mfc18 - dev1 (training)
        MfcVideoProcessor("mfc18_dev2", datasets_location / "MFC18_Dev2_Video_Ver3", ref_avail=True),  # mfc18 - dev2 (training)
        MfcVideoProcessor("mfc18_eval", datasets_location / "MFC18_EvalPart1_Video_Ver1", ref_avail=True, ref_folder=True),  # mfc18 - eval (training?)
        MfcVideoProcessor("mfc18_gan", datasets_location / "MFC18_Eval_GAN_Video_Ver3", ref_avail=True, ref_folder=True),  # mfc18 - gan (training?)
        MfcVideoProcessor("mfc19_val", datasets_location / "MFC19_Video_Validation_Ver1", ref_avail=True),  # mfc19 - val (test)
    ]
    save_object(datasets, "datasets.pkl")

# Order must match the order in which the list above is built / was saved.
nc2017_dev, mfc18_dev1, mfc18_dev2, mfc18_eval, mfc18_gan, mfc19_val = datasets

## SkLearn Magic

### References:
1. https://ramhiser.com/post/2018-04-16-building-scikit-learn-pipeline-with-pandas-dataframe/
2. https://medium.com/dunder-data/from-pandas-to-scikit-learn-a-new-exciting-workflow-e88e2271ef62
3. https://towardsdatascience.com/fine-tuning-a-classifier-in-scikit-learn-66e048c21e65
4. http://blog.kaggle.com/2016/07/21/approaching-almost-any-machine-learning-problem-abhishek-thakur/
5. https://ramhiser.com/post/2018-04-16-building-scikit-learn-pipeline-with-pandas-dataframe/

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV, train_test_split
from sklearn.metrics import roc_curve, precision_recall_curve, auc, make_scorer, roc_auc_score, recall_score, accuracy_score, precision_score, f1_score, confusion_matrix
import random
import warnings
# Silence every warning for the rest of the session (this also hides deprecation
# and convergence warnings, so re-enable them when debugging).
warnings.filterwarnings('ignore')

# One seed shared by Python's `random`, NumPy's global RNG and, further down,
# every estimator's `random_state`.
random_seed = 123
random.seed(random_seed)
np.random.seed(random_seed)

In [5]:
def test_datasets(pipe):
    l = []
    datasets = [nc2017_dev, mfc18_dev1, mfc18_dev2, mfc18_gan, mfc18_eval, mfc19_val]
    for dataset in datasets:
        l.append(compute_roc(dataset.ffprobe_df, dataset.labels, pipe, title=dataset.name, plot=False))
        print(dataset.name+':',l[-1])
    print('Average ROC AUC:', np.mean(l))

#### Experiments 08/03/19

#### Fair setup
1. Train: NC2017_DEV + MFC18_DEV1 + MFC18_DEV2 => Splits and select best
2. Test: MFC18_EVAL + MFC18_GAN + MFC19_VAL

##### Conclusions:
MFC18_GAN videos are not properly detected, probably indicating that they come from a different source.

In [6]:
# Fair setup: fit on the three dev sets only; eval, GAN and validation stay held out.
# Labels are concatenated in the same dataset order as the frames so rows and labels line up.
train_vids = pd.concat(
    [ds.ffprobe_df for ds in (nc2017_dev, mfc18_dev1, mfc18_dev2)],
    sort=False,
)
train_labels = nc2017_dev.labels + mfc18_dev1.labels + mfc18_dev2.labels

test_vids = pd.concat(
    [ds.ffprobe_df for ds in (mfc18_eval, mfc18_gan, mfc19_val)],
    sort=False,
)
test_labels = mfc18_eval.labels + mfc18_gan.labels + mfc19_val.labels

In [7]:
# Stand-alone estimators that become the two members of the soft-voting ensemble.
forest = RandomForestClassifier(
    n_estimators=120,
    max_depth=15,
    max_features='sqrt',
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=random_seed,
    n_jobs=-1,
)
# probability=True makes the SVC expose predict_proba, which soft voting relies on.
svm = SVC(
    C=150,
    gamma='scale',
    class_weight='balanced',
    probability=True,
    random_state=random_seed,
)

In [8]:
# Preprocessing step used as the 'transformer' stage of the three pipelines built in the next cells.
# NOTE(review): the class is looked up through the `mfc_video_utils` module rather than the
# `BasicTransformer` name imported at the top — presumably to stay in sync with autoreload;
# confirm before changing.
bt = mfc_video_utils.BasicTransformer(cat_threshold=1, num_strategy='mean', return_df=False)

In [9]:
# Soft-voting ensemble of the random forest and the SVM on top of the shared transformer.
# NOTE(review): the same `bt` instance is also used by ml_pipe_forest and ml_pipe_svm, and
# scikit-learn pipelines fit their steps in place, so every `.fit` call refits this one object.
ml_pipe_voting = Pipeline(steps=[
    ('transformer', bt),
    ('ensembler', VotingClassifier(
        estimators=[('fr', forest), ('s', svm)],
        voting='soft',
        n_jobs=-1,
    )),
])

In [10]:
# Random-forest-only pipeline; hyper-parameters mirror the `forest` estimator defined above.
ml_pipe_forest = Pipeline(steps=[
    ('transformer', bt),
    ('forest', RandomForestClassifier(
        n_estimators=120,
        max_depth=15,
        max_features='sqrt',
        min_samples_split=2,
        min_samples_leaf=1,
        random_state=random_seed,
        n_jobs=-1,
    )),
])

In [11]:
# SVM-only pipeline; hyper-parameters mirror the `svm` estimator defined above.
ml_pipe_svm = Pipeline(steps=[
    ('transformer', bt),
    ('SVM', SVC(
        C=150,
        gamma='scale',
        class_weight='balanced',
        probability=True,
        random_state=random_seed,
    )),
])

In [12]:
ml_pipe_voting.fit(train_vids, train_labels)

Pipeline(memory=None,
     steps=[('transformer', BasicTransformer(cat_threshold=1, num_strategy='mean', return_df=False)), ('ensembler', VotingClassifier(estimators=[('fr', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=15, max_features='sqrt', max_leaf_nodes=None,
         ...=0.001, verbose=False))],
         flatten_transform=None, n_jobs=-1, voting='soft', weights=None))])

In [13]:
ml_pipe_forest.fit(train_vids, train_labels)

Pipeline(memory=None,
     steps=[('transformer', BasicTransformer(cat_threshold=1, num_strategy='mean', return_df=False)), ('forest', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=15, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impur...mators=120, n_jobs=-1,
            oob_score=False, random_state=123, verbose=0, warm_start=False))])

In [14]:
ml_pipe_svm.fit(train_vids, train_labels)

Pipeline(memory=None,
     steps=[('transformer', BasicTransformer(cat_threshold=1, num_strategy='mean', return_df=False)), ('SVM', SVC(C=150, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
  max_iter=-1, probability=True, random_state=123, shrinking=True,
  tol=0.001, verbose=False))])

In [15]:
test_datasets(ml_pipe_forest)

nc2017_dev: 1.0
mfc18_dev1: 1.0
mfc18_dev2: 1.0
mfc18_gan: 0.7852941176470588
mfc18_eval: 0.9947524739577679
mfc19_val: 0.9967948717948718
Average ROC AUC: 0.9628069105666164


In [16]:
test_datasets(ml_pipe_svm)

nc2017_dev: 1.0
mfc18_dev1: 1.0
mfc18_dev2: 1.0
mfc18_gan: 0.8135294117647058
mfc18_eval: 0.9978853577306024
mfc19_val: 0.985576923076923
Average ROC AUC: 0.966165282095372


In [17]:
test_datasets(ml_pipe_voting)

nc2017_dev: 1.0
mfc18_dev1: 1.0
mfc18_dev2: 1.0
mfc18_gan: 0.8385294117647059
mfc18_eval: 0.9974120599742075
mfc19_val: 0.9919871794871795
Average ROC AUC: 0.9713214418710154


In [18]:
save_object(ml_pipe_forest, 'best_forest_pipe_base.pkl')
save_object(ml_pipe_svm, 'best_svm_pipe_base.pkl')
save_object(ml_pipe_voting, 'best_voting_pipe_base.pkl')

In [None]:
# Grid search over the SVM pipeline with 10-fold stratified CV.
# The splitter is seeded explicitly so fold assignment does not depend on how much
# of NumPy's global RNG state earlier cells have consumed.
gs_svm = grid_search_svm(
    train_vids, train_labels, test_vids, test_labels, ml_pipe_svm,
    StratifiedKFold(n_splits=10, shuffle=True, random_state=random_seed),
)

In [None]:
# Grid search over the random-forest pipeline with 10-fold stratified CV.
# The splitter is seeded explicitly so fold assignment does not depend on how much
# of NumPy's global RNG state earlier cells have consumed.
gs_forest = grid_search_forest(
    train_vids, train_labels, test_vids, test_labels, ml_pipe_forest,
    StratifiedKFold(n_splits=10, shuffle=True, random_state=random_seed),
)

#### Fair + GAN setup
1. Train: NC2017_DEV + MFC18_DEV1 + MFC18_DEV2 + MFC18_GAN => Splits and select best
2. Test: MFC18_EVAL + MFC19_VAL

##### Conclusions:
Performance on MFC18_GAN is, as expected, better (it is now part of the training set), but performance on MFC18_EVAL and MFC19_VAL mostly drops, most visibly for the SVM.

In [19]:
# Fair + GAN setup: the GAN set joins the three dev sets for training; eval and validation stay held out.
# Labels are concatenated in the same dataset order as the frames so rows and labels line up.
train_vids = pd.concat(
    [ds.ffprobe_df for ds in (nc2017_dev, mfc18_dev1, mfc18_dev2, mfc18_gan)],
    sort=False,
)
train_labels = nc2017_dev.labels + mfc18_dev1.labels + mfc18_dev2.labels + mfc18_gan.labels

test_vids = pd.concat(
    [ds.ffprobe_df for ds in (mfc18_eval, mfc19_val)],
    sort=False,
)
test_labels = mfc18_eval.labels + mfc19_val.labels

In [20]:
forest = RandomForestClassifier(max_depth=15, max_features='sqrt', min_samples_leaf=1,
                                 min_samples_split=2, n_estimators=120, random_state=random_seed,n_jobs=-1)
svm = SVC(C=150, class_weight='balanced', gamma='scale', random_state=random_seed, probability=True)

In [21]:
# Preprocessing step used as the 'transformer' stage of this setup's three pipelines.
# NOTE(review): this setup uses cat_threshold=10 / num_strategy='median', while the other two
# setups in this notebook use cat_threshold=1 / num_strategy='mean' — confirm the difference is
# intentional, since it changes more than just the training data between setups.
bt = mfc_video_utils.BasicTransformer(cat_threshold=10, num_strategy='median', return_df=False)

In [22]:
ml_pipe_voting = Pipeline([('transformer', bt), 
                           ('ensembler', VotingClassifier(estimators=[('fr', forest), ('s', svm)], voting='soft', n_jobs=-1))])

In [23]:
ml_pipe_forest = Pipeline([('transformer', bt), 
                           ('forest', RandomForestClassifier(max_depth=15, max_features='sqrt', min_samples_leaf=1,
                                                             min_samples_split=2, n_estimators=120, random_state=random_seed, n_jobs=-1))])

In [24]:
ml_pipe_svm = Pipeline([('transformer', bt), 
                        ('SVM', SVC(C=150, class_weight='balanced', gamma='scale', random_state=random_seed, probability=True))])

In [25]:
ml_pipe_voting.fit(train_vids, train_labels)

Pipeline(memory=None,
     steps=[('transformer', BasicTransformer(cat_threshold=10, num_strategy='median', return_df=False)), ('ensembler', VotingClassifier(estimators=[('fr', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=15, max_features='sqrt', max_leaf_nodes=None,
      ...=0.001, verbose=False))],
         flatten_transform=None, n_jobs=-1, voting='soft', weights=None))])

In [26]:
ml_pipe_forest.fit(train_vids, train_labels)

Pipeline(memory=None,
     steps=[('transformer', BasicTransformer(cat_threshold=10, num_strategy='median', return_df=False)), ('forest', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=15, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_im...mators=120, n_jobs=-1,
            oob_score=False, random_state=123, verbose=0, warm_start=False))])

In [27]:
ml_pipe_svm.fit(train_vids, train_labels)

Pipeline(memory=None,
     steps=[('transformer', BasicTransformer(cat_threshold=10, num_strategy='median', return_df=False)), ('SVM', SVC(C=150, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
  max_iter=-1, probability=True, random_state=123, shrinking=True,
  tol=0.001, verbose=False))])

In [28]:
test_datasets(ml_pipe_forest)

nc2017_dev: 1.0
mfc18_dev1: 1.0
mfc18_dev2: 1.0
mfc18_gan: 1.0
mfc18_eval: 0.9888449363653339
mfc19_val: 0.9967948717948718
Average ROC AUC: 0.9976066346933677


In [29]:
test_datasets(ml_pipe_svm)

nc2017_dev: 1.0
mfc18_dev1: 1.0
mfc18_dev2: 1.0
mfc18_gan: 0.9832352941176471
mfc18_eval: 0.9556229076114096
mfc19_val: 0.8766025641025641
Average ROC AUC: 0.9692434609719367


In [30]:
test_datasets(ml_pipe_voting)

nc2017_dev: 1.0
mfc18_dev1: 1.0
mfc18_dev2: 1.0
mfc18_gan: 1.0
mfc18_eval: 0.9947459606858909
mfc19_val: 0.9967948717948718
Average ROC AUC: 0.9985901387467937


In [31]:
save_object(ml_pipe_forest, 'best_forest_pipe_ganbase.pkl')
save_object(ml_pipe_svm, 'best_svm_pipe_ganbase.pkl')
save_object(ml_pipe_voting, 'best_voting_pipe_ganbase.pkl')

In [None]:
# Grid search over the random-forest pipeline with 10-fold stratified CV.
# The splitter is seeded explicitly so fold assignment does not depend on how much
# of NumPy's global RNG state earlier cells have consumed.
gs_forest = grid_search_forest(
    train_vids, train_labels, test_vids, test_labels, ml_pipe_forest,
    StratifiedKFold(n_splits=10, shuffle=True, random_state=random_seed),
)

In [None]:
# Grid search over the SVM pipeline with 10-fold stratified CV, refit on 'roc_auc_score'.
# The splitter is seeded explicitly so fold assignment does not depend on how much
# of NumPy's global RNG state earlier cells have consumed.
gs_svm_roc = grid_search_svm(
    train_vids, train_labels, test_vids, test_labels, ml_pipe_svm,
    StratifiedKFold(n_splits=10, shuffle=True, random_state=random_seed),
    refit_score='roc_auc_score',
)

#### Unfair setup
1. Train: NC2017_DEV + MFC18_DEV1 + MFC18_DEV2 + MFC18_GAN + MFC18_EVAL + MFC19_VAL => Splits and select best
2. Test: MFC18_EVAL + MFC19_VAL

##### Conclusions:
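Every dataset scores (almost exactly) 1.0, but the test sets are part of the training data in this setup, so these numbers only show that the models can fit the data; they say nothing about generalization.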

In [32]:
# Unfair setup: all six datasets are used for training, including the two that are tested on below.
# Labels are concatenated in the same dataset order as the frames so rows and labels line up.
train_vids = pd.concat(
    [ds.ffprobe_df for ds in (nc2017_dev, mfc18_dev1, mfc18_dev2, mfc18_gan, mfc18_eval, mfc19_val)],
    sort=False,
)
train_labels = (
    nc2017_dev.labels + mfc18_dev1.labels + mfc18_dev2.labels
    + mfc18_gan.labels + mfc18_eval.labels + mfc19_val.labels
)

test_vids = pd.concat(
    [ds.ffprobe_df for ds in (mfc18_eval, mfc19_val)],
    sort=False,
)
test_labels = mfc18_eval.labels + mfc19_val.labels

In [33]:
forest = RandomForestClassifier(max_depth=15, max_features='sqrt', min_samples_leaf=1,
                                 min_samples_split=2, n_estimators=120, random_state=random_seed,n_jobs=-1)
svm = SVC(C=150, class_weight='balanced', gamma='scale', random_state=random_seed, probability=True)

In [34]:
bt = mfc_video_utils.BasicTransformer(cat_threshold=1, num_strategy='mean', return_df=False)

In [35]:
ml_pipe_voting = Pipeline([('transformer', bt), 
                           ('ensembler', VotingClassifier(estimators=[('fr', forest), ('s', svm)], voting='soft', n_jobs=-1))])

In [36]:
ml_pipe_forest = Pipeline([('transformer', bt), 
                           ('forest', RandomForestClassifier(max_depth=15, max_features='sqrt', min_samples_leaf=1,
                                                             min_samples_split=2, n_estimators=120, random_state=random_seed, n_jobs=-1))])

In [37]:
ml_pipe_svm = Pipeline([('transformer', bt), 
                        ('SVM', SVC(C=150, class_weight='balanced', gamma='scale', random_state=random_seed, probability=True))])

In [38]:
ml_pipe_voting.fit(train_vids, train_labels)

Pipeline(memory=None,
     steps=[('transformer', BasicTransformer(cat_threshold=1, num_strategy='mean', return_df=False)), ('ensembler', VotingClassifier(estimators=[('fr', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=15, max_features='sqrt', max_leaf_nodes=None,
         ...=0.001, verbose=False))],
         flatten_transform=None, n_jobs=-1, voting='soft', weights=None))])

In [39]:
ml_pipe_forest.fit(train_vids, train_labels)

Pipeline(memory=None,
     steps=[('transformer', BasicTransformer(cat_threshold=1, num_strategy='mean', return_df=False)), ('forest', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=15, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impur...mators=120, n_jobs=-1,
            oob_score=False, random_state=123, verbose=0, warm_start=False))])

In [40]:
ml_pipe_svm.fit(train_vids, train_labels)

Pipeline(memory=None,
     steps=[('transformer', BasicTransformer(cat_threshold=1, num_strategy='mean', return_df=False)), ('SVM', SVC(C=150, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
  max_iter=-1, probability=True, random_state=123, shrinking=True,
  tol=0.001, verbose=False))])

In [41]:
test_datasets(ml_pipe_forest)

nc2017_dev: 1.0
mfc18_dev1: 1.0
mfc18_dev2: 1.0
mfc18_gan: 1.0
mfc18_eval: 1.0
mfc19_val: 1.0
Average ROC AUC: 1.0


In [42]:
test_datasets(ml_pipe_svm)

nc2017_dev: 1.0
mfc18_dev1: 1.0
mfc18_dev2: 1.0
mfc18_gan: 0.9997058823529412
mfc18_eval: 1.0
mfc19_val: 1.0
Average ROC AUC: 0.9999509803921569


In [43]:
test_datasets(ml_pipe_voting)

nc2017_dev: 1.0
mfc18_dev1: 1.0
mfc18_dev2: 1.0
mfc18_gan: 1.0
mfc18_eval: 1.0
mfc19_val: 1.0
Average ROC AUC: 1.0


In [44]:
save_object(ml_pipe_forest, 'best_forest_pipe_full.pkl')
save_object(ml_pipe_svm, 'best_svm_pipe_full.pkl')
save_object(ml_pipe_voting, 'best_voting_pipe_full.pkl')

In [None]:
# Grid search over the SVM pipeline with 10-fold stratified CV, refit on 'roc_auc_score'.
# The splitter is seeded explicitly so fold assignment does not depend on how much
# of NumPy's global RNG state earlier cells have consumed.
gs_svm = grid_search_svm(
    train_vids, train_labels, test_vids, test_labels, ml_pipe_svm,
    StratifiedKFold(n_splits=10, shuffle=True, random_state=random_seed),
    refit_score='roc_auc_score',
)

In [None]:
gs_svm.best_estimator_.predict_proba(test_vids)[:,1]

### Test Single Video Processor

In [None]:
voting_clf = load_object('best_voting_pipe_ganbase.pkl')

In [None]:
mfc19_processor = mfc_video_utils.MfcEvalVideoProcessor(voting_clf)

In [None]:
mfc19_val.csv

In [None]:
mfc19_processor(mfc19_val.get_video_path(13))

### Old Experiments

In [None]:
# Names of the `top_k` most important features of the tree step, most important first.
# NOTE(review): `basic_pipe` is not defined anywhere in the visible notebook; this cell only
# runs if a pipeline with steps named 'bt' and 'tree' has been created beforehand.
top_k = 10
# argsort of the negated importances gives indices in descending order of importance.
# Slice from 0: the previous `[1:top_k]` dropped the single most important feature and
# returned only top_k - 1 names.
(
    basic_pipe.named_steps['bt'].get_feature_names()
    [np.argsort(-basic_pipe.named_steps['tree'].feature_importances_)]
    [:top_k]
)