In [1]:
%load_ext autoreload
%autoreload

In [2]:
import numpy as np
import pandas as pd
import time
import gc; gc.enable()
import time
import sys

pd.options.display.float_format = "{:.3f}".format
np.set_printoptions(precision=4)

import matplotlib.pyplot as plt
%matplotlib inline

sys.path.append('modules')

from shared_functions import *

import pickle
from datetime import datetime

  self.re = re.compile( self.reString )


In [3]:
# from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity = "all"

In [4]:
def load_results(file_name):
    """Return the object stored in the pickle file at ``file_name``.

    The file is opened in binary mode and is closed again before the
    function returns. ``pickle.load`` detects the protocol version on its
    own, so none is passed.

    NOTE: unpickling can execute arbitrary code; only load result files
    that come from a trusted run.
    """
    with open(file_name, 'rb') as handle:
        return pickle.load(handle)

def filter_top_model_results(top_models, all_model_results):
    """Collect the plotting tuples of the runs listed in ``top_models``.

    For every row of ``top_models`` (read through ``iterrows``), each entry
    of each collection in ``all_model_results`` is compared on its first
    three items against the row's ``label``, ``classifier`` and
    ``sampling_method``. A matching entry must unpack into exactly ten items.

    Returns a list of tuples
    ``("<model> :: <sampling>", tpr, fpr, roc_auc, precision, recall, prc_auc)``
    ordered by row first and by position in ``all_model_results`` second.
    """
    selected = []
    for _, wanted in top_models.iterrows():
        # Flatten the nested collections lazily; the scan order is unchanged.
        candidates = (entry for batch in all_model_results for entry in batch)
        for entry in candidates:
            is_match = (entry[0] == wanted.label
                        and entry[1] == wanted.classifier
                        and entry[2] == wanted.sampling_method)
            if not is_match:
                continue
            (_label, model_name, sampling_method, _unused,
             tpr, fpr, roc_auc, precision, recall, prc_auc) = entry
            caption = "{} :: {}".format(model_name, sampling_method)
            selected.append((caption, tpr, fpr, roc_auc, precision, recall, prc_auc))
    return selected

def rebuild_results(res):
    """Split stored experiment runs into one metrics table and a result list.

    Every element of ``res`` is read positionally: item ``[1]`` is a metrics
    frame and item ``[2]`` is that run's result payload.

    Returns ``(metrics, results)`` where ``metrics`` is the row-wise
    concatenation of all item-``[1]`` frames (original index labels kept)
    and ``results`` is the list of item-``[2]`` payloads in input order.
    An empty ``res`` yields an empty DataFrame and an empty list.
    """
    metric_frames = [run[1] for run in res]
    temp_results = [run[2] for run in res]

    if not metric_frames:
        # pd.concat refuses an empty list; keep the empty-frame result.
        return pd.DataFrame(), temp_results

    # One concat over all frames instead of growing a frame inside the loop,
    # which re-copied the accumulated rows on every iteration.
    return pd.concat(metric_frames), temp_results

In [5]:
# Columns of the metrics table that go into the report tables, in display
# order. Currently disabled entries: 'fp', 'fn', 'f1_score'.
measurements = [
    'classifier', 'sampling_method',
    'balanced_accuracy', 'recall', 'precision',
    'train_time', 'aucroc', 'auprc',
    'model_churn_cost',
]

# Short column header for each entry of `measurements`, position by position
# (disabled entry: 'f1').
measurements_alias = [
    'classifier', 'sampling',
    'bal_acc', 'recall', 'precision',
    'train_time', 'auroc', 'auprc',
    'churn_cost',
]

## Summarise performance of the approaches

In [6]:
## First Experiment:
## Approach 1.1 - Churn BasicData + Default Hyper
# NOTE(review): the header above names Approach 1.1, but the only active load
# below is exp2_2 (Manual_RdmSearch) - confirm which experiment is intended.

# Exactly one result file is loaded; the other experiments are kept commented
# out so they can be swapped in by hand.
#exp1_1 = load_results('/home/dissertation/code/RESULTS_FINAL/Basic_DefaultHyper_2019-08-12.pickle')

#exp2_1 = load_results('/home/dissertation/code/RESULTS_FINAL/Manual_GridSearch_2019-08-10.pickle')

exp2_2 = load_results('/home/dissertation/code/RESULTS_FINAL/Manual_RdmSearch_2019-08-10.pickle')

#exp3_1 = load_results('/home/dissertation/code/RESULTS_FINAL/DFS_Default_2019-08-10.pickle')

#exp4_1 = load_results('/home/dissertation/code/RESULTS_FINAL/Basic_Asklearn_20190813.pickle')

# Split the loaded runs into the metrics table (exp_metrics) and the list of
# per-run result payloads (exp_results) that the following cells read.
exp_metrics, exp_results = rebuild_results(exp2_2)

In [7]:
#exp_metrics[['classifier', 'sampling_method','accuracy']].groupby(['sampling_method']).count()

In [8]:
exp_metrics.head()

Unnamed: 0,label,classifier,sampling_method,tn,fn,tp,fp,accuracy,precision,recall,...,log_loss,train_time,cv_time,aucroc,auprc,balanced_accuracy,cv_score_mean,cv_score_std,model_churn_cost,sample
0,NB_random_5cv_recall,GaussianNB,,201877,10359,13702,31549,0.837,0.303,0.569,...,5.622,3.645,2.641,0.765,0.412,0.717,0.57,0.003,9704600,0
0,DT_random_5cv_recall,DecisionTreeClassifier,,191512,5517,18544,41914,0.816,0.307,0.771,...,6.362,2002.497,39.054,0.864,0.593,0.796,0.765,0.006,8804300,0
0,RF_random_5cv_recall,RandomForestClassifier,,200034,5976,18085,33392,0.847,0.351,0.752,...,5.281,1665.059,32.691,0.881,0.578,0.804,0.744,0.005,8135700,0
0,LR_random_5cv_recall,LogisticRegression,,134226,8164,15897,99200,0.583,0.138,0.661,...,14.402,178.124,2.52,0.683,0.32,0.618,0.65,0.004,15591700,0
0,MLP_random_5cv_recall,MLPClassifier,,231843,18226,5835,1583,0.923,0.787,0.243,...,2.657,2998.235,377.757,0.838,0.51,0.618,0.294,0.088,9854800,0


In [9]:
# NOTE: this cell overwrites the columns it reads, so running it a second
# time applies every conversion again.

# Convert train_time and cv_time by dividing by 60
# (presumably seconds -> minutes - confirm against the run that produced them).
exp_metrics['train_time'] = exp_metrics['train_time'].div(60)
exp_metrics['cv_time'] = exp_metrics['cv_time'].div(60)

# Convert model_churn_cost to EUR from TWD, rounded to whole units.
exp_metrics['model_churn_cost'] = exp_metrics['model_churn_cost'].mul(0.029).round().astype(int)

In [10]:
# Ranking criteria as (metric, sort ascending?) pairs, in priority order.
# They are split into the two parallel lists that sort_values expects.
measure_by, measure_by_sort = map(list, zip(
    ('balanced_accuracy', False),
    ('recall', False),
    ('model_churn_cost', True),
))

# measure_by = ['balanced_accuracy']
# measure_by_sort = [False]

# measure_by = ['model_churn_cost']
# measure_by_sort = [True]

# measure_by = ['f1_score']
# measure_by_sort = [False]

### The 5 best and 5 worst performing models, ranked by the metrics in `measure_by`

In [11]:
def _rank_for_report(metrics, by, ascending, columns, n=5):
    """Return the first ``n`` rows of ``metrics`` sorted by ``by``.

    Only ``columns`` are kept, and the 'Classifier' suffix is stripped from
    the ``classifier`` column. ``assign`` builds a new frame, so nothing is
    written onto a slice of ``metrics``.
    """
    ranked = metrics.sort_values(by, ascending=ascending)[columns].head(n)
    short_names = ranked['classifier'].apply(lambda v: str(v).replace('Classifier', ''))
    return ranked.assign(classifier=short_names)


print("Getting best results for {} / ordered ascending? {}".format(measure_by, measure_by_sort))
top_results = _rank_for_report(exp_metrics, measure_by, measure_by_sort, measurements)

### The lowest 5 performing models: same ranking with every sort direction flipped
worst_order = [not f for f in measure_by_sort]
print("Getting worst results for {} / ordered ascending? {}".format(measure_by, worst_order))
worst_results = _rank_for_report(exp_metrics, measure_by, worst_order, measurements)

# Stack best above worst and switch to the short report headers.
temp = pd.concat([top_results, worst_results])
temp.columns = measurements_alias

print(temp.to_latex(index=False))

Getting best results for ['balanced_accuracy', 'recall', 'model_churn_cost'] / ordered ascending? [False, False, True]
Getting worst results for ['balanced_accuracy', 'recall', 'model_churn_cost'] / ordered ascending? [True, True, False]
\begin{tabular}{llrrrrrrr}
\toprule
         classifier & sampling &  bal\_acc &  recall &  precision &  train\_time &  auroc &  auprc &  churn\_cost \\
\midrule
       RandomForest &  ROS 2:1 &    0.809 &   0.764 &      0.351 &      40.997 &  0.886 &  0.600 &      234120 \\
       RandomForest &  ROS 1:1 &    0.809 &   0.757 &      0.361 &      64.087 &  0.887 &  0.603 &      231295 \\
       RandomForest &  ROS 3:1 &    0.808 &   0.749 &      0.369 &      32.605 &  0.887 &  0.601 &      229219 \\
       RandomForest &  ROS 3:2 &    0.808 &   0.758 &      0.354 &      51.502 &  0.885 &  0.594 &      233595 \\
       RandomForest &  RUS 1:1 &    0.805 &   0.768 &      0.334 &       2.940 &  0.880 &  0.573 &      241619 \\
                XGB &     None

In [None]:
# exp_metrics[['label','classifier','sampling_method','accuracy','precision','recall','tp','tn','fp','fn']]

### Summary performance statistics of classifiers over all sampling techniques

In [None]:
## NOTE(review): the remark originally here said LR and MLP are excluded, but
## no filter is applied below - every classifier in exp_metrics is summarised.

# Output-column name -> aggregation. String names are used for the built-in
# reductions so pandas dispatches to its own groupby implementations.
# NOTE: NumPy >= 1.22 renames np.percentile's `interpolation` keyword to
# `method`; the old spelling is kept for the environment this was written in.
stats_summary = {'best': 'max',
                 'median': 'median',
                 'iqr': lambda x: np.percentile(x, 75, interpolation='higher') - np.percentile(x, 25, interpolation='lower'),
                 #'range':lambda x: max(x) - min(x)
                }

# pandas >= 1.0 rejects a dict nested inside the agg dict ("nested renamer is
# not supported"). A list of (name, function) pairs yields the same
# (column, name) MultiIndex and works on older versions too.
named_stats = list(stats_summary.items())

#[~exp_metrics.recall.isin([0.0, 1.0])]
summary = (
    exp_metrics
    .groupby(['classifier'])
    .agg({
        'balanced_accuracy': named_stats,
        'recall': named_stats,
        'precision': named_stats,
        'model_churn_cost': [('best', 'min')],
        #'aucroc': named_stats,
        #'auprc': named_stats,
        'train_time': 'median',
    })
    .reset_index()
    .sort_values(('balanced_accuracy', 'best'), ascending=False)
)
summary
#print(summary.to_latex(index=False))

### The best performing sampling method for each classifier, ranked by the metrics in `measure_by`

In [None]:
print("Getting results for {} / ordered ascending? {}".format(measure_by, measure_by_sort))

# After sorting, the first row of each classifier group is its best-ranked run.
top_results = (
    exp_metrics
    .sort_values(measure_by, ascending=measure_by_sort)
    .groupby('classifier')
    .head(1)[measurements]
)
# Drop the 'Classifier' suffix for display.
top_results['classifier'] = top_results['classifier'].map(lambda v: str(v).replace('Classifier', ''))
top_results

### The best performing model for each sampling method, ranked by the metrics in `measure_by`

In [None]:
print("Getting results for {} / ordered ascending? {}".format(measure_by, measure_by_sort))

# Only runs whose recall is strictly between 0 and 1 take part; after sorting,
# the first row of each sampling_method group is that method's best run.
top_results = (
    exp_metrics[exp_metrics['recall'].gt(0) & exp_metrics['recall'].lt(1)]
    .sort_values(measure_by, ascending=measure_by_sort)
    .groupby('sampling_method')
    .head(1)
)
top_results = top_results[measurements]
top_results.sort_values('balanced_accuracy', ascending=False)

### Classifier train time distribution by sampling method

In [None]:
fig, ax = plt.subplots(figsize=(10,8))
exp_metrics.boxplot(column='train_time', by='sampling_method', ax=ax)
ax.set_title('Classifier Training Time Distribution\n', size=17)
# Blank the figure-level title so only the axes title above is shown.
fig.suptitle(' ')
ax.set_xlabel('Sampling Method')
# train_time was divided by 60 in the unit-conversion cell, so the axis is in
# minutes; the label previously said 'Seconds'. Labels are set after
# boxplot() so the plotting call cannot overwrite them.
ax.set_ylabel('Minutes');

### For this approach plot the best models by classifier

In [None]:
print("Getting best results by classifier based on {} / ordered ascending? {}".format(measure_by, measure_by_sort))

# One row per classifier: the first row of its group after ranking.
plot_results = (
    exp_metrics
    .sort_values(measure_by, ascending=measure_by_sort)
    .groupby('classifier')
    .head(1)
)
# Look up the curve data of those runs and hand it to the shared plot helper.
plot_data = filter_top_model_results(plot_results, exp_results)
plot_roc_prc(plot_data, ' ')

### Choosing the best model and appending it to the overall `top_models` table

In [None]:
# Switch the working experiment: from here on exp_metrics / exp_results hold
# the Basic_DefaultHyper runs instead of the ones loaded at the top.
# NOTE(review): the unit conversions applied earlier to exp_metrics
# (times / 60, cost * 0.029) are not repeated after this reload - confirm.
exp1_1 = load_results('/home/dissertation/code/RESULTS_FINAL/Basic_DefaultHyper_2019-08-12.pickle')
exp_metrics, exp_results = rebuild_results(exp1_1)
# NOTE(review): the two files below are read from the current working
# directory and are not referenced again in the visible cells.
manualgrid = load_results('Manual_GridSearch_2019-08-05.pickle')

manualrdm = load_results('Manual_RdmSearch_2019-08-05.pickle')

In [None]:
top_models = pd.DataFrame()

In [None]:
# Add the best-ranked run of the current experiment to the running table.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# with sort=False produces the same row-wise result on old and new versions.
top_models = pd.concat(
    [top_models, exp_metrics.sort_values(measure_by, ascending=measure_by_sort).head(1)],
    sort=False,
)

In [None]:
#filter_top_model_results(exp_metrics.sort_values(measure_by, ascending=measure_by_sort).head(1), exp_results)

In [None]:
from scipy import stats
import scikit_posthocs as sp

# Element [-2] of every plot_data tuple is the `recall` item placed there by
# filter_top_model_results.
# NOTE(review): the names below assume plot_data holds at least six entries in
# the order XGB, RF, DT, GNB, LR, MLP. That order comes from the sort in the
# plotting cell and is not checked here - verify before trusting the labels.
xgb = plot_data[0][-2]
rf = plot_data[1][-2]
dt = plot_data[2][-2]
gnb = plot_data[3][-2]
lr = plot_data[4][-2]
mlp = plot_data[5][-2]

# One sample per classifier, passed as separate groups to the test below.
data = [xgb, rf, dt, gnb, lr, mlp]

# Kruskal-Wallis H-test across the six samples.
stats.kruskal(*data)

In [None]:
sp.posthoc_conover(data)

In [None]:
#all_metrics[all_metrics.recall < 1.0].sort_values(['recall'], ascending=[False])

In [None]:
#all_metrics.sort_values(['recall','precision'], ascending=[False, False])

In [None]:
print(top_results.to_latex(index=False)) # doctest: +NORMALIZE_WHITESPACE

In [None]:
import scipy.stats as ss
import statsmodels.api as sa
import scikit_posthocs as sp
import statsmodels.formula.api as sfa

# Worked example on the iris dataset, independent of the churn results above.
# NOTE: `data` is rebound here, replacing the list built for the classifier
# comparison in the earlier cell.
# presumably get_rdataset fetches the data remotely - TODO confirm
df = sa.datasets.get_rdataset('iris').data
# One array of Sepal.Width values per Species group.
data = [df.loc[ids, 'Sepal.Width'].values for ids in df.groupby('Species').groups.values()]

In [None]:
# Kruskal-Wallis H-test over the per-species samples; prints the statistic
# and the p-value.
H, p = ss.kruskal(*data)
print(H, p)

In [None]:
# Pairwise Conover post-hoc comparison of Sepal.Width between species, with
# Holm-adjusted p-values. Uses the dotted column name, so this must run
# before the columns are renamed in the next cell.
sp.posthoc_conover(df, val_col='Sepal.Width', group_col='Species', p_adjust = 'holm')

In [None]:
# Replace the dotted column names with underscore names; the formula and the
# t-test cells below refer to 'Sepal_Width'.
df.columns = ['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width', 'Species']

In [None]:
# Fit an OLS model of Sepal_Width on Species (treated as categorical) and
# print the ANOVA table for that fit.
lm = sfa.ols('Sepal_Width ~ C(Species)', data=df).fit()
anova = sa.stats.anova_lm(lm)
print(anova)

In [None]:
# Pairwise t-test post-hoc comparison of Sepal_Width between species, with
# Holm-adjusted p-values (uses the renamed column).
sp.posthoc_ttest(df, val_col='Sepal_Width', group_col='Species', p_adjust='holm')