# Calculating ranking functions
For each model calculate average metrics for all the splits of each the model, or the ranking/ regret of the model when compared with the others in each split

In [1]:
import pandas as pd
import numpy as np
import json 
import os
from sklearn.metrics import precision_score, accuracy_score, recall_score

In [2]:
from IPython.display import display, HTML


In [3]:
# DataFrame.rolling -> simple moving average
# Weighted moving average sum(w*x) / sum(w)
# Exponential moving average
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.ewm.html

#these are calculated for all splits of each model
def ranking_functions(df):
    x = df.precision_test
    average = x.mean()
    #-> alfa = 0.25,0.5,0.75
    try:
        mean_invstd_25  = 0.25*x.mean() - (1-0.25)*(x.std()) 
    except:
        mean_invstd_25  = 0.25*x.mean()
    try:
        mean_invstd_50  = 0.5*x.mean() - (1-0.5)*(x.std() )
    except:
        mean_invstd_50  = 0.5*x.mean()
    try:
        mean_invstd_75  = 0.75*x.mean() - (1-0.75)*(x.std())
    except:
        mean_invstd_75  = 0.75*x.mean() 
    try:
        invstdev = -(x.std())
    except:
        invstdev = 0

    w = [0,1,2,3,4,5,6,7,8,9]
    weighted_avg = sum(w*x) / sum(w)
    ewm = x.ewm(span=10).mean().mean()
    return pd.Series((average,mean_invstd_25,mean_invstd_50,mean_invstd_75,invstdev,weighted_avg),index=['average','average_neg_std_25','average_neg_std_50','average_neg_std_75','neg_stdev','weighted_avg'])


#The regret and rank must be calculated for each split of all models
#Rank: for the highest precision => rank =1. next => rank =2, etc.
#Regret:for the first split, who has the highest precision? (high_prec)
#for that, the regret is 0. for the next one is (prec_2 - high_prec), etc.
def ranking_rank(df):
    df = pd.pivot_table(df, values='precision_test', index=['model','param_config'], columns='time')
    #display(df)
    rank = df.rank()
    #display(rank)
    return  rank.T

def ranking_regret(df):
    df = pd.pivot_table(df, values='precision_test', index=['model','param_config'], columns='time')
    #display(df)
    highest_metric = df.max()
    
    regret_row = (df-highest_metric).T
    #display(regret_row)
    return  regret_row

In [4]:
outputs_metrics.model.value_counts()

NameError: name 'outputs_metrics' is not defined

In [5]:
outputs_metrics

NameError: name 'outputs_metrics' is not defined

In [9]:
outputs_metrics = pd.read_csv("data/outputs_metrics_refined.csv")
outputs_metrics['model_config'] = outputs_metrics['model'] + outputs_metrics['param_config'].apply(str)

bad_models = outputs_metrics.query('precision_test <0.05')['model_config'].drop_duplicates()
bad_models = list(bad_models)

outputs_metrics = outputs_metrics.query(f'model_config != {bad_models}')

outputs_metrics.model.value_counts()
outputs_metrics.sort_values('precision_test',ascending=False).iloc[:50]

Unnamed: 0.1,Unnamed: 0,threshold,precision_test,precision_threshold,filename,model,param_config,time,model_config
1361,636,0.522302,1.0,0.0,"(1, 5, 41)_MLPClassifier_date_params.csv",MLPClassifier_date_params,41,1,MLPClassifier_date_params41
3416,2291,0.634269,0.760234,0.745902,"(6, 3, 46)_lgb_LGBMClassifier_date_params.csv",lgb_LGBMClassifier_date_params,46,6,lgb_LGBMClassifier_date_params46
2600,210,0.734533,0.756152,0.647343,"(0, 4, 18)_XGBClassifier_date_params.csv",XGBClassifier_date_params,18,0,XGBClassifier_date_params18
3486,2298,0.522587,0.755162,0.7,"(6, 3, 8)_lgb_LGBMClassifier_date_params.csv",lgb_LGBMClassifier_date_params,8,6,lgb_LGBMClassifier_date_params8
3376,2287,0.616067,0.750751,0.716535,"(6, 3, 42)_lgb_LGBMClassifier_date_params.csv",lgb_LGBMClassifier_date_params,42,6,lgb_LGBMClassifier_date_params42
3446,2294,0.602861,0.747801,0.733871,"(6, 3, 49)_lgb_LGBMClassifier_date_params.csv",lgb_LGBMClassifier_date_params,49,6,lgb_LGBMClassifier_date_params49
3046,2254,0.610308,0.744318,0.722222,"(6, 3, 12)_lgb_LGBMClassifier_date_params.csv",lgb_LGBMClassifier_date_params,12,6,lgb_LGBMClassifier_date_params12
3066,2256,0.543982,0.742857,0.689394,"(6, 3, 14)_lgb_LGBMClassifier_date_params.csv",lgb_LGBMClassifier_date_params,14,6,lgb_LGBMClassifier_date_params14
3116,2261,0.617845,0.741935,0.733871,"(6, 3, 19)_lgb_LGBMClassifier_date_params.csv",lgb_LGBMClassifier_date_params,19,6,lgb_LGBMClassifier_date_params19
3276,2277,0.649324,0.741379,0.722222,"(6, 3, 33)_lgb_LGBMClassifier_date_params.csv",lgb_LGBMClassifier_date_params,33,6,lgb_LGBMClassifier_date_params33


In [10]:
pd.set_option('display.max_rows', 500)

ranking_regret_average = outputs_metrics.groupby(['time']).apply(ranking_regret).T.mean(axis=1)
ranking_regret_average.name = 'regret_average'
#print(ranking_regret_average)

ranking_regret_std = -outputs_metrics.groupby(['time']).apply(ranking_regret).T.std(axis=1)
ranking_regret_std.name = 'regret_neg_std'
#print(ranking_regret_std)

ranking_regret_invstd = 1/outputs_metrics.groupby(['time']).apply(ranking_regret).T.std(axis=1)

ranking_regret_invstd.name = 'regret_invstd'
#print(ranking_regret_std)



mean_invstd_regret_25  = 0.25*ranking_regret_average + (1-0.25)*(ranking_regret_std)
mean_invstd_regret_25.name = 'mean_neg_std_regret_25'
mean_invstd_regret_50  = 0.5*ranking_regret_average + (1-0.5)*(ranking_regret_std)
mean_invstd_regret_50.name = 'mean_neg_std_regret_50'
mean_invstd_regret_75  = 0.75*ranking_regret_average + (1-0.75)*(ranking_regret_std)
mean_invstd_regret_75.name = 'mean_neg_std_regret_75'

ranking_rank_average = outputs_metrics.groupby(['time']).apply(ranking_rank).T.mean(axis=1)
ranking_rank_average.name = 'rank_average'

ranking_rank_std = -outputs_metrics.groupby(['time']).apply(ranking_rank).T.std(axis=1)
ranking_rank_std.name = 'rank_neg_std'
print(ranking_rank_std)
ranking_rank_invstd = 1/outputs_metrics.groupby(['time']).apply(ranking_rank).T.std(axis=1)
ranking_rank_invstd.name = 'rank_invstd'

mean_invstd_rank_25  = 0.25*ranking_rank_average + (1-0.25)*(ranking_rank_std)
mean_invstd_rank_25.name = 'mean_neg_std_rank_25'

mean_invstd_rank_50  = 0.5*ranking_rank_average + (1-0.5)*(ranking_rank_std)
mean_invstd_rank_50.name = 'mean_neg_std_rank_50'

mean_invstd_rank_75  = 0.75*ranking_rank_average + (1-0.75)*(ranking_rank_std)
mean_invstd_rank_75.name = 'mean_neg_std_rank_75'


metrics_p_modelconifg = outputs_metrics.groupby(['model','param_config']).apply(ranking_functions)


model                               param_config
DecisionTreeClassifier_date_params  0              -21.819716
                                    2              -22.853154
                                    4              -30.070334
                                    6              -99.603045
                                    8              -30.842971
                                    10             -29.649808
                                    11             -42.778629
                                    16             -26.440919
                                    18             -25.929177
                                    20             -72.791197
                                    23             -28.455814
                                    24             -40.896753
                                    25             -31.975685
                                    26             -22.405357
                                    28             -28.276610
                     

In [11]:
ranking_metrics = pd.concat([metrics_p_modelconifg,\
                             ranking_regret_average,
                             ranking_regret_std,
                             mean_invstd_regret_25,
                            mean_invstd_regret_50,
                            mean_invstd_regret_75,
                             ranking_rank_average,
                            ranking_rank_std,
                            mean_invstd_rank_25,
                            mean_invstd_rank_50,
                            mean_invstd_rank_75], axis=1)\
                            .replace(to_replace=np.inf,value = 0,)
ranking_metrics

Unnamed: 0_level_0,Unnamed: 1_level_0,average,average_neg_std_25,average_neg_std_50,average_neg_std_75,neg_stdev,weighted_avg,regret_average,regret_neg_std,mean_neg_std_regret_25,mean_neg_std_regret_50,mean_neg_std_regret_75,rank_average,rank_neg_std,mean_neg_std_rank_25,mean_neg_std_rank_50,mean_neg_std_rank_75
model,param_config,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
DecisionTreeClassifier_date_params,0,0.410483,0.07035,0.183728,0.297105,-0.043028,0.394516,-0.296833,-0.102475,-0.151065,-0.199654,-0.248243,33.9,-21.819716,-7.889787,6.040142,19.970071
DecisionTreeClassifier_date_params,2,0.440542,0.070704,0.193983,0.317263,-0.052575,0.420317,-0.266774,-0.096833,-0.139318,-0.181804,-0.224289,44.4,-22.853154,-6.039866,10.773423,27.586711
DecisionTreeClassifier_date_params,4,0.409048,0.051623,0.170765,0.289906,-0.067518,0.384375,-0.298268,-0.108031,-0.155591,-0.20315,-0.250709,35.65,-30.070334,-13.640251,2.789833,19.219916
DecisionTreeClassifier_date_params,6,0.445182,0.012366,0.156638,0.30091,-0.131906,0.391543,-0.262134,-0.136223,-0.167701,-0.199179,-0.230657,82.1,-99.603045,-54.177284,-8.751523,36.674239
DecisionTreeClassifier_date_params,8,0.47735,0.075258,0.209288,0.343319,-0.058773,0.459168,-0.229966,-0.090517,-0.12538,-0.160242,-0.195104,58.8,-30.842971,-8.432229,13.978514,36.389257
DecisionTreeClassifier_date_params,10,0.459319,0.040767,0.180285,0.319802,-0.09875,0.425378,-0.247997,-0.131586,-0.160688,-0.189791,-0.218894,46.0,-29.649808,-10.737356,8.175096,27.087548
DecisionTreeClassifier_date_params,11,0.460737,0.073154,0.202349,0.331543,-0.05604,0.428519,-0.246579,-0.10388,-0.139554,-0.175229,-0.210904,56.7,-42.778629,-17.908972,6.960685,31.830343
DecisionTreeClassifier_date_params,16,0.46899,0.077968,0.208309,0.338649,-0.052372,0.445899,-0.238326,-0.095022,-0.130848,-0.166674,-0.2025,56.7,-26.440919,-5.65569,15.12954,35.91477
DecisionTreeClassifier_date_params,18,0.418888,0.061367,0.180541,0.299715,-0.057806,0.399245,-0.288428,-0.093002,-0.141858,-0.190715,-0.239571,35.1,-25.929177,-10.671883,4.585411,19.842706
DecisionTreeClassifier_date_params,20,0.430241,0.051435,0.177704,0.303972,-0.074833,0.395264,-0.277075,-0.132764,-0.168842,-0.20492,-0.240997,58.15,-72.791197,-40.055898,-7.320598,25.414701


In [12]:
ranking_metrics.to_csv('ranking_metrics.csv')

In [13]:
best_config_p_metric =   ranking_metrics.idxmax()\
#    .groupby('model').apply(idxmax_param_config)

best_config_p_metric

average                            (XGBClassifier_date_params, 32)
average_neg_std_25                 (XGBClassifier_date_params, 32)
average_neg_std_50                 (XGBClassifier_date_params, 32)
average_neg_std_75                 (XGBClassifier_date_params, 32)
neg_stdev                 (DecisionTreeClassifier_date_params, 37)
weighted_avg                       (XGBClassifier_date_params, 35)
regret_average                     (XGBClassifier_date_params, 32)
regret_neg_std                     (MLPClassifier_date_params, 41)
mean_neg_std_regret_25             (MLPClassifier_date_params, 41)
mean_neg_std_regret_50        (lgb_LGBMClassifier_date_params, 39)
mean_neg_std_regret_75        (lgb_LGBMClassifier_date_params, 39)
rank_average                  (lgb_LGBMClassifier_date_params, 30)
rank_neg_std                                  (SVC_date_params, 0)
mean_neg_std_rank_25          (lgb_LGBMClassifier_date_params, 30)
mean_neg_std_rank_50          (lgb_LGBMClassifier_date_params,

In [14]:
model_series = []
params_series = []
for _, model_config in best_config_p_metric.iteritems():
    print(model_config)
    model = model_config[0]
    param_config =  model_config[1] 
    model_series.append(model)
    params_series.append(param_config)

best_config_p_metric = best_config_p_metric.to_frame().drop(0,axis=1)
best_config_p_metric['model'] = model_series
best_config_p_metric['param_config'] = params_series
best_config_p_metric

('XGBClassifier_date_params', 32)
('XGBClassifier_date_params', 32)
('XGBClassifier_date_params', 32)
('XGBClassifier_date_params', 32)
('DecisionTreeClassifier_date_params', 37)
('XGBClassifier_date_params', 35)
('XGBClassifier_date_params', 32)
('MLPClassifier_date_params', 41)
('MLPClassifier_date_params', 41)
('lgb_LGBMClassifier_date_params', 39)
('lgb_LGBMClassifier_date_params', 39)
('lgb_LGBMClassifier_date_params', 30)
('SVC_date_params', 0)
('lgb_LGBMClassifier_date_params', 30)
('lgb_LGBMClassifier_date_params', 30)
('lgb_LGBMClassifier_date_params', 30)


Unnamed: 0,model,param_config
average,XGBClassifier_date_params,32
average_neg_std_25,XGBClassifier_date_params,32
average_neg_std_50,XGBClassifier_date_params,32
average_neg_std_75,XGBClassifier_date_params,32
neg_stdev,DecisionTreeClassifier_date_params,37
weighted_avg,XGBClassifier_date_params,35
regret_average,XGBClassifier_date_params,32
regret_neg_std,MLPClassifier_date_params,41
mean_neg_std_regret_25,MLPClassifier_date_params,41
mean_neg_std_regret_50,lgb_LGBMClassifier_date_params,39


In [15]:
best_config_p_metric.to_csv('data/best_model_p_func.csv')

In [226]:
X_val.to_csv('data/X_validation.csv')
y_val.to_csv('data/y_validation.csv')

In [227]:
best_config_p_metric

average                    (XGBClassifier, 20)
average_std_25             (MLPClassifier, 17)
average_std_50             (MLPClassifier, 17)
average_std_75    (RandomForestClassifier, 25)
stdev                      (MLPClassifier, 16)
weighted_avg               (XGBClassifier, 20)
ewm_              (RandomForestClassifier, 37)
regret_average             (XGBClassifier, 20)
rank_average      (RandomForestClassifier, 34)
dtype: object

In [239]:
from sklearn.metrics import accuracy_score, recall_score

def find_threshold(y_true_th,y_proba_th, metric_1, metric_2, min_metric_2= 0.05,maximize_metric_2 = False):
    
    min_true_for_metric_1 = y_true_th.sum()*min_metric_2
    y_proba_cum = pd.DataFrame([y_proba_th,y_true_th],index=['y_proba_th','y_true_th']).T\
        .sort_values('y_proba_th',ascending = False)
    y_proba_cum['cumulative'] = y_proba_cum.y_true_th.cumsum()
    
    min_threshold = y_proba_cum.query(f'cumulative >= {min_true_for_metric_1}').iloc[0].y_proba_th
    metric = metric_1(y_true_th,y_proba_th>min_threshold)
    return min_threshold,metric
    
clfs = [NeuralNetClassifier,
         RandomForestClassifier,
        LogisticRegression,
       DecisionTreeClassifier,
        lgb.LGBMClassifier,
        XGBClassifier,
        MLPClassifier,
        SVC
       ]
clfs_names = ['NeuralNetClassifier',
         'RandomForestClassifier',
        'LogisticRegression',
       'DecisionTreeClassifier',
        "lgb_LGBMClassifier",
        'XGBClassifier',
        'MLPClassifier',
        'SVC'
       ]



clfs_names_dict = dict(zip(clfs_names,clfs))


def get_metrics_prod(file_test,model_name_test,y_true):
    #get any dict, they are all the same
    params_test = pd.read_csv(file_test).get_p.iloc[0].replace("nan", "None")
    


    #Create best model config acording to avg
    best_model_config_test = clfs_names_dict[model_name_test](**eval(params_test))
    
    #train on the whole train
    best_model_config_test = best_model_config_test.fit(X_train,y_train)
    y_prod_pred = best_model_config_test.predict_proba(X_test)[:,1]
    
    th,metric = find_threshold(y_true,y_prod_pred, precision_score, recall_score, min_metric_2= 0.05)
    
    print(np.array(y_prod_pred))
    print(np.array(y_true))
    return th, accuracy_score(y_true,y_prod_pred>th),best_model_config_test
    
def get_test_threshold(model,param_config):
    #get any filename to get the params
    file_test = 'outputs_models3/'
    data_model_config = outputs_metrics.query(f"model =='{model}' & param_config == {param_config}")
    print(data_model_config)
    file_test =file_test + data_model_config.filename.iloc[0]
    print(file_test)
    th,score_test,model = get_metrics_prod(file_test,model,y_test.values.astype(float))
    print("threshold test:",th)
    print("score test:",score_test)
    return(model,param_config,th,score_test,model)  

In [240]:
results = []

In [7]:
outputs_metrics.sort_values('metric',ascending = False)#.query('model == "MLPClassifier"')

Unnamed: 0.1,Unnamed: 0,threshold,metric,filename,model,param_config,time
2447,2794,0.367672,1.0,"(7, 6, 49)_SVC_date_params.csv",SVC_date_params,49,7
2137,2763,0.367641,1.0,"(7, 6, 20)_SVC_date_params.csv",SVC_date_params,20,7
1365,2036,0.444142,1.0,"(5, 5, 41)_MLPClassifier_date_params.csv",MLPClassifier_date_params,41,5
1322,982,0.271605,1.0,"(2, 5, 38)_MLPClassifier_date_params.csv",MLPClassifier_date_params,38,2
1321,632,0.265734,1.0,"(1, 5, 38)_MLPClassifier_date_params.csv",MLPClassifier_date_params,38,1
...,...,...,...,...,...,...,...
2125,2062,0.408383,0.0,"(5, 6, 2)_SVC_date_params.csv",SVC_date_params,2,5
2124,1712,0.374134,0.0,"(4, 6, 2)_SVC_date_params.csv",SVC_date_params,2,4
2123,1362,0.338837,0.0,"(3, 6, 2)_SVC_date_params.csv",SVC_date_params,2,3
2122,1012,0.271571,0.0,"(2, 6, 2)_SVC_date_params.csv",SVC_date_params,2,2


In [243]:
i=0
    
for metric,(model,param_config) in best_config_p_metric.iteritems():
    if i>=1:
        print(metric)
        print(model)
        print(config)
        results.append( get_test_threshold(model,config))
    i+=1

average_std_25
MLPClassifier
34
Empty DataFrame
Columns: [Unnamed: 0, threshold, metric, filename, model, param_config, time]
Index: []


IndexError: single positional indexer is out-of-bounds

In [195]:
i=0
for model,row in best_config_p_metric.iterrows():
    print(model)
    if i>=7:
        for metric_name in row.index:
            print(metric_name)
            results.append( get_test_threshold(model,metric_name))
    i+=1
    

AttributeError: 'Series' object has no attribute 'iterrows'

In [23]:
import pickle 
filename = 'results_best_models.pkl'
filehandler = open(filename, 'wb') 
pickle.dump(results, filehandler)

Unnamed: 0,project_features_entity_id_all_grade_level_Grades35_sum,project_features_entity_id_all_grade_level_Grades68_sum,project_features_entity_id_all_grade_level_Grades912_sum,project_features_entity_id_all_grade_level_GradesPreK2_sum,project_features_entity_id_all_grade_level__NULL_sum,project_features_entity_id_all_poverty_level__NULL_sum,project_features_entity_id_all_poverty_level_highpov_sum,project_features_entity_id_all_poverty_level_highest_sum,project_features_entity_id_all_poverty_level_lowpove_sum,project_features_entity_id_all_poverty_level_moderate_sum,...,donation_features_entity_id_all_teacher_funding_rate_2yr_sum,donation_features_entity_id_all_teacher_funding_rate_2yr_imp,donation_features_entity_id_all_zip_avg_donations_1yr_sum,donation_features_entity_id_all_zip_avg_donations_1yr_imp,donation_features_entity_id_all_zip_avg_donations_2yr_sum,donation_features_entity_id_all_zip_avg_donations_2yr_imp,donation_features_entity_id_all_zip_funding_rate_1yr_sum,donation_features_entity_id_all_zip_funding_rate_1yr_imp,donation_features_entity_id_all_zip_funding_rate_2yr_sum,donation_features_entity_id_all_zip_funding_rate_2yr_imp
98741,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,1.0,355.57000,0.0,251.75000,0.0,0.500000,0.0,0.500000,0.0
98742,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.818182,0.0,316.27290,0.0,318.64944,0.0,0.941860,0.0,0.899543,0.0
98743,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,548.72000,0.0,419.73572,0.0,0.500000,0.0,0.625000,0.0
98744,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,1.0,0.00000,1.0,141.05000,0.0,0.000000,0.0,0.400000,0.0
98745,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,1.0,0.00000,1.0,357.49500,0.0,0.000000,0.0,0.357143,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138543,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,1.0,89.25000,0.0,106.25000,0.0,0.000000,0.0,0.000000,0.0
138544,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,1.0,21.25000,0.0,717.56665,0.0,0.000000,0.0,0.400000,0.0
138545,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.666667,0.0,219.34000,0.0,229.29000,0.0,0.727273,0.0,0.666667,0.0
138546,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.500000,0.0,217.74342,0.0,238.70294,0.0,0.652174,0.0,0.523077,0.0
