In [1]:
import pandas as pd
import numpy as np
import json 
import os
from sklearn.metrics import precision_score, accuracy_score, recall_score

In [2]:
import lightgbm as lgb
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier

from torch import nn
import torch.nn.functional as F
from skorch import NeuralNetClassifier

from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
#from sklearn.grid_search import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

class MyModule(nn.Module):
    """Small feed-forward network that outputs probabilities for two classes.

    Architecture: Linear(input_dim -> num_units) -> nonlin -> Dropout(0.5)
    -> Linear(num_units -> 10) -> ReLU -> Linear(10 -> 2) -> softmax.

    Parameters
    ----------
    num_units : int, default 10
        Width of the first hidden layer.
    nonlin : callable, default ``F.relu``
        Activation applied after the first linear layer.
    input_dim : int, default 111
        Number of input features. The default keeps the previously
        hard-coded width.
    """

    def __init__(self, num_units=10, nonlin=F.relu, input_dim=111):
        super(MyModule, self).__init__()

        self.dense0 = nn.Linear(input_dim, num_units)
        self.nonlin = nonlin
        self.dropout = nn.Dropout(0.5)
        self.dense1 = nn.Linear(num_units, 10)
        self.output = nn.Linear(10, 2)

    def forward(self, X, **kwargs):
        """Return a ``(batch, 2)`` tensor of class probabilities.

        ``X`` is cast to float first, so integer or double inputs are accepted.
        Extra keyword arguments are accepted and ignored.
        """
        X = self.nonlin(self.dense0(X.float()))
        X = self.dropout(X)
        X = F.relu(self.dense1(X))
        # The original line was truncated ("F.softma") and never used the
        # output layer; normalise its logits over the class axis.
        X = F.softmax(self.output(X), dim=-1)
        return X

In [3]:
# DataFrame.rolling -> simple moving average
# Weighted moving average sum(w*x) / sum(w)
# Exponential moving average
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.ewm.html

def ranking_functions(df):
    """Summarise the ``metric`` column of one group with three statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``metric`` column; rows are taken in their existing
        order.

    Returns
    -------
    pandas.Series
        Indexed by ``average_``, ``weighted_avg`` and ``ewm_``:

        * ``average_``     - plain mean of the metric.
        * ``weighted_avg`` - ``sum(w * x) / sum(w)`` with weights
          ``0, 1, ..., n-1`` (the first row gets weight 0, the last row the
          largest weight).
        * ``ewm_``         - mean of the exponentially weighted mean
          (``span=10``).
    """
    x = df.metric
    average = x.mean()

    # Derive the weights from the group size rather than assuming exactly
    # ten rows; for ten rows this is the same 0..9 ramp as before.
    w = np.arange(len(x))
    total_weight = w.sum()
    if total_weight == 0:
        # Zero or one row: the ramp has no mass, so fall back to the mean
        # instead of dividing by zero.
        weighted_avg = average
    else:
        weighted_avg = sum(x * w) / total_weight

    ewm = x.ewm(span=10).mean().mean()
    return pd.Series((average, weighted_avg, ewm), index=['average_', 'weighted_avg', 'ewm_'])
    
    

In [4]:
def idxmax_param_config(df):
    """Return the ``param_config`` of the best row for each ranking statistic.

    For each of the columns ``average_``, ``weighted_avg`` and ``ewm_`` the
    row holding the column maximum is located and its ``param_config`` value
    is reported. The result is a Series indexed by those three column names.
    """
    stat_columns = ['average_', 'weighted_avg', 'ewm_']

    # Locate all winning row labels first, then read their configurations.
    winning_rows = [getattr(df, column).idxmax() for column in stat_columns]
    winning_configs = tuple(df.loc[row_label].param_config for row_label in winning_rows)

    return pd.Series(winning_configs, index=stat_columns)
    

In [5]:
def fpr(y_true, y_pred):
    """False-positive rate: one minus the recall of the negative class (label 0)."""
    true_negative_rate = recall_score(y_true, y_pred, pos_label=0)
    return 1 - true_negative_rate

In [6]:
# apply threshold to positive probabilities to create labels
def to_labels(pos_probs, threshold):
    """Turn positive-class probabilities into 0/1 labels.

    A probability greater than or equal to ``threshold`` maps to 1,
    anything below maps to 0.
    """
    meets_cutoff = pos_probs >= threshold
    return meets_cutoff.astype('int')

In [7]:
def recall_s(y_true, y_pred):
    """Recall of the positive class (label 1), i.e. the true-positive rate."""
    return recall_score(y_true, y_pred, pos_label=1)

In [8]:
def find_threshold(y_true_th,y_proba_th, metric_1, metric_2, min_metric_2= 0.4):
    """Pick the probability cut-off that maximises recall under an FPR cap.

    Every threshold in ``np.arange(0, 1, 0.001)`` is tried. For each one the
    probabilities are binarised with ``to_labels`` and two scores are
    computed; the winning threshold has the highest first score among those
    whose second score is ``<= min_metric_2``. Thresholds that violate the
    cap are scored 0, and ties go to the lowest threshold (``np.argmax``
    returns the first maximum).

    NOTE(review): ``metric_1`` and ``metric_2`` are accepted but immediately
    replaced by ``recall_s`` and ``fpr``, so whatever the caller passes is
    ignored. Despite its name, ``min_metric_2`` acts as an upper bound on
    the second score.

    Prints the chosen threshold and its score, and returns them as
    ``(threshold, score)``.
    """
    # The caller-supplied metrics are overridden here (see NOTE above).
    metric_1 = recall_s
    metric_2 = fpr

    thresholds = np.arange(0, 1, 0.001)

    objective_values = []
    constraint_values = []
    for cutoff in thresholds:
        # Binarise once per cut-off and score both metrics on the same labels.
        predicted = to_labels(y_proba_th, cutoff)
        objective_values.append(metric_1(y_true_th, predicted))
        constraint_values.append(metric_2(y_true_th, predicted))

    objective_values = np.array(objective_values)
    constraint_values = np.array(constraint_values)

    # Zero out the objective wherever the constraint is violated.
    scores_guaranteed_min = objective_values * (constraint_values <= min_metric_2)

    best = np.argmax(scores_guaranteed_min)
    th = thresholds[best]
    metric = scores_guaranteed_min[best]
    print('Threshold=%.3f, Metric=%.5f' % (th, metric))

    return th,metric

#data_test = data.query('date > Start_Test')


In [9]:


# Registry of classifier constructors keyed by the model name used in the
# metrics files. The name and class lists are derived from this one mapping
# so they cannot drift out of step.
clfs_names_dict = {
    'NeuralNetClassifier': NeuralNetClassifier,
    'RandomForestClassifier': RandomForestClassifier,
    'LogisticRegression': LogisticRegression,
    'DecisionTreeClassifier': DecisionTreeClassifier,
    "lgb_LGBMClassifier": lgb.LGBMClassifier,
    'XGBClassifier': XGBClassifier,
    'MLPClassifier': MLPClassifier,
    'SVC': SVC,
}

clfs_names = list(clfs_names_dict.keys())
clfs = list(clfs_names_dict.values())

In [77]:
from sklearn.metrics import accuracy_score
def get_metrics_prod(file_test,model_name_test,clfs_names_dict,y_true):
    """Refit one stored model configuration, choose a threshold and score it.

    Parameters
    ----------
    file_test : str
        CSV file whose ``get_p`` column holds the parameter dict as text;
        only the first row is read.
    model_name_test : str
        Key into ``clfs_names_dict`` selecting the classifier class.
    clfs_names_dict : dict
        Maps model names to classifier constructors.
    y_true : array-like
        True labels for the rows of the global ``X_test``.

    Returns
    -------
    tuple
        ``(threshold, accuracy_at_threshold, fitted_model)``.

    Notes
    -----
    Relies on the notebook globals ``X_train``, ``y_train`` and ``X_test``,
    which must be defined before this function is called.
    """
    # Only the first row's parameter dict is needed.
    params_test = pd.read_csv(file_test).get_p.iloc[0]

    # SECURITY NOTE(review): eval() executes whatever text is stored in the
    # CSV. Only run this on files you produced yourself. It is kept here
    # because the stored parameters may not be plain literals.
    best_model_config_test = clfs_names_dict[model_name_test](**eval(params_test))

    # Train on the whole training window, then score the threshold window.
    best_model_config_test = best_model_config_test.fit(X_train,y_train)
    y_prod_pred = best_model_config_test.predict_proba(X_test)[:,1]

    # find_threshold always optimises recall under an FPR cap, whatever
    # metrics are passed, so pass the ones it actually uses.
    th, _ = find_threshold(y_true,y_prod_pred, recall_s, fpr, min_metric_2= 0.4)

    print(np.array(y_prod_pred))
    print(np.array(y_true))

    # Apply the threshold with the same >= rule that was used to select it
    # (previously a strict > was used here, which treated probabilities
    # equal to the threshold differently).
    return th, accuracy_score(y_true, to_labels(y_prod_pred, th)), best_model_config_test
    
    

In [81]:
def get_test_threshold(model_name_test,metric):
    """Evaluate the configuration that ``metric`` ranks best for one model.

    Looks up the winning ``param_config`` in the global
    ``best_config_p_metric``, finds a matching output file through the global
    ``outputs_metrics`` table, and delegates to ``get_metrics_prod`` with the
    global ``y_test`` labels cast to float.

    Prints the file path, the threshold and the score, and returns
    ``(model_name_test, metric, threshold, score, fitted_model)``.
    """
    config_id = best_config_p_metric.loc[model_name_test][metric]

    # Any file of that configuration will do; take the first match.
    selector = f"model =='{model_name_test}' & param_config == {config_id}"
    matching_runs = outputs_metrics.query(selector)
    file_test = 'outputs_models2/' + matching_runs.filename.iloc[0]
    print(file_test)

    true_labels = y_test.values.astype(float)
    th, score_test, model = get_metrics_prod(
        file_test, model_name_test, clfs_names_dict, true_labels
    )
    print("threshold test:",th)
    print("score test:",score_test)
    return (model_name_test, metric, th, score_test, model)

In [11]:
# Load the per-run metrics table. Other cells group it by 'model' and
# 'param_config' and read its 'metric' and 'filename' columns.
outputs_metrics = pd.read_csv("data/outputs_metrics.csv")


In [12]:
# Step 1: one row of ranking statistics per (model, param_config) pair.
metrics_p_modelconifg = (
    outputs_metrics
    .groupby(['model','param_config'])
    .apply(ranking_functions)
)

# Step 2: for every model, the param_config that wins under each statistic.
best_config_p_metric = (
    metrics_p_modelconifg
    .reset_index()
    .groupby('model')
    .apply(idxmax_param_config)
)

best_config_p_metric

Unnamed: 0_level_0,average_,weighted_avg,ewm_
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
DecisionTreeClassifier,37,41,37
LogisticRegression,41,41,41
MLPClassifier,19,19,19
NeuralNetClassifier,8,8,8
RandomForestClassifier,37,34,37
SVC,3,3,28
XGBClassifier,20,20,20
lgb_LGBMClassifier,26,26,26


In [13]:
# Load the prepared dataset and parse its date column.
file = 'data/donors_choose_prepared.csv'
fold_data = pd.read_csv(file)
fold_data['as_of_date'] = pd.to_datetime(fold_data['as_of_date'])

# Window boundaries, counted back from the most recent as_of_date:
# production starts 2 months back, thresholding 3 months, training 6 months.
start_production, start_thresh, start_train = (
    fold_data['as_of_date'].max() - pd.DateOffset(months=months_back)
    for months_back in (2, 3, 6)
)


In [14]:
def _features_and_labels(frame):
    """Split a slice of ``fold_data`` into float features and integer labels.

    Drops the identifier, date and label columns from the features and casts
    the rest to float; the ``quickstart_label`` column is cast to int.
    """
    # astype replaces the deprecated element-wise DataFrame.applymap(float)
    # and Series.apply(int) with vectorised casts.
    features = frame.drop(['entity_id','as_of_date','quickstart_label'],axis=1).astype(float)
    labels = frame['quickstart_label'].astype(int)
    return features, labels


# Chronological windows on as_of_date (lower bound exclusive, upper inclusive):
#   train: (start_train, start_thresh]
#   test:  (start_thresh, start_production]
#   val:   (start_production, latest date]
_as_of = fold_data['as_of_date']

X_train, y_train = _features_and_labels(
    fold_data[(_as_of > start_train) & (_as_of <= start_thresh)]
)

X_test, y_test = _features_and_labels(
    fold_data[(_as_of > start_thresh) & (_as_of <= start_production)]
)

X_val, y_val = _features_and_labels(
    fold_data[_as_of > start_production]
)

In [17]:
best_config_p_metric

Unnamed: 0_level_0,average_,weighted_avg,ewm_
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
DecisionTreeClassifier,37,41,37
LogisticRegression,41,41,41
MLPClassifier,19,19,19
NeuralNetClassifier,8,8,8
RandomForestClassifier,37,34,37
SVC,3,3,28
XGBClassifier,20,20,20
lgb_LGBMClassifier,26,26,26


In [None]:
# Evaluate every (model, ranking statistic) combination. Each row of
# best_config_p_metric is one model and each entry in the row is the winning
# param_config under one statistic. get_test_threshold refits that
# configuration, so this cell can take a long time to run.
results = []
for model,row in best_config_p_metric.iterrows():
    print(model)
    for metric_name in row.index:
        print(metric_name)
        # Each appended item is the tuple returned by get_test_threshold.
        results.append( get_test_threshold(model,metric_name))
    

DecisionTreeClassifier
average_
outputs_models2/(0, 3, 37)_DecisionTreeClassifier.csv
Threshold=0.304, Metric=0.53211
[0.25       0.25748503 0.29942418 ... 0.39473684 1.         0.43382353]
[1. 0. 0. ... 0. 0. 0.]
threshold test: 0.304
score test: 0.5777317452097359
weighted_avg
outputs_models2/(0, 3, 41)_DecisionTreeClassifier.csv
Threshold=0.313, Metric=0.49694
[0.13333333 0.27642276 0.44863014 ... 0.26211454 0.50877193 0.23009624]
[1. 0. 0. ... 0. 0. 0.]
threshold test: 0.313
score test: 0.5820818228896945
ewm_
outputs_models2/(0, 3, 37)_DecisionTreeClassifier.csv
Threshold=0.313, Metric=0.49847
[0.5        0.29782834 0.34591195 ... 0.26530612 0.35678392 0.43434343]
[1. 0. 0. ... 0. 0. 0.]
threshold test: 0.313
score test: 0.5732780942516831
LogisticRegression
average_
outputs_models2/(0, 2, 41)_LogisticRegression.csv
Threshold=0.316, Metric=0.53517
[0.40513144 0.35021677 0.25730056 ... 0.30044321 0.31186976 0.23265742]
[1. 0. 0. ... 0. 0. 0.]
threshold test: 0.316
score test: 0.580

In [72]:
results

[('DecisionTreeClassifier', 'average_', 0.307, 0.5650958052822371),
 ('DecisionTreeClassifier', 'weighted_avg', 0.314, 0.5782496116002072),
 ('DecisionTreeClassifier', 'ewm_', 0.319, 0.5742102537545314)]