In [60]:
import numpy as np
from sklearn.model_selection import KFold, train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from google.colab import drive
from google.colab import files
import os
import timeit
import pandas as pd
import random as rnd
import matplotlib.pyplot as plt

In [61]:
drive.mount('/content/drive')
complete_data = pd.read_csv('/content/drive/My Drive/Colab Notebooks/train.csv')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [62]:
seed = 1729
np.random.seed(seed)

Brief exploration of the dataset

In [63]:
complete_data.describe()

Unnamed: 0,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
count,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,...,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0
mean,0.10049,10.679914,-1.627622,10.715192,6.796529,11.078333,-5.065317,5.408949,16.54585,0.284162,...,3.23444,7.438408,1.927839,3.331774,17.993784,-0.142088,2.303335,8.908158,15.87072,-3.326537
std,0.300653,3.040051,4.050044,2.640894,2.043319,1.62315,7.863267,0.866607,3.418076,3.332634,...,4.559922,3.023272,1.478423,3.99203,3.135162,1.429372,5.454369,0.921625,3.010945,10.438015
min,0.0,0.4084,-15.0434,2.1171,-0.0402,5.0748,-32.5626,2.3473,5.3497,-10.5055,...,-14.0933,-2.6917,-3.8145,-11.7834,8.6944,-5.261,-14.2096,5.9606,6.2993,-38.8528
25%,0.0,8.45385,-4.740025,8.722475,5.254075,9.883175,-11.20035,4.7677,13.9438,-2.3178,...,-0.058825,5.1574,0.889775,0.5846,15.6298,-1.1707,-1.946925,8.2528,13.8297,-11.208475
50%,0.0,10.52475,-1.60805,10.58,6.825,11.10825,-4.83315,5.3851,16.4568,0.3937,...,3.2036,7.34775,1.9013,3.39635,17.95795,-0.1727,2.4089,8.8882,15.93405,-2.81955
75%,0.0,12.7582,1.358625,12.5167,8.3241,12.261125,0.9248,6.003,19.1029,2.9379,...,6.4062,9.512525,2.9495,6.2058,20.396525,0.8296,6.556725,9.5933,18.064725,4.8368
max,1.0,20.315,10.3768,19.353,13.1883,16.6714,17.2516,8.4477,27.6918,10.1513,...,18.4409,16.7165,8.4024,18.2818,27.9288,4.2729,18.3215,12.0004,26.0791,28.5007


In [64]:
complete_data.columns

Index(['ID_code', 'target', 'var_0', 'var_1', 'var_2', 'var_3', 'var_4',
       'var_5', 'var_6', 'var_7',
       ...
       'var_190', 'var_191', 'var_192', 'var_193', 'var_194', 'var_195',
       'var_196', 'var_197', 'var_198', 'var_199'],
      dtype='object', length=202)

In [65]:
complete_var, complete_target = complete_data.drop(["target","ID_code"],axis=1).to_numpy(), complete_data.target.to_numpy()

In [66]:
X_train, X_test, y_train, y_test = train_test_split(complete_var,complete_target,test_size=0.1,random_state=seed)

# Evaluation of models
Exploration of combinations of models and hyperparameters that best perform on a subset of the dataset

In [67]:
X_train_sample,_,y_train_sample,_ = train_test_split(X_train,y_train,test_size=0.9,random_state=seed)

In [68]:
best_scores = []

### Model 1: Decision tree

In [69]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_curve, auc
from sklearn.datasets import make_classification

low_depths = np.random.randint(1,20,size=40)
lower_depths = np.random.randint(20,50,size = 10)
depths_list = np.concatenate((low_depths,lower_depths))
np.random.shuffle(depths_list)
param_dist = {
    'max_depth' : depths_list,
    'criterion' : ['gini','entropy','log_loss']
}

tree = DecisionTreeClassifier()
random_search_tree = RandomizedSearchCV(tree,param_distributions = param_dist,
                                        n_iter = 15, cv = 5, scoring = 'roc_auc', n_jobs=-1)
random_search_tree.fit(X_train_sample,y_train_sample)

In [70]:
random_search_tree.best_estimator_

In [71]:
cv_results_tree = random_search_tree.cv_results_
scoreFrameTree = pd.DataFrame(cv_results_tree)
best_tree = scoreFrameTree.loc[scoreFrameTree["rank_test_score"]==1].iloc[0]
best_tree = best_tree.to_frame().T
best_tree=best_tree.assign(Model='Decision Tree')
best_tree['algorithm'] = 'Decision Tree'
best_scores.append(best_tree)

In [72]:
best_scores.append(best_tree)

##Model 2: KNN

In [73]:
from sklearn.model_selection import RandomizedSearchCV,StratifiedKFold
from sklearn.metrics import roc_curve, auc
from sklearn.datasets import make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import normalize

X_train_sample_norm = normalize(X_train_sample)
small_K = [i for i in range(3,10)]
mid_K = (np.random.randint(10,50,size=8))
big_K = (np.random.randint(51,100,size=5))
ks = np.concatenate((small_K, mid_K, big_K))
np.random.shuffle(ks)

param_dist = {
    'n_neighbors': [i for i in ks],
    'weights': ["uniform","distance"]
}

neighbors = KNeighborsClassifier()
random_search_knn = RandomizedSearchCV(neighbors, param_distributions = param_dist,
                                   n_iter = 15, cv=5, scoring = 'roc_auc', n_jobs=-1)
random_search_knn.fit(X_train_sample_norm,y_train_sample)
cv_results_knn = random_search_knn.cv_results_
scoreFrame_knn = pd.DataFrame(cv_results_knn)

In [74]:
best_knn = scoreFrame_knn.loc[scoreFrame_knn["rank_test_score"]==1].iloc[0]
best_knn = best_knn.to_frame().T
best_knn = best_knn.assign(Model = "KNN")
best_knn['algorithm'] = 'KNN'
best_scores.append(best_knn)

##Model 3: SVC

In [75]:
from sklearn.svm import SVC
common = {
    'C' : np.logspace(-2, 1, 3), #inverse of the importance of classifying wrong each datapoint
    'class_weight' : ['balanced'] + [{1:1.0, 0:x} for x in range(5,10)]
}

svm_param = [
  {
      'kernel': ['rbf', 'sigmoid', 'linear'],
  } |common,
  {
      "kernel": ['poly'],
      'degree': [3,4],
  }|common ]

svm_estimator = SVC()
rsc = RandomizedSearchCV(
    estimator = svm_estimator,
    param_distributions = svm_param,
    n_iter = 5, scoring = 'roc_auc', cv=3, n_jobs=-1
)

randomized_search_svc = rsc.fit(X_train_sample,y_train_sample)
cv_results_svc = rsc.cv_results_
scoreFrame_svc = pd.DataFrame(cv_results_svc)

In [76]:
best_svc = scoreFrame_svc.loc[scoreFrame_svc["rank_test_score"]==1].iloc[0]
best_svc = best_svc.to_frame().T
best_svc = best_svc.assign(Model = "Support Vector Machines")
best_svc['algorithm'] = "SVM"
best_scores.append(best_svc)

##Model 4: LDA

In [77]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
param_dist_lda = {
    'solver' : ['lsqr','eigen'],
    'tol' : np.logspace(0.01,0.5,50)
}

lda = LinearDiscriminantAnalysis()
random_search_lda = RandomizedSearchCV(lda,param_distributions=param_dist_lda,
                                       n_iter = 10, cv = 5, scoring = 'roc_auc',
                                       n_jobs=-1)
random_search_lda.fit(X_train_sample,y_train_sample)
cv_results_lda = random_search_lda.cv_results_
scoreFrameLDA = pd.DataFrame(cv_results_lda)

In [78]:
bestLDA = scoreFrameLDA.loc[scoreFrameLDA["rank_test_score"]==1].iloc[0]
bestLDA = bestLDA.to_frame().T
bestLDA = bestLDA.assign(Model = 'Linear Discriminant Analysis')
bestLDA['algorithm'] = 'LinearDiscriminantAnalysis'
best_scores.append(bestLDA)

##Model 5: Naive Bayes

In [79]:
from sklearn.naive_bayes import GaussianNB
param_dist_gnb = {
    'var_smoothing' : np.linspace(1e-9,1e-1,500)
}

gnb = GaussianNB()
random_search_gnb = RandomizedSearchCV(gnb,param_distributions = param_dist_gnb,
                                   n_iter = 10, cv = 5, scoring = 'roc_auc',
                                       n_jobs=-1)
random_search_gnb.fit(X_train_sample,y_train_sample)
cv_results_gnb = random_search_gnb.cv_results_
scoreFrameGNB = pd.DataFrame(cv_results_gnb)
scoreFrameGNB

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_var_smoothing,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.070604,0.003751,0.017575,0.000904,0.030461,{'var_smoothing': 0.030460922539078155},0.836883,0.832996,0.836329,0.820055,0.816599,0.828572,0.00854,2
1,0.065469,0.000806,0.018212,0.001463,0.09018,{'var_smoothing': 0.09018036081963927},0.811896,0.810745,0.813889,0.793361,0.79407,0.804792,0.009103,9
2,0.068208,0.004125,0.02144,0.007414,0.040481,{'var_smoothing': 0.04048096251903808},0.830753,0.827403,0.830938,0.813457,0.811068,0.822724,0.008667,3
3,0.075262,0.010347,0.017185,0.000647,0.047094,{'var_smoothing': 0.04709418890581162},0.827287,0.824428,0.828003,0.809718,0.807946,0.819477,0.008791,5
4,0.071191,0.005277,0.017931,0.000616,0.070741,{'var_smoothing': 0.07074148325851704},0.817528,0.815894,0.819291,0.799636,0.799341,0.810338,0.008924,7
5,0.067442,0.002201,0.017668,0.000391,0.091383,{'var_smoothing': 0.09138276561723446},0.811567,0.810453,0.813582,0.792982,0.793768,0.80447,0.009118,10
6,0.067457,0.003071,0.017787,0.000766,0.041884,{'var_smoothing': 0.04188376811623246},0.829947,0.826743,0.830247,0.812593,0.810389,0.821984,0.008683,4
7,0.069891,0.006437,0.018646,0.001291,0.061323,{'var_smoothing': 0.06132264567735471},0.820949,0.819018,0.822357,0.80325,0.802393,0.813593,0.008863,6
8,0.068023,0.002229,0.018243,0.00128,0.079559,{'var_smoothing': 0.07955911844088176},0.814739,0.813332,0.816656,0.796681,0.796793,0.80764,0.008965,8
9,0.064374,0.005748,0.016548,0.003154,0.016433,{'var_smoothing': 0.016432866567134267},0.848239,0.843279,0.846601,0.832373,0.827503,0.839599,0.008195,1


In [80]:
best_gnb = scoreFrameGNB.loc[scoreFrameGNB["rank_test_score"]==1].iloc[0]
best_gnb = best_gnb.to_frame().T
best_gnb = best_gnb.assign(Model="Gaussian NB", algorithm="Gaussian Naive Bayes")
best_scores.append(best_gnb)

In [81]:
best_scores_pd = pd.concat(best_scores)
best_scores_pd[['algorithm','params','mean_test_score','std_test_score']]

Unnamed: 0,algorithm,params,mean_test_score,std_test_score
8,Decision Tree,"{'max_depth': 5, 'criterion': 'log_loss'}",0.633114,0.008464
8,Decision Tree,"{'max_depth': 5, 'criterion': 'log_loss'}",0.633114,0.008464
2,KNN,"{'weights': 'distance', 'n_neighbors': 70}",0.618636,0.011138
2,SVM,"{'kernel': 'rbf', 'class_weight': {1: 1.0, 0: ...",0.794666,0.012332
0,LinearDiscriminantAnalysis,"{'tol': 1.7378008287493756, 'solver': 'eigen'}",0.849555,0.010788
9,Gaussian Naive Bayes,{'var_smoothing': 0.016432866567134267},0.839599,0.008195


The model with best performance was LDA in terms of AUCROC, altought Gaussian Naive Bayes got similar performance with much lower standard error between folds.

Because of this, the "winner" method is LDA, with the hyperparameters selected by the Randomized Search.

## Evaluation on held-out set

The best performative model will be trained on the entire training dataset, and then evaluated on the held-out data to determine best its performance on "never-seen" points

In [83]:
from sklearn import metrics
model = LinearDiscriminantAnalysis(tol=1.7378, solver='eigen')
model.fit(X_train,y_train)
y_eval_pred = model.predict(X_test)
aucroc_curve_pred = metrics.roc_auc_score(y_eval_pred,y_test)

In [84]:
aucroc_curve_pred

0.8153267918641384