In [2]:
# import stuff

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier



In [3]:
# explore data

train_data = pd.read_csv('data/train.csv')
train_labels = pd.read_csv('data/trainLabels.csv')
test_data = pd.read_csv('data/test.csv')

train_data.describe()

Unnamed: 0,0.29940251144353242,-1.2266241875260637,1.4984250500215328,-1.1761503610375272,5.2898525545597037,0.20829711393323402,2.4044983672405826,1.5945062220589785,-0.051608163273514231,0.66323431039687908,...,-0.85046544625016463,-0.62298999638261954,-1.8330573433160038,0.29302438506869571,3.5526813410266507,0.71761099417552265,3.3059719748508889,-2.7155588147154619,-2.6824085866346223,0.10105047232890663
count,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,...,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0
mean,0.025322,-0.023323,-0.025612,-0.001096,1.088127,-0.006465,0.495433,-0.039517,0.026469,-0.004264,...,0.031533,0.023598,-0.541199,-0.011913,-0.487548,0.032686,0.564443,0.009574,-0.890867,0.60996
std,1.00875,1.016094,0.978412,0.970349,4.539161,0.989601,2.11902,2.232776,1.001562,1.013808,...,1.011767,1.001668,2.240688,1.022922,2.118491,1.007316,2.227304,0.994404,2.02224,2.0464
min,-3.365711,-3.492086,-2.695602,-3.460471,-16.421901,-3.04125,-7.224761,-6.509084,-3.145588,-2.749812,...,-3.379194,-2.971125,-7.84089,-2.999564,-7.124105,-2.952358,-5.452254,-3.473913,-8.051722,-7.799086
25%,-0.670358,-0.690859,-0.700048,-0.616548,-1.805683,-0.733114,-0.839542,-1.60701,-0.680114,-0.682246,...,-0.659295,-0.696257,-2.123069,-0.664966,-1.88004,-0.643513,-1.060154,-0.68989,-2.214672,-0.565156
50%,0.023121,-0.031181,0.008037,0.003735,0.858932,0.025803,0.571475,0.017761,0.022855,-0.037531,...,0.049792,0.050187,-0.568168,-0.028179,-0.493701,0.036975,0.453544,0.038464,-0.853259,0.780175
75%,0.76298,0.683464,0.657948,0.640968,3.836911,0.671494,1.912284,1.436715,0.741949,0.666416,...,0.747815,0.700093,0.940334,0.651692,0.997741,0.690139,2.120006,0.693603,0.390982,1.994308
max,3.326246,3.58387,2.546507,3.088738,17.565345,3.102997,7.592666,7.130097,3.145258,3.919426,...,2.844792,3.688047,7.160379,3.353631,6.005818,3.420561,6.603499,3.492548,5.77412,6.803984


In [4]:
# prepare data

X = train_data
y = train_labels
train_X, val_X, train_y, val_y = train_test_split(X, y)

In [5]:
def evaluate(model, y_train_pred, y_val_pred):
    print('-------------'+ str(model)+'-------------')
    print('\n*** Cross Val Accuracy ***\n')
    print(cross_val_score(model, train_data, np.ravel(train_labels), cv=10).mean())

    print('\n*** Train Test Metrics ***\n')
    print('Train Accuracy score:', accuracy_score(train_y,y_train_pred))
    print('Val Accuracy score:', accuracy_score(val_y,y_val_pred))
    print('Train Precision score:', precision_score(train_y,y_train_pred))
    print('Val Precision score:', precision_score(val_y,y_val_pred))
    print('Train Recall score:', recall_score(train_y,y_train_pred))
    print('Val Recall score:', recall_score(val_y,y_val_pred))
    print('Train f1 score:', f1_score(train_y,y_train_pred))
    print('Val f1 score:', f1_score(val_y,y_val_pred))
    print('Train roc auc score:', roc_auc_score(train_y,y_train_pred))
    print('Val roc auc score:', roc_auc_score(val_y,y_val_pred))
    print('Train confusion matrix:\n', confusion_matrix(train_y,y_train_pred))
    print('Val confusion matrix:\n', confusion_matrix(val_y,y_val_pred))

In [6]:
# initialize models

dt_model = DecisionTreeRegressor(random_state = 1)
knn_model = KNeighborsClassifier(n_neighbors=3)
svc_model = SVC()
rf_model = RandomForestClassifier()
mlp_model = MLPClassifier(hidden_layer_sizes=(35, 35))



In [7]:
# fit models

dt_model.fit(train_X, train_y)
knn_model.fit(train_X, np.ravel(train_y))
svc_model.fit(train_X, np.ravel(train_y))
rf_model.fit(train_X, np.ravel(train_y))
mlp_model.fit(train_X, np.ravel(train_y))

MLPClassifier(hidden_layer_sizes=(35, 35))

In [8]:
# predict

dt_model_val_pred = dt_model.predict(val_X)
dt_model_train_pred = dt_model.predict(train_X)

knn_model_val_pred = knn_model.predict(val_X)
knn_model_train_pred = knn_model.predict(train_X)

svc_model_val_pred = svc_model.predict(val_X)
svc_model_train_pred = svc_model.predict(train_X)

rf_model_val_pred = rf_model.predict(val_X)
rf_model_train_pred = rf_model.predict(train_X)

mlp_model_val_pred = mlp_model.predict(val_X)
mlp_model_train_pred = mlp_model.predict(train_X)

In [9]:
# evaluate

evaluate(dt_model, dt_model_train_pred, dt_model_val_pred)
evaluate(knn_model, knn_model_train_pred, knn_model_val_pred)
evaluate(svc_model, svc_model_train_pred, svc_model_val_pred)
evaluate(rf_model, rf_model_train_pred, rf_model_val_pred)
evaluate(mlp_model, mlp_model_train_pred, mlp_model_val_pred)

-------------DecisionTreeRegressor(random_state=1)-------------

*** Cross Val Accuracy ***

0.03708071274523476

*** Train Test Metrics ***

Train Accuracy score: 1.0
Val Accuracy score: 0.804
Train Precision score: 1.0
Val Precision score: 0.7966101694915254
Train Recall score: 1.0
Val Recall score: 0.7899159663865546
Train f1 score: 1.0
Val f1 score: 0.7932489451476793
Train roc auc score: 1.0
Val roc auc score: 0.8033549297581628
Train confusion matrix:
 [[359   0]
 [  0 390]]
Val confusion matrix:
 [[107  24]
 [ 25  94]]
-------------KNeighborsClassifier(n_neighbors=3)-------------

*** Cross Val Accuracy ***

0.9108383838383839

*** Train Test Metrics ***

Train Accuracy score: 0.9546061415220294
Val Accuracy score: 0.904
Train Precision score: 0.9540816326530612
Val Precision score: 0.8861788617886179
Train Recall score: 0.958974358974359
Val Recall score: 0.9159663865546218
Train f1 score: 0.9565217391304348
Val f1 score: 0.9008264462809917
Train roc auc score: 0.95441754160417

In [10]:
from sklearn.ensemble import GradientBoostingClassifier
gb_model = GradientBoostingClassifier()
gb_model.fit(train_X, np.ravel(train_y))
gb_model_val_pred = gb_model.predict(val_X)
gb_model_train_pred = gb_model.predict(train_X)
evaluate(gb_model, gb_model_train_pred, gb_model_val_pred)

-------------GradientBoostingClassifier()-------------

*** Cross Val Accuracy ***

0.8698282828282828

*** Train Test Metrics ***

Train Accuracy score: 0.9959946595460614
Val Accuracy score: 0.864
Train Precision score: 0.9974293059125964
Val Precision score: 0.8571428571428571
Train Recall score: 0.9948717948717949
Val Recall score: 0.8571428571428571
Train f1 score: 0.9961489088575095
Val f1 score: 0.8571428571428571
Train roc auc score: 0.9960431397757303
Val roc auc score: 0.8636859323882224
Train confusion matrix:
 [[358   1]
 [  2 388]]
Val confusion matrix:
 [[114  17]
 [ 17 102]]


In [11]:
from sklearn.naive_bayes import GaussianNB
gnb_model = GaussianNB()
gnb_model.fit(train_X, np.ravel(train_y))
gnb_model_val_pred = gnb_model.predict(val_X)
gnb_model_train_pred = gnb_model.predict(train_X)
evaluate(gnb_model, gnb_model_train_pred, gnb_model_val_pred)

-------------GaussianNB()-------------

*** Cross Val Accuracy ***

0.8168080808080809

*** Train Test Metrics ***

Train Accuracy score: 0.8317757009345794
Val Accuracy score: 0.808
Train Precision score: 0.84375
Val Precision score: 0.7751937984496124
Train Recall score: 0.8307692307692308
Val Recall score: 0.8403361344537815
Train f1 score: 0.8372093023255814
Val f1 score: 0.8064516129032258
Train roc auc score: 0.8318191557745875
Val roc auc score: 0.8094810443261273
Train confusion matrix:
 [[299  60]
 [ 66 324]]
Val confusion matrix:
 [[102  29]
 [ 19 100]]


In [13]:
from xgboost import XGBClassifier
xgb_model = XGBClassifier()
xgb_model.fit(train_X, np.ravel(train_y))
xgb_model_val_pred = xgb_model.predict(val_X)
xgb_model_train_pred = xgb_model.predict(train_X)
evaluate(xgb_model, xgb_model_train_pred, xgb_model_val_pred)

-------------XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)-------------

*** Cross Val Accuracy ***

0.8778686868686869

*** Train Test Metrics ***

Train Accuracy score: 1.0
Val Accuracy score: 0.868
Train Precision score: 1.0
Val Precision score: 0.8467741935483871
Train Recall score: 1.0
Val Recall score: 0.8823529411764706
Train f1 score: 1.0
Val f1 score: 0.8641975308641976
Train roc auc score: 1.0
Val roc auc score: 0.8686573866187696
Train confusion mat