In [72]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.model_selection import RandomizedSearchCV
from sklearn import tree
import shap
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score, auc, precision_score, recall_score, precision_recall_curve

In [73]:
df = pd.read_csv('df_total_sp.csv')
df = df.drop(columns = ['SG_UF','NU_IDADE_N'])

In [74]:
df

Unnamed: 0,CRITERIO,FEBRE,MIALGIA,CEFALEIA,EXANTEMA,VOMITO,NAUSEA,DOR_COSTAS,CONJUNTVIT,ARTRITE,ARTRALGIA,PETEQUIA_N,LEUCOPENIA,LACO,DOR_RETRO,CHIK
0,EpiClinico,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0
1,Laboratorial,1,0,1,0,0,0,1,0,0,0,0,0,0,1,0
2,EpiClinico,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0
3,EpiClinico,1,1,1,0,0,0,1,0,1,1,0,0,0,1,0
4,Laboratorial,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1761395,Laboratorial,1,1,1,1,1,1,1,0,1,0,0,0,0,1,1
1761396,Laboratorial,1,1,1,1,1,1,0,0,0,0,0,0,0,0,1
1761397,Laboratorial,1,1,1,0,1,1,0,0,0,0,0,0,0,0,1
1761398,Laboratorial,1,1,1,0,0,1,1,0,0,1,0,0,0,1,1


In [75]:
df_lab = df[df['CRITERIO'] == 'Laboratorial']
df_lab = df_lab.reset_index(drop = True)
df_epi = df[df['CRITERIO'] == 'EpiClinico'] 
df_epi = df_epi.reset_index(drop = True)
df_lab = df_lab.drop(columns = ['CRITERIO'])
df_epi = df_epi.drop(columns = ['CRITERIO'])

In [76]:
df_lab.shape

(857416, 15)

In [77]:
df_lab = df_lab.drop_duplicates()

In [78]:
df_lab.shape

(8436, 15)

### Train-test split

In [79]:
X = df_lab.iloc[:,0:13]
y = df_lab.iloc[:,14]
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size = 0.6, random_state = 0)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, stratify = y_test, test_size = 0.5, random_state = 0)

In [80]:
print('Train: ', y_train.shape, sum(y_train == 1), sum(y_train == 0))
print('Test: ', y_test.shape, sum(y_test == 1), sum(y_test == 0))
print('Validation: ', y_val.shape, sum(y_val == 1), sum(y_val == 0))

Train:  (3374,) 490 2884
Test:  (2531,) 368 2163
Validation:  (2531,) 368 2163


### Random Forest Tuning - Random Search

In [34]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)]
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(1, 20, num = 10)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [1, 2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 5, 10]
# Balanced weights
class_types = [None, 'balanced']

# Create the random grid
random_grid = {'n_estimators': n_estimators,
                'criterion': ['gini', 'entropy', 'log_loss'],
               'max_features': ['sqrt'],
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'class_weight': class_types
               }

In [35]:
rf_random = RandomizedSearchCV(estimator = RandomForestClassifier(),
                               param_distributions = random_grid,
                               n_iter = 100,
                               scoring = 'roc_auc',
                               cv = 5, 
                               verbose = 3,
                               random_state=0,
                               n_jobs = -1)

In [36]:
rf_random.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [37]:
rf_random.best_estimator_

In [39]:
rf_random.best_score_

0.7925993205630029

### XGBoost Tuning - Random Search

In [66]:
max_depth = [int(x) for x in np.linspace(1, 10, num = 10)]
min_child_weight = [1, 5, 10, 50, 100, 200]
subsample = [0, 0.25, 0.5, 0.75, 1]
learning_rate = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1]

# Create the random grid
random_grid = {'max_depth': max_depth,
               'min_child_weight': min_child_weight,
               'subsample': subsample,
               'learning_rate': learning_rate
               }

In [67]:
rf_random = RandomizedSearchCV(estimator = XGBClassifier(nthread = 8),
                               param_distributions = random_grid,
                               n_iter = 500,
                               scoring = 'roc_auc',
                               cv = 5, 
                               verbose = 3,
                               random_state=0,
                               n_jobs = -1)

In [68]:
rf_random.fit(X_train, y_train)

Fitting 5 folds for each of 500 candidates, totalling 2500 fits


In [69]:
rf_random.best_estimator_

In [71]:
rf_random.best_score_

0.8025365471867545

### Logistic regression - Randomized Search CV

In [112]:
penalty = ['l2', 'l1']
C = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 0.75, 1]
class_weight = [None, 'balanced']

# Create the random grid
random_grid = {'penalty': penalty,
               'C': C,
               'class_weight': class_weight
               }

In [113]:
rf_random = RandomizedSearchCV(estimator = LogisticRegression(solver = 'liblinear'),
                               param_distributions = random_grid,
                               n_iter = 500,
                               scoring = 'roc_auc',
                               cv = 10, 
                               verbose = 3,
                               random_state=0,
                               n_jobs = -1)

In [114]:
rf_random.fit(X_train, y_train)

The total space of parameters 32 is smaller than n_iter=500. Running 32 iterations. For exhaustive searches, use GridSearchCV.


Fitting 10 folds for each of 32 candidates, totalling 320 fits


In [116]:
rf_random.best_score_

0.7815944637658985

In [117]:
rf_random.best_estimator_