In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.model_selection import RandomizedSearchCV
from sklearn import tree
import shap
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score, auc, precision_score, recall_score, precision_recall_curve

In [2]:
df = pd.read_csv('df_total_sp.csv')
df = df.drop(columns = ['SG_UF','NU_IDADE_N', 'FEBRE', 'NAUSEA', 'DOR_COSTAS', 'CEFALEIA', 'LACO', 'DOR_RETRO'])

In [3]:
df

Unnamed: 0,CRITERIO,MIALGIA,EXANTEMA,VOMITO,CONJUNTVIT,ARTRITE,ARTRALGIA,PETEQUIA_N,LEUCOPENIA,CHIK
0,EpiClinico,0,0,0,0,0,0,0,0,0
1,Laboratorial,0,0,0,0,0,0,0,0,0
2,EpiClinico,1,0,0,0,0,0,0,0,0
3,EpiClinico,1,0,0,0,1,1,0,0,0
4,Laboratorial,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
1761395,Laboratorial,1,1,1,0,1,0,0,0,1
1761396,Laboratorial,1,1,1,0,0,0,0,0,1
1761397,Laboratorial,1,0,1,0,0,0,0,0,1
1761398,Laboratorial,1,0,0,0,0,1,0,0,1


In [4]:
df_lab = df[df['CRITERIO'] == 'Laboratorial']
df_lab = df_lab.reset_index(drop = True)
df_epi = df[df['CRITERIO'] == 'EpiClinico'] 
df_epi = df_epi.reset_index(drop = True)
df_lab = df_lab.drop(columns = ['CRITERIO'])
df_epi = df_epi.drop(columns = ['CRITERIO'])

In [5]:
df_lab.shape

(857416, 9)

In [6]:
df_lab = df_lab.drop_duplicates()

In [7]:
df_lab.shape

(407, 9)

In [8]:
df_lab.columns

Index(['MIALGIA', 'EXANTEMA', 'VOMITO', 'CONJUNTVIT', 'ARTRITE', 'ARTRALGIA',
       'PETEQUIA_N', 'LEUCOPENIA', 'CHIK'],
      dtype='object')

In [9]:
df_lab.loc[df_lab['MIALGIA'] == 1, 'MIALGIA'] = 1
#df_lab.loc[df_lab['CEFALEIA'] == 1, 'CEFALEIA'] = 1
df_lab.loc[df_lab['EXANTEMA'] == 1, 'EXANTEMA'] = 1
df_lab.loc[df_lab['VOMITO'] == 1, 'VOMITO'] = 1
df_lab.loc[df_lab['CONJUNTVIT'] == 1, 'CONJUNTVIT'] = 1
df_lab.loc[df_lab['ARTRITE'] == 1, 'ARTRITE'] = 2
df_lab.loc[df_lab['ARTRALGIA'] == 1, 'ARTRALGIA'] = 3
df_lab.loc[df_lab['PETEQUIA_N'] == 1, 'PETEQUIA_N'] = 2
df_lab.loc[df_lab['LEUCOPENIA'] == 1, 'LEUCOPENIA'] = 3
#df_lab.loc[df_lab['LACO'] == 1, 'LACO'] = 1
#df_lab.loc[df_lab['DOR_RETRO'] == 1, 'DOR_RETRO'] = 1

In [10]:
df_lab

Unnamed: 0,MIALGIA,EXANTEMA,VOMITO,CONJUNTVIT,ARTRITE,ARTRALGIA,PETEQUIA_N,LEUCOPENIA,CHIK
0,0,0,0,0,0,0,0,0,0
2,2,0,2,0,2,3,0,0,0
3,2,0,0,0,0,0,0,0,0
5,2,0,2,0,0,0,0,0,0
6,2,0,0,2,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
856570,2,2,2,0,0,0,0,3,1
856590,2,0,0,0,2,3,2,3,1
856965,2,0,0,0,0,3,2,3,1
856971,0,0,0,0,0,0,2,3,1


### Train-test split

In [13]:
X = df_lab.iloc[:,0:8]
y = df_lab.iloc[:,8]
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size = 0.6, random_state = 0)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, stratify = y_test, test_size = 0.5, random_state = 0)

In [14]:
print('Train: ', y_train.shape, sum(y_train == 1), sum(y_train == 0))
print('Test: ', y_test.shape, sum(y_test == 1), sum(y_test == 0))
print('Validation: ', y_val.shape, sum(y_val == 1), sum(y_val == 0))

Train:  (162,) 60 102
Test:  (122,) 45 77
Validation:  (123,) 46 77


### Random Forest Tuning - Random Search

In [106]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 500, stop = 1500, num = 10)]
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(1, 10, num = 10)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [1, 2, 5, 10, 15]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 5, 10, 15]
# Balanced weights
class_types = [None, 'balanced']

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': ['sqrt'],
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf
               }

In [107]:
rf_random = RandomizedSearchCV(estimator = RandomForestClassifier(),
                               param_distributions = random_grid,
                               n_iter = 100,
                               scoring = 'roc_auc',
                               cv = 5, 
                               verbose = 3,
                               random_state=0,
                               n_jobs = -1)

In [None]:
rf_random.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [None]:
rf_random.best_estimator_

In [None]:
rf_random.best_score_

### XGBoost Tuning - Random Search

In [15]:
max_depth = [int(x) for x in np.linspace(1, 10, num = 10)]
min_child_weight = [1, 5, 10, 50, 100, 200]
subsample = [0, 0.25, 0.5, 0.75, 1]
learning_rate = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1]

# Create the random grid
random_grid = {'max_depth': max_depth,
               'min_child_weight': min_child_weight,
               'subsample': subsample,
               'learning_rate': learning_rate
               }

In [16]:
rf_random = RandomizedSearchCV(estimator = XGBClassifier(nthread = 8),
                               param_distributions = random_grid,
                               n_iter = 100,
                               scoring = 'roc_auc',
                               cv = 5, 
                               verbose = 3,
                               random_state=0,
                               n_jobs = -1)

In [17]:
rf_random.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits



2 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\denis\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\denis\Anaconda3\lib\site-packages\xgboost\core.py", line 726, in inner_f
    return func(**kwargs)
  File "C:\Users\denis\Anaconda3\lib\site-packages\xgboost\sklearn.py", line 1512, in fit
    train_dmatrix, evals = _wrap_evaluation_matrices(
  File "C:\Users\denis\Anaconda3\lib\site-packages\xgboost\sklearn.py", line 596, in _wrap_evaluation_matrices
    train_dmatrix = create_dmatrix(
  File "C

In [20]:
rf_random.best_estimator_

In [21]:
rf_random.best_score_

0.6293650793650795

### Logistic regression - Randomized Search CV

In [41]:
penalty = ['l2', 'l1']
C = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 0.75, 1]
class_weight = [None, 'balanced']

# Create the random grid
random_grid = {'penalty': penalty,
               'C': C,
               'class_weight': class_weight
               }

In [42]:
rf_random = RandomizedSearchCV(estimator = LogisticRegression(solver = 'liblinear'),
                               param_distributions = random_grid,
                               n_iter = 500,
                               scoring = 'roc_auc',
                               cv = 10, 
                               verbose = 3,
                               random_state=0,
                               n_jobs = -1)

In [43]:
rf_random.fit(X_train, y_train)

The total space of parameters 32 is smaller than n_iter=500. Running 32 iterations. For exhaustive searches, use GridSearchCV.


Fitting 10 folds for each of 32 candidates, totalling 320 fits


In [44]:
rf_random.best_score_

0.7728438846437669

In [45]:
rf_random.best_estimator_