# Binary Classification of Dengue/Chikungunya Symptoms with Logistic Regression

- Simple algorithm, a test run of a possible classification project of dengue/chikungunya based on the different symptoms and combinations of symptoms of each disease. Decision Tree in this case.
- Data on Dengue and Chikungunya from SINAN-2024


In [1]:
# standard libraries
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# ML libraries and functions
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn import tree
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, RepeatedStratifiedKFold, StratifiedKFold
import graphviz 
from sklearn.metrics import accuracy_score, confusion_matrix,roc_curve, roc_auc_score, precision_score, recall_score, precision_recall_curve
from sklearn.metrics import f1_score

In [2]:
# Loading the data 

df_dengue = pd.read_csv('dengue_SP.csv')
df_chik = pd.read_csv('chik_SP.csv')

In [3]:
# Visualizing the data, making sure of the possible values
df_dengue.describe()

Unnamed: 0,CRITERIO,FEBRE,MIALGIA,CEFALEIA,EXANTEMA,VOMITO,NAUSEA,DOR_COSTAS,CONJUNTVIT,ARTRITE,ARTRALGIA,PETEQUIA_N,LEUCOPENIA,LACO,DOR_RETRO
count,1767186.0,1767186.0,1767186.0,1767186.0,1767186.0,1767186.0,1767186.0,1767186.0,1767186.0,1767186.0,1767186.0,1767186.0,1767186.0,1767186.0,1767186.0
mean,1.525112,1.135946,1.220604,1.206035,1.9095,1.740793,1.564364,1.669362,1.962446,1.903149,1.870194,1.941132,1.957934,1.960215,1.698508
std,0.5130969,0.3427308,0.4146539,0.404456,0.2868966,0.4381995,0.4958401,0.4704431,0.1901151,0.2957543,0.3360898,0.2353768,0.2007392,0.1954531,0.4589061
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0
50%,2.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
75%,2.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
max,3.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0


In [5]:
# Removing the cases under investigation
df_dengue = df_dengue[df_dengue['CRITERIO'] != 3]
df_chik = df_chik[df_chik['CRITERIO'] != 3]

In [6]:
print(df_dengue.shape)
print(df_chik.shape)

(1754905, 15)
(6495, 15)


In [7]:
# Separating the laboratorial vs the clinic/epidemiologic criteria

df_dengue_lab = df_dengue[df_dengue['CRITERIO'] == 1]
df_chik_lab = df_chik[df_chik['CRITERIO'] == 1]

df_dengue_epi = df_dengue[df_dengue['CRITERIO'] == 2]
df_chik_epi = df_chik[df_chik['CRITERIO'] == 2]

In [8]:
# Doing some extra pre-processing

## removing the criteria
df_dengue_lab = df_dengue_lab.drop(columns = ['CRITERIO'])
df_chik_lab = df_chik_lab.drop(columns = ['CRITERIO'])
df_dengue_epi = df_dengue_epi.drop(columns = ['CRITERIO'])
df_chik_epi = df_chik_epi.drop(columns = ['CRITERIO'])

## replacing all 2 with 0 in the symptoms
df_dengue_lab = df_dengue_lab.replace(2,0)
df_chik_lab = df_chik_lab.replace(2,0)
df_dengue_epi = df_dengue_epi.replace(2,0)
df_chik_epi = df_chik_epi.replace(2,0)

In [9]:
# Defining a new variable indicating for dengue
df_dengue_lab['CHIK'] = 0
df_dengue_epi['CHIK'] = 0
df_chik_lab['CHIK'] = 1
df_chik_epi['CHIK'] = 1

In [10]:
# Defining the full laboratorial dataset
df_lab = pd.concat([df_dengue_lab,df_chik_lab])

# Defining the full epidemiological dataset
df_epi = pd.concat([df_dengue_epi,df_chik_epi])

## Defining Train and Test Datasets

In [11]:
# Defining target and features
X = df_lab.drop(columns = ['CHIK'])
y = df_lab['CHIK']

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size = 0.3)

In [13]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(600191, 14) (600191,)
(257225, 14) (257225,)


## Training Random Forest

### Simple Random Forest

In [14]:
rfc = RandomForestClassifier(random_state = 13, max_depth = 3)
rfc.fit(X_train ,y_train)
y_pred = rfc.predict(X_test)
y_pred_train = rfc.predict(X_train)

In [15]:
print(f'Accuracy Score: {accuracy_score(y_test,y_pred)}')
print(f'Confusion Matrix: \n{confusion_matrix(y_test, y_pred)}')
print(f'Area Under Curve: {roc_auc_score(y_test, y_pred)}')
print(f'Recall score: {recall_score(y_test,y_pred)}')

Accuracy Score: 0.9930955389250656
Confusion Matrix: 
[[255449      0]
 [  1776      0]]
Area Under Curve: 0.5
Recall score: 0.0


### Weighted Random Forest

In [16]:
weights = {0: 1, 1: 150}

rfc = RandomForestClassifier(random_state = 13, max_depth = 3, n_estimators = 200, class_weight = weights)
rfc.fit(X_train ,y_train)
y_pred = rfc.predict(X_test)
y_pred_train = rfc.predict(X_train)

In [17]:
print(f'Accuracy Score: {accuracy_score(y_test,y_pred)}')
print(f'Confusion Matrix: \n{confusion_matrix(y_test, y_pred)}')
print(f'Area Under Curve: {roc_auc_score(y_test, y_pred)}')
print(f'Recall score: {recall_score(y_test,y_pred)}')

Accuracy Score: 0.8000583147050248
Confusion Matrix: 
[[204670  50779]
 [   651   1125]]
Area Under Curve: 0.7173313135810787
Recall score: 0.6334459459459459


### Optimization Random  Forest

In [19]:
weights = [{0: 1, 1: 150}, {0:1, 1:140}, {0:1, 1: 120}]
n_estimators_range = np.array([100, 200, 500])
max_depth_range = np.array([2, 3, 4])
hyperparam_grid = {"class_weight": weights, "n_estimators": n_estimators_range, "max_depth": max_depth_range}

rfc = RandomForestClassifier(random_state = 13)
grid = GridSearchCV(rfc,hyperparam_grid,scoring="roc_auc", n_jobs=1, refit=True)
grid.fit(X_train,y_train)

In [20]:
print(f'Best score: {grid.best_score_} with param: {grid.best_params_}')

Best score: 0.7688745981791011 with param: {'class_weight': {0: 1, 1: 120}, 'max_depth': 4, 'n_estimators': 200}


In [21]:
y_pred = grid.predict(X_test)
# performance
print(f'Accuracy Score: {accuracy_score(y_test,y_pred)}')
print(f'Confusion Matrix: \n{confusion_matrix(y_test, y_pred)}')
print(f'Area Under Curve: {roc_auc_score(y_test, y_pred)}')
print(f'Recall score: {recall_score(y_test,y_pred)}')

Accuracy Score: 0.825981144912042
Confusion Matrix: 
[[211379  44070]
 [   692   1084]]
Area Under Curve: 0.7189203005173119
Recall score: 0.6103603603603603


## Doing a counterfactual of all cases

### Number of chik and dengue - total

In [22]:
X_lab = df_lab.drop(columns = ['CHIK'])
X_epi = df_epi.drop(columns = ['CHIK'])
y_lab = df_lab['CHIK']
y_epi = df_epi['CHIK']

In [23]:
print('Numbers from SINAN')
print('Dengue total - total: ', np.sum(y_lab == 0) + np.sum(y_epi == 0))
print('Chik total - total: ', np.sum(y_lab == 1) + np.sum(y_epi == 1))

Numbers from SINAN
Dengue total - total:  1754905
Chik total - total:  6495


In [24]:
y_pred_lab = grid.predict(X_lab) 
y_pred_epi = grid.predict(X_epi)

In [25]:
print('Numbers from Model')
print('Dengue total - total: ', np.sum(y_pred_lab == 0) + np.sum(y_pred_epi == 0))
print('Chik total - total: ', np.sum(y_pred_lab == 1) + np.sum(y_pred_epi == 1))

Numbers from Model
Dengue total - total:  1449411
Chik total - total:  311989
