# Binary Classification of Dengue/Chikungunya Symptoms with Logistic Regression

- Simple algorithm, a test run of a possible classification project of dengue/chikungunya based on the different symptoms and combinations of symptoms of each disease. Decision Tree in this case.
- Data on Dengue and Chikungunya from SINAN-2024


In [1]:
# standard libraries
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# ML libraries and functions
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn import tree
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, RepeatedStratifiedKFold, StratifiedKFold
import graphviz 
from sklearn.metrics import accuracy_score, confusion_matrix,roc_curve, roc_auc_score, precision_score, recall_score, precision_recall_curve
from sklearn.metrics import f1_score

In [2]:
# Loading the data 

df_dengue = pd.read_csv('dengue_SP.csv')
df_chik = pd.read_csv('chik_SP.csv')

In [3]:
# Visualizing the data, making sure of the possible values
df_dengue.describe()

Unnamed: 0,CRITERIO,FEBRE,MIALGIA,CEFALEIA,EXANTEMA,VOMITO,NAUSEA,DOR_COSTAS,CONJUNTVIT,ARTRITE,ARTRALGIA,PETEQUIA_N,LEUCOPENIA,LACO,DOR_RETRO
count,1767186.0,1767186.0,1767186.0,1767186.0,1767186.0,1767186.0,1767186.0,1767186.0,1767186.0,1767186.0,1767186.0,1767186.0,1767186.0,1767186.0,1767186.0
mean,1.525112,1.135946,1.220604,1.206035,1.9095,1.740793,1.564364,1.669362,1.962446,1.903149,1.870194,1.941132,1.957934,1.960215,1.698508
std,0.5130969,0.3427308,0.4146539,0.404456,0.2868966,0.4381995,0.4958401,0.4704431,0.1901151,0.2957543,0.3360898,0.2353768,0.2007392,0.1954531,0.4589061
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0
50%,2.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
75%,2.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
max,3.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0


In [4]:
# Removing the cases under investigation
df_dengue = df_dengue[df_dengue['CRITERIO'] != 3]
df_chik = df_chik[df_chik['CRITERIO'] != 3]

In [5]:
print(df_dengue.shape)
print(df_chik.shape)

(1754905, 15)
(6495, 15)


In [6]:
# Separating the laboratorial vs the clinic/epidemiologic criteria

df_dengue_lab = df_dengue[df_dengue['CRITERIO'] == 1]
df_chik_lab = df_chik[df_chik['CRITERIO'] == 1]

df_dengue_epi = df_dengue[df_dengue['CRITERIO'] == 2]
df_chik_epi = df_chik[df_chik['CRITERIO'] == 2]

In [7]:
# Doing some extra pre-processing

## removing the criteria
df_dengue_lab = df_dengue_lab.drop(columns = ['CRITERIO'])
df_chik_lab = df_chik_lab.drop(columns = ['CRITERIO'])
df_dengue_epi = df_dengue_epi.drop(columns = ['CRITERIO'])
df_chik_epi = df_chik_epi.drop(columns = ['CRITERIO'])

## replacing all 2 with 0 in the symptoms
df_dengue_lab = df_dengue_lab.replace(2,0)
df_chik_lab = df_chik_lab.replace(2,0)
df_dengue_epi = df_dengue_epi.replace(2,0)
df_chik_epi = df_chik_epi.replace(2,0)

In [8]:
# Defining a new variable indicating for dengue
df_dengue_lab['CHIK'] = 0
df_dengue_epi['CHIK'] = 0
df_chik_lab['CHIK'] = 1
df_chik_epi['CHIK'] = 1

In [9]:
# Defining the full laboratorial dataset
df_lab = pd.concat([df_dengue_lab,df_chik_lab])

# Defining the full epidemiological dataset
df_epi = pd.concat([df_dengue_epi,df_chik_epi])

In [10]:
df_lab = df_lab[df_lab['DOR_COSTAS'] == 1]

In [11]:
df_lab = df_lab.sort_values(by = ['FEBRE', 'MIALGIA', 'CEFALEIA', 'EXANTEMA', 'VOMITO', 'NAUSEA',
       'DOR_COSTAS', 'CONJUNTVIT', 'ARTRITE', 'ARTRALGIA', 'PETEQUIA_N',
       'LEUCOPENIA', 'LACO', 'DOR_RETRO'])

## Defining Train and Test Datasets

In [12]:
# Defining target and features
X = df_lab.drop(columns = ['CHIK'])
y = df_lab['CHIK']

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size = 0.3)

In [14]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(209605, 14) (209605,)
(89831, 14) (89831,)


## Training XGBoost Classifier

### Simple XGBoost

In [15]:
clx = XGBClassifier()
clx.fit(X_train, y_train)

In [16]:
y_pred = clx.predict(X_test)
y_pred_train = clx.predict(X_train)

In [17]:
print(f'Accuracy Score: {accuracy_score(y_test,y_pred)}')
print(f'Confusion Matrix: \n{confusion_matrix(y_test, y_pred)}')
print(f'Area Under Curve: {roc_auc_score(y_test, y_pred)}')
print(f'Recall score: {recall_score(y_test,y_pred)}')

Accuracy Score: 0.9909274081330498
Confusion Matrix: 
[[89015     1]
 [  814     1]]
Area Under Curve: 0.5006078799647792
Recall score: 0.001226993865030675


### Weighted XGBoost (basic)

In [18]:
clx = XGBClassifier(scale_pos_weight=150, max_depth = 2, learning_rate = 0.01)
clx.fit(X_train, y_train)

In [19]:
y_pred = clx.predict(X_test)
y_pred_train = clx.predict(X_train)

In [20]:
print(f'Accuracy Score: {accuracy_score(y_test,y_pred)}')
print(f'Confusion Matrix: \n{confusion_matrix(y_test, y_pred)}')
print(f'Area Under Curve: {roc_auc_score(y_test, y_pred)}')
print(f'Recall score: {recall_score(y_test,y_pred)}')

Accuracy Score: 0.6619318498068596
Confusion Matrix: 
[[58888 30128]
 [  241   574]]
Area Under Curve: 0.6829192353094584
Recall score: 0.7042944785276074


## Doing a counterfactual of all cases

### Number of chik and dengue - total

In [21]:
X_lab = df_lab.drop(columns = ['CHIK'])
X_epi = df_epi.drop(columns = ['CHIK'])
y_lab = df_lab['CHIK']
y_epi = df_epi['CHIK']

In [22]:
print('Numbers from SINAN')
print('Dengue total - total: ', np.sum(y_lab == 0) + np.sum(y_epi == 0))
print('Chik total - total: ', np.sum(y_lab == 1) + np.sum(y_epi == 1))

Numbers from SINAN
Dengue total - total:  1200127
Chik total - total:  3293


In [23]:
y_pred_lab = clx.predict(X_lab) 
y_pred_epi = clx.predict(X_epi)

In [24]:
print('Numbers from Model')
print('Dengue total - total: ', np.sum(y_pred_lab == 0) + np.sum(y_pred_epi == 0))
print('Chik total - total: ', np.sum(y_pred_lab == 1) + np.sum(y_pred_epi == 1))

Numbers from Model
Dengue total - total:  920491
Chik total - total:  282929
