# Binary Classification of Dengue/Chikungunya Symptoms with Logistic Regression

- Simple algorithm, a test run of a possible classification project of dengue/chikungunya based on the different symptoms and combinations of symptoms of each disease
- Data on Dengue and Chikungunya from SINAN-2024
- Conclusion: performance could be better, lots of dengue cases are classified as chikungunya. Therefore, lots of false positives. However, it isn't entirely bad.

In [1]:
# standard libraries
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# ML libraries and functions
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn import tree
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, RepeatedStratifiedKFold, StratifiedKFold
import graphviz 
from sklearn.metrics import accuracy_score, confusion_matrix,roc_curve, roc_auc_score, precision_score, recall_score, precision_recall_curve
from sklearn.metrics import f1_score

In [2]:
# Loading the data 

df_dengue = pd.read_csv('dengue_SP.csv')
df_chik = pd.read_csv('chik_SP.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'dengue_SP.csv'

In [None]:
# Visualizing the data, making sure of the possible values
df_dengue.describe()

In [None]:
# Removing the cases under investigation
df_dengue = df_dengue[df_dengue['CRITERIO'] != 3]
df_chik = df_chik[df_chik['CRITERIO'] != 3]

In [None]:
print(df_dengue.shape)
print(df_chik.shape)

In [None]:
# Separating the laboratorial vs the clinic/epidemiologic criteria

df_dengue_lab = df_dengue[df_dengue['CRITERIO'] == 1]
df_chik_lab = df_chik[df_chik['CRITERIO'] == 1]

df_dengue_epi = df_dengue[df_dengue['CRITERIO'] == 2]
df_chik_epi = df_chik[df_chik['CRITERIO'] == 2]

In [None]:
df_dengue_lab.shape

In [None]:
df_chik_lab.shape

In [None]:
851496/5920

In [None]:
# Doing some extra pre-processing

## removing the criteria
df_dengue_lab = df_dengue_lab.drop(columns = ['CRITERIO'])
df_chik_lab = df_chik_lab.drop(columns = ['CRITERIO'])
df_dengue_epi = df_dengue_epi.drop(columns = ['CRITERIO'])
df_chik_epi = df_chik_epi.drop(columns = ['CRITERIO'])

## replacing all 2 with 0 in the symptoms
df_dengue_lab = df_dengue_lab.replace(2,0)
df_chik_lab = df_chik_lab.replace(2,0)
df_dengue_epi = df_dengue_epi.replace(2,0)
df_chik_epi = df_chik_epi.replace(2,0)

In [None]:
df_dengue_lab.sum(axis = 0)

In [None]:
df_chik_lab.sum(axis = 0)

In [None]:
# Defining a new variable indicating for dengue
df_dengue_lab['CHIK'] = 0
df_dengue_epi['CHIK'] = 0
df_chik_lab['CHIK'] = 1
df_chik_epi['CHIK'] = 1

In [None]:
# Defining the full laboratorial dataset
df_lab = pd.concat([df_dengue_lab,df_chik_lab])

# Defining the full epidemiological dataset
df_epi = pd.concat([df_dengue_epi,df_chik_epi])

## Defining Train and Test Datasets

In [None]:
# Defining target and features
X = df_lab.drop(columns = ['CHIK'])
y = df_lab['CHIK']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size = 0.3)

In [None]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

## Training Logistic Regression

In [21]:
# define model
lg1 = LogisticRegression(random_state=13, class_weight=None)
# fit it
lg1.fit(X_train,y_train)
# test
y_pred = lg1.predict(X_test)
# performance'
print(f'Accuracy Score: {accuracy_score(y_test,y_pred)}')
print(f'Confusion Matrix: \n{confusion_matrix(y_test, y_pred)}')
print(f'Area Under Curve: {roc_auc_score(y_test, y_pred)}')
print(f'Recall score: {recall_score(y_test,y_pred)}')

Accuracy Score: 0.9930955389250656
Confusion Matrix: 
[[255449      0]
 [  1776      0]]
Area Under Curve: 0.5
Recall score: 0.0


### Weighted logistic regression

In [22]:
# define class weights
w = {0:1, 1:150}
# define model
lg2 = LogisticRegression(random_state=13, class_weight=w)
# fit it
lg2.fit(X_train,y_train)
# test
y_pred = lg2.predict(X_test)
# performance
print(f'Accuracy Score: {accuracy_score(y_test,y_pred)}')
print(f'Confusion Matrix: \n{confusion_matrix(y_test, y_pred)}')
print(f'Area Under Curve: {roc_auc_score(y_test, y_pred)}')
print(f'Recall score: {recall_score(y_test,y_pred)}')

Accuracy Score: 0.7867586743123725
Confusion Matrix: 
[[201221  54228]
 [   623   1153]]
Area Under Curve: 0.7184633381713081
Recall score: 0.6492117117117117


### Grid Search logistic regression

In [24]:
# define weight hyperparameter
w = [{0:1,1:100},{0:1,1:200},{0:1,1:500},{0:1,1:600}]
hyperparam_grid = {"class_weight": w }

# define model
lg3 = LogisticRegression(random_state=13)
# define evaluation procedure
grid = GridSearchCV(lg3,hyperparam_grid,scoring="roc_auc", n_jobs=1, refit=True)
grid.fit(X_train,y_train)
print(f'Best score: {grid.best_score_} with param: {grid.best_params_}')

Best score: 0.769782618398816 with param: {'class_weight': {0: 1, 1: 500}}


### Hyperparameter search logistic regression

In [52]:
# define hyperparameters
w = [{0:1,1:100},{0:1,1:200},{0:1,1:500}]
crange = np.arange(0.5,20.0, 0.5)
hyperparam_grid = {"class_weight": w
                   ,"C": crange
                   ,"fit_intercept": [True, False]  }

# logistic model classifier
lg4 = LogisticRegression(random_state=13)
# define evaluation procedure
grid = GridSearchCV(lg4,hyperparam_grid,scoring="roc_auc", n_jobs=1, refit=True)
grid.fit(X_train,y_train)
print(f'Best score: {grid.best_score_} with param: {grid.best_params_}')

Best score: 0.769547981917546 with param: {'C': 2.0, 'class_weight': {0: 1, 1: 500}, 'fit_intercept': True}


In [53]:
y_pred = grid.predict(X_test)
# performance
print(f'Accuracy Score: {accuracy_score(y_test,y_pred)}')
print(f'Confusion Matrix: \n{confusion_matrix(y_test, y_pred)}')
print(f'Area Under Curve: {roc_auc_score(y_test, y_pred)}')
print(f'Recall score: {recall_score(y_test,y_pred)}')

Accuracy Score: 0.1083331713480416
Confusion Matrix: 
[[ 26148 229301]
 [    58   1718]]
Area Under Curve: 0.5348516416369001
Recall score: 0.9673423423423423


## Doing a counterfactual of all cases 

### Number of chik and dengue - total

In [24]:
X_lab = df_lab.drop(columns = ['CHIK'])
X_epi = df_epi.drop(columns = ['CHIK'])
y_lab = df_lab['CHIK']
y_epi = df_epi['CHIK']

In [27]:
print('Numbers from SINAN')
print('Dengue total - total: ', np.sum(y_lab == 0) + np.sum(y_epi == 0))
print('Chik total - total: ', np.sum(y_lab == 1) + np.sum(y_epi == 1))

Numbers from SINAN
Dengue total - total:  1754905
Chik total - total:  1150


In [28]:
y_pred_lab = grid.predict(X_lab) 
y_pred_epi = grid.predict(X_epi)

In [32]:
print('Numbers from Model')
print('Dengue total - total: ', np.sum(y_pred_lab == 0) + np.sum(y_pred_epi == 0))
print('Chik total - total: ', np.sum(y_pred_lab == 1) + np.sum(y_pred_epi == 1))

Numbers from Model
Dengue total - total:  1420752
Chik total - total:  335303
