# Dengue Analysis:
---

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import subprocess
import os
import sys
from utils.utils import (
    print_with_colors,
)

## Download:
---
* Download the dataset from the web if it has not already been downloaded.

In [None]:
# Fetch and unpack the dataset only when the extracted folder is missing, so
# re-running the notebook does not download it again.
os.makedirs("./raw_data", exist_ok=True)

if not os.path.exists("./raw_data/arbovirus_clinical_data"):
    # Download of .zip file
    url = "https://prod-dcd-datasets-cache-zipfiles.s3.eu-west-1.amazonaws.com/2d3kr8zynf-4.zip"
    output = "./raw_data/dataset.zip"
    # check=True raises CalledProcessError on a non-zero exit status, so a failed
    # step stops the cell instead of silently continuing with a missing or
    # partial archive.
    # TLS certificate verification is intentionally left enabled (the previous
    # --no-check-certificate flag accepted any certificate for this download).
    subprocess.run(["wget", "--quiet", url, "-O", output], check=True)

    # Extraction of .zip file
    # -o overwrites files left behind by an interrupted earlier run instead of
    # prompting, which would stall a non-interactive notebook cell.
    subprocess.run(["unzip", "-o", output], check=True)
    subprocess.run(
        ["mv", "./2d3kr8zynf-4", "./raw_data/arbovirus_clinical_data"],
        check=True,
    )
    # The archive is no longer needed once its contents are in place.
    os.remove(output)

## Reading the Dataset:
---
* Using the `chunksize` parameter of `pd.read_csv()` to reduce peak memory usage while reading.

In [None]:
import warnings

# Strings that pandas should parse as NaN. Besides the usual empty/NA tokens,
# the list contains column names.
# NOTE(review): presumably header rows are repeated inside the CSV body and
# this turns them into nulls -- confirm against the raw file. Side effect: any
# genuine cell whose text equals one of these names is also read as NaN.
missing_values = [
    '', ' ', 'NA', 'N/A', 'NULL',
    'ID_AGRAVO', 'DT_NOTIFIC', 'SEM_NOT', 'NU_ANO', 'SG_UF_NOT',
    'ID_MUNICIP', 'ID_REGIONA', 'ID_UNIDADE', 'DT_SIN_PRI', 'SEM_PRI',
    'DT_NASC', 'NU_IDADE_N', 'CS_SEXO', 'CS_GESTANT', 'CS_RACA',
    'CS_ESCOL_N', 'SG_UF', 'ID_MN_RESI', 'ID_RG_RESI', 'ID_PAIS',
    'DT_INVEST', 'FEBRE', 'MIALGIA', 'CEFALEIA', 'EXANTEMA',
    'VOMITO', 'NAUSEA', 'DOR_COSTAS', 'CONJUNTVIT', 'ARTRITE',
    'ARTRALGIA', 'PETEQUIA_N', 'LEUCOPENIA', 'LACO', 'DOR_RETRO',
    'DIABETES', 'HEMATOLOG', 'HEPATOPAT', 'RENAL', 'HIPERTENSA',
    'ACIDO_PEPT', 'AUTO_IMUNE', 'RESUL_SORO', 'RESUL_NS1', 'RESUL_VI_N',
    'RESUL_PCR_', 'HISTOPA_N', 'IMUNOH_N', 'HOSPITALIZ', 'TPAUTOCTO',
    'COUFINF', 'COPAISINF', 'COMUNINF', 'CLASSI_FIN', 'EVOLUCAO', 'DT_ENCERRA'
]

# catch_warnings() restores the previous warning filters on exit, so warnings
# are silenced only while the file is parsed and the kernel-wide filter
# configuration is left as it was.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    # Low memory safe reading of the CSV file
    splitted_df = pd.read_csv(
        './raw_data/arbovirus_clinical_data/dengue.csv',
        sep=',',
        header=0,
        na_values=missing_values,
        chunksize=100_000,
    )

    # Concatenate all chunks into a single DataFrame. The reader is lazy, so the
    # actual parsing happens here and must stay inside the warnings context.
    dengue_df = pd.concat(splitted_df, ignore_index=True)

* The file `attributes.csv` has important information about the features

In [None]:
# Load the data dictionary that ships with the dataset.
attributes = pd.read_csv(
    "raw_data/arbovirus_clinical_data/attributes.csv",
    sep=",",
    header=0,
    low_memory=False,
)
# Forward-fill so each blank cell takes the nearest non-null value above it.
# NOTE(review): presumably Attribute/Description are only written on the first
# row of each attribute -- confirm against the CSV.
attributes = attributes.ffill()
# Collapse to one row per (Attribute, Description), joining its values with "; ".
values_per_attribute = attributes.groupby(["Attribute", "Description"])["Value"]
attributes = values_per_attribute.apply('; '.join).reset_index(name="Values")

## Pre Processing
---
### Null Data Removal:
* Features in which 60% or more of the values are null are dropped.
* Also, the columns `["CS_FLXRET", "TP_SISTEMA", "CRITERIO", "TP_NOT", "Unnamed: 0"]` don't contain useful information, so they are dropped as well.

In [None]:
# Keep only the columns whose fraction of null values is below 60%.
null_fraction = dengue_df.isnull().mean()
dengue_df = dengue_df.loc[:, null_fraction < .60]

# errors="ignore" keeps this cell re-runnable: a listed column may already be
# gone, either from a previous run of this cell or because the null filter
# above removed it, and a plain drop() would then raise KeyError.
dengue_df = dengue_df.drop(
    columns=["CS_FLXRET", "TP_SISTEMA", "CRITERIO", "TP_NOT", "Unnamed: 0"],
    errors="ignore",
)

* Printing unique values for each feature to check their data type.

In [None]:
# Attribute names described in the data dictionary. Built once as a set so the
# membership test below does not rebuild and scan a list for every column.
documented_attributes = set(attributes["Attribute"])

for col in dengue_df.columns.to_list():
    if str(col) in documented_attributes:
        # unique() scans the whole column, so compute it once and reuse it.
        unique_values = dengue_df[col].unique()
        print(f"Column '{col}' has {unique_values.size} unique values.")
        if unique_values.size < 50:
            print(unique_values, end="\n\n")
        else:
            print("To many unique values, skipping...", end="\n\n")
    else:
        print_with_colors(f"Column '{col}' not in attributes. Skipping display...", "yellow", end="\n\n")

### Standardization of column values:
* Since the system has changed over the years, multiple codes have been used to represent the same type of Dengue. The cell below standardizes these codes.

In [None]:
# Work on an object column so numeric codes can be overwritten with text labels.
dengue_df['CLASSI_FIN'] = dengue_df['CLASSI_FIN'].astype('object')

# Classification code -> unified label. Several codes share a label; codes not
# listed here are left untouched.
classification_labels = {
    1: 'Dengue',
    10: 'Dengue',
    3: 'Dengue Grave',
    4: 'Dengue Grave',
    12: 'Dengue Grave',
    2: 'Dengue com sinais de alarme',
    11: 'Dengue com sinais de alarme',
    # Discarded/Inconclusive
    5: 'Discarded/Inconclusive',
    6: 'Discarded/Inconclusive',
    8: 'Discarded/Inconclusive',
}
for code, label in classification_labels.items():
    dengue_df.loc[dengue_df['CLASSI_FIN'] == code, 'CLASSI_FIN'] = label

# Rows without a classification are treated as discarded/inconclusive, then the
# column is stored as a categorical.
dengue_df['CLASSI_FIN'] = (
    dengue_df['CLASSI_FIN']
    .fillna('Discarded/Inconclusive')
    .astype('category')
)

### Null data padding with default values:
The remaining attributes that still had null values were filled with the default values defined in the data dictionary.

In [None]:
# Exam-result columns: nulls are replaced with the code 4.
exam_cols = [
    "RESUL_SORO",
    "RESUL_NS1",
    "RESUL_VI_N",
    "RESUL_PCR_",
    "HISTOPA_N",
    "IMUNOH_N"
]
for col in exam_cols:
    # Skip exam columns that an earlier pruning step may have removed, mirroring
    # the presence check applied to the date columns later in the notebook.
    if col not in dengue_df.columns:
        continue
    # Build the null mask once and reuse it for both the test and the assignment.
    missing = dengue_df[col].isnull()
    if missing.any():
        dengue_df.loc[missing, col] = 4

dengue_df['CS_SEXO'] = dengue_df['CS_SEXO'].fillna('I')

# In the other attributes, the value of "not informed" is 9.
# NOTE(review): this selection also covers numeric columns such as NU_IDADE_N,
# SEM_NOT, SEM_PRI and NU_ANO when they contain nulls, so a missing value there
# becomes the number 9 -- confirm this is intended.
columns_to_be_filled = [
    col
    for col in dengue_df.columns
    if col not in exam_cols
    and 'DT_' not in str(col) # for datetime columns it doesn't make sense
    and str(col) != 'CS_SEXO' # CS_SEXO has the special value 'I' for NaNs
]
for col in columns_to_be_filled:
    missing = dengue_df[col].isnull()
    if missing.any():
        dengue_df.loc[missing, col] = 9


Removing the column `ID_AGRAVO`, because it has a single fixed value: `A90`.

In [None]:
# errors='ignore' makes the cell safe to re-run: without it a second execution
# raises KeyError because the column has already been removed.
dengue_df = dengue_df.drop(columns=['ID_AGRAVO'], errors='ignore')

In [None]:
# Target dtype per column, passed to `DataFrame.astype` further down.
# Every listed column starts as 'category'; the few exceptions are overridden
# below. Updating existing keys keeps their original position in the mapping.
dtypes = dict.fromkeys(
    [
        'SEM_NOT', 'NU_ANO', 'SG_UF_NOT', 'ID_MUNICIP', 'ID_REGIONA',
        'ID_UNIDADE', 'SEM_PRI', 'NU_IDADE_N', 'CS_SEXO', 'CS_GESTANT',
        'CS_RACA', 'CS_ESCOL_N', 'SG_UF', 'ID_MN_RESI', 'ID_RG_RESI',
        'ID_PAIS', 'FEBRE', 'MIALGIA', 'CEFALEIA', 'EXANTEMA',
        'VOMITO', 'NAUSEA', 'DOR_COSTAS', 'CONJUNTVIT', 'ARTRITE',
        'ARTRALGIA', 'PETEQUIA_N', 'LEUCOPENIA', 'LACO', 'DOR_RETRO',
        'DIABETES', 'HEMATOLOG', 'HEPATOPAT', 'RENAL', 'HIPERTENSA',
        'ACIDO_PEPT', 'AUTO_IMUNE', 'RESUL_SORO', 'RESUL_NS1', 'RESUL_VI_N',
        'RESUL_PCR_', 'HISTOPA_N', 'IMUNOH_N', 'HOSPITALIZ', 'TPAUTOCTO',
        'COUFINF', 'COPAISINF', 'COMUNINF', 'CLASSI_FIN', 'EVOLUCAO',
    ],
    'category',
)
dtypes.update({
    'SEM_NOT': 'int32',
    'NU_ANO': 'int16',
    'SEM_PRI': 'int32',
    # NOTE(review): int8 only holds -128..127 -- confirm NU_IDADE_N never exceeds that.
    'NU_IDADE_N': 'int8',
    # NOTE(review): the standardization cell stores CLASSI_FIN as 'category';
    # this entry casts it back to 'object' -- confirm which one is intended.
    'CLASSI_FIN': 'object',
})

### Setting dtypes to columns:

In [None]:
date_cols = ['DT_NOTIFIC', 'DT_SIN_PRI', 'DT_NASC', 'DT_INVEST', 'DT_ENCERRA']
# Parse only the date columns that are still present; values that cannot be
# parsed become NaT because of errors='coerce'.
for col in [name for name in date_cols if name in dengue_df.columns]:
    dengue_df[col] = pd.to_datetime(dengue_df[col], errors='coerce')


def _without_dashes(value):
    """Return `value` with every '-' removed if it is a string, else unchanged."""
    if isinstance(value, str):
        return value.replace('-', '')
    return value


# Strip dashes from string entries of SEM_PRI before the integer cast below.
dengue_df['SEM_PRI'] = dengue_df['SEM_PRI'].apply(_without_dashes)
# Apply the per-column target dtypes defined in `dtypes`.
dengue_df = dengue_df.astype(dtypes)