In [1]:
import warnings
warnings.filterwarnings('ignore')


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer

In [2]:
df = pd.read_csv("Dengue_dataset.csv")
df.head()

Unnamed: 0,Age,Gender,Fever,Cough,Headache,Body ache,Abdominal pain,Temperature,Vomiting or Nausea,Generalized Weakness,...,Bleeding,Blood pressure(systolic),Blood pressure(diastolic),Pulse,Platelet,RBS,Anaemia,Dehydration,NS1,Unnamed: 20
0,25.0,Male,Yes,No,No,Yes,No,,No,Yes,...,No,100.0,70.0,90.0,45000.0,,Yes,No,Yes,
1,21.0,Female,Yes,No,Yes,Yes,No,,Yes,Yes,...,No,110.0,70.0,68.0,90000.0,5.6,Yes,No,Yes,
2,28.0,Male,Yes,No,Yes,No,Yes,,No,Yes,...,No,110.0,90.0,88.0,,,No,No,Yes,
3,40.0,Female,No,Yes,Yes,No,No,,No,Yes,...,No,120.0,100.0,141.0,76000.0,6.67,No,No,Yes,
4,60.0,Female,Yes,Yes,No,No,Yes,,No,Yes,...,No,100.0,70.0,62.0,55000.0,8.6,Yes,Yes,Yes,


# Renaming Columns

In [3]:
df.drop(["Unnamed: 20"], axis = 1, inplace = True)

df["Systolic"] = df["Blood pressure(systolic)"]
df.drop(["Blood pressure(systolic)"], axis = 1, inplace = True)

df["Diastolic"] = df["Blood pressure(diastolic)"]
df.drop(["Blood pressure(diastolic)"], axis = 1, inplace = True)

df["Weakness"] = df["Generalized Weakness"]
df.drop(["Generalized Weakness"], axis = 1, inplace = True)

df["Vomiting_Nausea"] = df["Vomiting or Nausea"]
df.drop(["Vomiting or Nausea"], axis = 1, inplace = True)

df["Abdominal_pain"] = df["Abdominal pain"]
df.drop(["Abdominal pain"], axis = 1, inplace = True)


df.columns

Index(['Age', 'Gender', 'Fever', 'Cough', 'Headache', 'Body ache',
       'Temperature', 'Diarrhea', 'Bleeding', 'Pulse', 'Platelet', 'RBS',
       'Anaemia', 'Dehydration', 'NS1', 'Systolic', 'Diastolic', 'Weakness',
       'Vomiting_Nausea', 'Abdominal_pain'],
      dtype='object')

# Convert categorical Values

In [4]:
numerical_columns = ['Age','Temperature','Pulse', 'Platelet', 'RBS','Systolic', 'Diastolic']

mapping = {
    "Yes": 1, "No": 0,
    "Male": 1, "Female": 0
}

def convert_values(df, columns):
    for column in columns:
        df[column] = df[column].replace(mapping)

Categorical_columns = [
    "Gender", "Fever", "Cough", "Headache", "Body ache","Abdominal_pain","Vomiting_Nausea", 
    "Weakness","Diarrhea", "Bleeding", "Anaemia", "Dehydration", "NS1"
]

convert_values(df, Categorical_columns)
df.head()

Unnamed: 0,Age,Gender,Fever,Cough,Headache,Body ache,Temperature,Diarrhea,Bleeding,Pulse,Platelet,RBS,Anaemia,Dehydration,NS1,Systolic,Diastolic,Weakness,Vomiting_Nausea,Abdominal_pain
0,25.0,1,1.0,0.0,0.0,1.0,,0.0,0.0,90.0,45000.0,,1.0,0.0,1,100.0,70.0,1.0,0.0,0
1,21.0,0,1.0,0.0,1.0,1.0,,0.0,0.0,68.0,90000.0,5.6,1.0,0.0,1,110.0,70.0,1.0,1.0,0
2,28.0,1,1.0,0.0,1.0,0.0,,1.0,0.0,88.0,,,0.0,0.0,1,110.0,90.0,1.0,0.0,1
3,40.0,0,0.0,1.0,1.0,0.0,,0.0,0.0,141.0,76000.0,6.67,0.0,0.0,1,120.0,100.0,1.0,0.0,0
4,60.0,0,1.0,1.0,0.0,0.0,,1.0,0.0,62.0,55000.0,8.6,1.0,1.0,1,100.0,70.0,1.0,0.0,1


# Filling missing values

In [5]:
df.isna().sum()

Age                  1
Gender               0
Fever                1
Cough                3
Headache             2
Body ache            2
Temperature        220
Diarrhea             3
Bleeding             1
Pulse               30
Platelet           187
RBS                312
Anaemia              2
Dehydration          1
NS1                  0
Systolic             5
Diastolic            4
Weakness             1
Vomiting_Nausea      1
Abdominal_pain       0
dtype: int64

In [6]:
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=5)

strategies = {
    "Age": "mean",                 "Systolic": "mean",           "Diastolic": "mean",
    "Fever": "ffill",              "Cough": "ffill",             "Headache": "ffill", 
    "Body ache": "ffill",          "Diarrhea": "ffill",          "Bleeding": "ffill",
    "Anaemia": "ffill",            "Dehydration": "ffill",       "Weakness": "ffill",
    "Vomiting_Nausea": "ffill",    "Abdominal_pain": "ffill",    "Temperature": "KNN",
    "Platelet": "KNN",             "RBS": "KNN",                 "Pulse": "KNN"
}

for col, strategy in strategies.items():
    if strategy == "mean":
        df[col].fillna(df[col].mean(), inplace=True)
    elif strategy == "ffill":
        df[col].fillna(method=strategy, inplace=True)
    elif strategy == "KNN":
        df[col] = imputer.fit_transform(df[[col]])

# Check for missing values
print(df.isna().sum())

Age                0
Gender             0
Fever              0
Cough              0
Headache           0
Body ache          0
Temperature        0
Diarrhea           0
Bleeding           0
Pulse              0
Platelet           0
RBS                0
Anaemia            0
Dehydration        0
NS1                0
Systolic           0
Diastolic          0
Weakness           0
Vomiting_Nausea    0
Abdominal_pain     0
dtype: int64


In [7]:
df.dtypes

Age                float64
Gender               int64
Fever              float64
Cough              float64
Headache           float64
Body ache          float64
Temperature        float64
Diarrhea           float64
Bleeding           float64
Pulse              float64
Platelet           float64
RBS                float64
Anaemia            float64
Dehydration        float64
NS1                  int64
Systolic           float64
Diastolic          float64
Weakness           float64
Vomiting_Nausea    float64
Abdominal_pain       int64
dtype: object

In [8]:
df.head()

Unnamed: 0,Age,Gender,Fever,Cough,Headache,Body ache,Temperature,Diarrhea,Bleeding,Pulse,Platelet,RBS,Anaemia,Dehydration,NS1,Systolic,Diastolic,Weakness,Vomiting_Nausea,Abdominal_pain
0,25.0,1,1.0,0.0,0.0,1.0,100.101796,0.0,0.0,90.0,45000.0,9.569364,1.0,0.0,1,100.0,70.0,1.0,0.0,0
1,21.0,0,1.0,0.0,1.0,1.0,100.101796,0.0,0.0,68.0,90000.0,5.6,1.0,0.0,1,110.0,70.0,1.0,1.0,0
2,28.0,1,1.0,0.0,1.0,0.0,100.101796,1.0,0.0,88.0,115801.561798,9.569364,0.0,0.0,1,110.0,90.0,1.0,0.0,1
3,40.0,0,0.0,1.0,1.0,0.0,100.101796,0.0,0.0,141.0,76000.0,6.67,0.0,0.0,1,120.0,100.0,1.0,0.0,0
4,60.0,0,1.0,1.0,0.0,0.0,100.101796,1.0,0.0,62.0,55000.0,8.6,1.0,1.0,1,100.0,70.0,1.0,0.0,1


In [9]:
df.to_csv("clean_data.csv")

# Scale Dataset

In [10]:
des_analysis = df[numerical_columns].describe().T
des_analysis.drop(["count"], axis = 1,inplace = True)
des_analysis[["min", "max"]]

Unnamed: 0,min,max
Age,12.0,110.0
Temperature,96.0,105.0
Pulse,40.0,160.0
Platelet,8000.0,2235000.0
RBS,1.3,70.0
Systolic,70.0,240.0
Diastolic,30.0,130.0


# missing value
# outlier detection
# feature selection
# model selection

In [11]:
312+187+200

699

In [12]:
df = pd.read_csv("clean_data.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Age,Gender,Fever,Cough,Headache,Body ache,Temperature,Diarrhea,Bleeding,...,Platelet,RBS,Anaemia,Dehydration,NS1,Systolic,Diastolic,Weakness,Vomiting_Nausea,Abdominal_pain
0,0,25.0,1,1.0,0.0,0.0,1.0,100.101796,0.0,0.0,...,45000.0,9.569364,1.0,0.0,1,100.0,70.0,1.0,0.0,0
1,1,21.0,0,1.0,0.0,1.0,1.0,100.101796,0.0,0.0,...,90000.0,5.6,1.0,0.0,1,110.0,70.0,1.0,1.0,0
2,2,28.0,1,1.0,0.0,1.0,0.0,100.101796,1.0,0.0,...,115801.561798,9.569364,0.0,0.0,1,110.0,90.0,1.0,0.0,1
3,3,40.0,0,0.0,1.0,1.0,0.0,100.101796,0.0,0.0,...,76000.0,6.67,0.0,0.0,1,120.0,100.0,1.0,0.0,0
4,4,60.0,0,1.0,1.0,0.0,0.0,100.101796,1.0,0.0,...,55000.0,8.6,1.0,1.0,1,100.0,70.0,1.0,0.0,1
