In [144]:
# usual imports in a classic ML pipeline for Classification
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# additional metrics ONLY for classification
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

In [145]:
# load the data
df = pd.read_csv("synthetic_asthma_dataset.csv")


# let's quickly see the first 5 rows of data
df

Unnamed: 0,Patient_ID,Age,Gender,BMI,Smoking_Status,Family_History,Allergies,Air_Pollution_Level,Physical_Activity_Level,Occupation_Type,Comorbidities,Medication_Adherence,Number_of_ER_Visits,Peak_Expiratory_Flow,FeNO_Level,Has_Asthma,Asthma_Control_Level
0,ASTH100000,52,Female,27.6,Former,1,,Moderate,Sedentary,Outdoor,Diabetes,0.38,0,421.0,46.0,0,
1,ASTH100001,15,Male,24.6,Former,0,Dust,Low,Moderate,Indoor,Both,0.60,2,297.6,22.9,0,
2,ASTH100002,72,Female,17.6,Never,0,,Moderate,Moderate,Indoor,,0.38,0,303.3,15.3,0,
3,ASTH100003,61,Male,16.8,Never,0,Multiple,High,Sedentary,Outdoor,Both,0.60,1,438.0,40.1,1,Poorly Controlled
4,ASTH100004,21,Male,30.2,Never,0,,Moderate,Active,Indoor,,0.82,3,535.0,27.7,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,ASTH109995,70,Male,25.0,Never,0,,Low,Sedentary,Indoor,,0.67,0,580.6,18.7,0,
9996,ASTH109996,78,Female,24.8,Never,0,Pollen,Low,Moderate,Indoor,Diabetes,0.72,1,417.6,40.8,0,
9997,ASTH109997,58,Male,30.1,Former,1,Pollen,Low,Moderate,Indoor,,0.28,0,459.1,20.3,1,Not Controlled
9998,ASTH109998,88,Female,31.2,Former,0,Pollen,Moderate,Moderate,Indoor,,0.44,0,415.9,25.0,0,


In [146]:
# Check total no of records
df.shape

(10000, 17)

In [147]:
# drop Patient_ID field since it is not required
df.drop(columns=['Patient_ID'], inplace=True)
df.head()

Unnamed: 0,Age,Gender,BMI,Smoking_Status,Family_History,Allergies,Air_Pollution_Level,Physical_Activity_Level,Occupation_Type,Comorbidities,Medication_Adherence,Number_of_ER_Visits,Peak_Expiratory_Flow,FeNO_Level,Has_Asthma,Asthma_Control_Level
0,52,Female,27.6,Former,1,,Moderate,Sedentary,Outdoor,Diabetes,0.38,0,421.0,46.0,0,
1,15,Male,24.6,Former,0,Dust,Low,Moderate,Indoor,Both,0.6,2,297.6,22.9,0,
2,72,Female,17.6,Never,0,,Moderate,Moderate,Indoor,,0.38,0,303.3,15.3,0,
3,61,Male,16.8,Never,0,Multiple,High,Sedentary,Outdoor,Both,0.6,1,438.0,40.1,1,Poorly Controlled
4,21,Male,30.2,Never,0,,Moderate,Active,Indoor,,0.82,3,535.0,27.7,0,


In [148]:
# check duplicates and missing values
df.duplicated().sum()

np.int64(0)

In [149]:
# do we have missing values? apparently not
# dataset can't have any missing values when passing the data
# to the machine learning algorithm
df.isna().sum()

Age                           0
Gender                        0
BMI                           0
Smoking_Status                0
Family_History                0
Allergies                  2936
Air_Pollution_Level           0
Physical_Activity_Level       0
Occupation_Type               0
Comorbidities              4967
Medication_Adherence          0
Number_of_ER_Visits           0
Peak_Expiratory_Flow          0
FeNO_Level                    0
Has_Asthma                    0
Asthma_Control_Level       7567
dtype: int64

In [150]:
# Drop the columns Comorbidities, Asthma_Control_Level
df = df.drop(columns=['Comorbidities', 'Asthma_Control_Level'])
df.isna().sum()

Age                           0
Gender                        0
BMI                           0
Smoking_Status                0
Family_History                0
Allergies                  2936
Air_Pollution_Level           0
Physical_Activity_Level       0
Occupation_Type               0
Medication_Adherence          0
Number_of_ER_Visits           0
Peak_Expiratory_Flow          0
FeNO_Level                    0
Has_Asthma                    0
dtype: int64

In [151]:
# Check value counts for Allergies:
df['Allergies'].value_counts()


Allergies
Dust        2479
Pollen      1999
Pets        1585
Multiple    1001
Name: count, dtype: int64

In [152]:
# Example: replace NaN with None in Allergies
df['Allergies'] = df['Allergies'].fillna("None")


In [153]:
# do we have missing values? apparently not
# dataset can't have any missing values when passing the data
# to the machine learning algorithm
df.isna().sum()

Age                        0
Gender                     0
BMI                        0
Smoking_Status             0
Family_History             0
Allergies                  0
Air_Pollution_Level        0
Physical_Activity_Level    0
Occupation_Type            0
Medication_Adherence       0
Number_of_ER_Visits        0
Peak_Expiratory_Flow       0
FeNO_Level                 0
Has_Asthma                 0
dtype: int64

In [154]:
# Check value counts for Allergies:
df['Allergies'].value_counts()

Allergies
None        2936
Dust        2479
Pollen      1999
Pets        1585
Multiple    1001
Name: count, dtype: int64

In [155]:
df.head()

Unnamed: 0,Age,Gender,BMI,Smoking_Status,Family_History,Allergies,Air_Pollution_Level,Physical_Activity_Level,Occupation_Type,Medication_Adherence,Number_of_ER_Visits,Peak_Expiratory_Flow,FeNO_Level,Has_Asthma
0,52,Female,27.6,Former,1,,Moderate,Sedentary,Outdoor,0.38,0,421.0,46.0,0
1,15,Male,24.6,Former,0,Dust,Low,Moderate,Indoor,0.6,2,297.6,22.9,0
2,72,Female,17.6,Never,0,,Moderate,Moderate,Indoor,0.38,0,303.3,15.3,0
3,61,Male,16.8,Never,0,Multiple,High,Sedentary,Outdoor,0.6,1,438.0,40.1,1
4,21,Male,30.2,Never,0,,Moderate,Active,Indoor,0.82,3,535.0,27.7,0
