In [5]:
import pandas as pd
import matplotlib.pyplot as plt

In [6]:
# Read the asthma patient records from the CSV stored relative to this notebook.
csv_path = "../data/asthma_disease_data.csv"
data = pd.read_csv(csv_path)

In [7]:
# Preview the loaded frame; the rendering is truncated to the leading and trailing rows.
data

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,PhysicalActivity,DietQuality,SleepQuality,...,LungFunctionFEV1,LungFunctionFVC,Wheezing,ShortnessOfBreath,ChestTightness,Coughing,NighttimeSymptoms,ExerciseInduced,Diagnosis,DoctorInCharge
0,5034,63,0,1,0,15.848744,0,0.894448,5.488696,8.701003,...,1.369051,4.941206,0,0,1,0,0,1,0,Dr_Confid
1,5035,26,1,2,2,22.757042,0,5.897329,6.341014,5.153966,...,2.197767,1.702393,1,0,0,1,1,1,0,Dr_Confid
2,5036,57,0,2,1,18.395396,0,6.739367,9.196237,6.840647,...,1.698011,5.022553,1,1,1,0,1,1,0,Dr_Confid
3,5037,40,1,2,1,38.515278,0,1.404503,5.826532,4.253036,...,3.032037,2.300159,1,0,1,1,1,0,0,Dr_Confid
4,5038,61,0,0,3,19.283802,0,4.604493,3.127048,9.625799,...,3.470589,3.067944,1,1,1,0,0,1,0,Dr_Confid
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2387,7421,43,1,0,2,29.059613,0,3.019854,6.119637,8.300960,...,3.125249,5.166032,0,1,0,0,0,1,1,Dr_Confid
2388,7422,18,1,0,1,20.740850,0,5.805180,4.386992,7.731192,...,1.132977,5.509502,0,0,0,1,1,0,1,Dr_Confid
2389,7423,54,0,3,2,37.079560,0,4.735169,8.214064,7.483521,...,1.685962,3.346877,1,0,1,1,0,1,1,Dr_Confid
2390,7424,46,1,0,2,23.444712,0,9.672637,7.362861,6.717272,...,3.481549,1.713274,0,1,1,0,1,1,0,Dr_Confid


In [8]:
# Summarize each column's non-null count and dtype.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2392 entries, 0 to 2391
Data columns (total 29 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   PatientID               2392 non-null   int64  
 1   Age                     2392 non-null   int64  
 2   Gender                  2392 non-null   int64  
 3   Ethnicity               2392 non-null   int64  
 4   EducationLevel          2392 non-null   int64  
 5   BMI                     2392 non-null   float64
 6   Smoking                 2392 non-null   int64  
 7   PhysicalActivity        2392 non-null   float64
 8   DietQuality             2392 non-null   float64
 9   SleepQuality            2392 non-null   float64
 10  PollutionExposure       2392 non-null   float64
 11  PollenExposure          2392 non-null   float64
 12  DustExposure            2392 non-null   float64
 13  PetAllergy              2392 non-null   int64  
 14  FamilyHistoryAsthma     2392 non-null   

The dataset has no null values, and every column is numeric except `DoctorInCharge`, which holds text. Several of the integer columns contain only boolean values, so they could be converted from int64 to bool. Let's find all columns that contain boolean values.

In [10]:
# Treat a column as boolean-like when it holds exactly two distinct values,
# and print the frequency of each value for every such column.
for column in data.columns:
    series = data[column]
    unique_count = series.nunique()

    # Skip anything that is not two-valued.
    if unique_count != 2:
        continue

    print(f"Value counts for column '{column}' (Unique values: {unique_count}):")
    print(series.value_counts())
    print()

Value counts for column 'Gender' (Unique values: 2):
Gender
0    1212
1    1180
Name: count, dtype: int64

Value counts for column 'Smoking' (Unique values: 2):
Smoking
0    2053
1     339
Name: count, dtype: int64

Value counts for column 'PetAllergy' (Unique values: 2):
PetAllergy
0    1995
1     397
Name: count, dtype: int64

Value counts for column 'FamilyHistoryAsthma' (Unique values: 2):
FamilyHistoryAsthma
0    1672
1     720
Name: count, dtype: int64

Value counts for column 'HistoryOfAllergies' (Unique values: 2):
HistoryOfAllergies
0    1437
1     955
Name: count, dtype: int64

Value counts for column 'Eczema' (Unique values: 2):
Eczema
0    1933
1     459
Name: count, dtype: int64

Value counts for column 'HayFever' (Unique values: 2):
HayFever
0    1786
1     606
Name: count, dtype: int64

Value counts for column 'GastroesophagealReflux' (Unique values: 2):
GastroesophagealReflux
0    2014
1     378
Name: count, dtype: int64

Value counts for column 'Wheezing' (Unique value

It looks like quite a lot of columns contain boolean data. Most of them are symptoms that a patient either has or does not have, while the rest are factors that could contribute to the disease. Among the symptoms, shortness of breath, coughing, and chest tightness are the most common.

In [12]:
# List the column names to review the naming convention.
data.columns

Index(['PatientID', 'Age', 'Gender', 'Ethnicity', 'EducationLevel', 'BMI',
       'Smoking', 'PhysicalActivity', 'DietQuality', 'SleepQuality',
       'PollutionExposure', 'PollenExposure', 'DustExposure', 'PetAllergy',
       'FamilyHistoryAsthma', 'HistoryOfAllergies', 'Eczema', 'HayFever',
       'GastroesophagealReflux', 'LungFunctionFEV1', 'LungFunctionFVC',
       'Wheezing', 'ShortnessOfBreath', 'ChestTightness', 'Coughing',
       'NighttimeSymptoms', 'ExerciseInduced', 'Diagnosis', 'DoctorInCharge'],
      dtype='object')

The column names are currently in Pascal case (upper camel case). I will rename them to snake case later.

In [14]:
# Ethnicity is stored as integer codes with no labels, so the counts alone
# do not tell us which ethnicity each code stands for.
data["Ethnicity"].value_counts()

Ethnicity
0    1465
1     475
2     229
3     223
Name: count, dtype: int64

In [15]:
# EducationLevel is also stored as integer codes with no labels, so the
# counts do not reveal which education level each code stands for.
data["EducationLevel"].value_counts()

EducationLevel
1    933
2    749
0    478
3    232
Name: count, dtype: int64

In [16]:
# DoctorInCharge holds the same value for every patient, so it says nothing
# about who treats whom and is effectively useless for analysis.
data["DoctorInCharge"].value_counts()

DoctorInCharge
Dr_Confid    2392
Name: count, dtype: int64

Let's take a look at the BMI distribution. The values are spread fairly evenly across the BMI range.