In [2]:
%matplotlib inline

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Data Cleaning and Preprocessing

In [4]:
# Read the source CSV into a DataFrame and show it as the cell output.
source_csv = "../data/asthma_data_renamed_columns.csv"
asthma_data = pd.read_csv(source_csv)
asthma_data

Unnamed: 0,patientid,age,gender,ethnicity,education_level,bmi,smoking,physical_activity,diet_quality,sleep_quality,...,lung_functionfev1,lung_functionfvc,wheezing,shortness_of_breath,chest_tightness,coughing,nighttime_symptoms,exercise_induced,diagnosis,doctor_in_charge
0,5034,63,0,1,0,15.848744,0,0.894448,5.488696,8.701003,...,1.369051,4.941206,0,0,1,0,0,1,0,Dr_Confid
1,5035,26,1,2,2,22.757042,0,5.897329,6.341014,5.153966,...,2.197767,1.702393,1,0,0,1,1,1,0,Dr_Confid
2,5036,57,0,2,1,18.395396,0,6.739367,9.196237,6.840647,...,1.698011,5.022553,1,1,1,0,1,1,0,Dr_Confid
3,5037,40,1,2,1,38.515278,0,1.404503,5.826532,4.253036,...,3.032037,2.300159,1,0,1,1,1,0,0,Dr_Confid
4,5038,61,0,0,3,19.283802,0,4.604493,3.127048,9.625799,...,3.470589,3.067944,1,1,1,0,0,1,0,Dr_Confid
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2387,7421,43,1,0,2,29.059613,0,3.019854,6.119637,8.300960,...,3.125249,5.166032,0,1,0,0,0,1,1,Dr_Confid
2388,7422,18,1,0,1,20.740850,0,5.805180,4.386992,7.731192,...,1.132977,5.509502,0,0,0,1,1,0,1,Dr_Confid
2389,7423,54,0,3,2,37.079560,0,4.735169,8.214064,7.483521,...,1.685962,3.346877,1,0,1,1,0,1,1,Dr_Confid
2390,7424,46,1,0,2,23.444712,0,9.672637,7.362861,6.717272,...,3.481549,1.713274,0,1,1,0,1,1,0,Dr_Confid


In [5]:
# Show the dtype pandas assigned to each column of the frame.
asthma_data.dtypes

patientid                    int64
age                          int64
gender                       int64
ethnicity                    int64
education_level              int64
bmi                        float64
smoking                      int64
physical_activity          float64
diet_quality               float64
sleep_quality              float64
pollution_exposure         float64
pollen_exposure            float64
dust_exposure              float64
pet_allergy                  int64
family_history_asthma        int64
history_of_allergies         int64
eczema                       int64
hay_fever                    int64
gastroesophageal_reflux      int64
lung_functionfev1          float64
lung_functionfvc           float64
wheezing                     int64
shortness_of_breath          int64
chest_tightness              int64
coughing                     int64
nighttime_symptoms           int64
exercise_induced             int64
diagnosis                    int64
doctor_in_charge    

In [6]:
# Check for missing data.
#
# Normalise the textual placeholders 'NA' and 'N/A' to pd.NA so that they are
# counted as missing. The frame is rebound to the result of replace() instead
# of being mutated with inplace=True: the outcome is the same, but the step is
# an explicit assignment and no object is modified behind the reader's back.
# Note: pandas.read_csv with default settings already parses these two tokens
# as NaN, so this replacement acts as a safeguard.
asthma_data = asthma_data.replace(['NA', 'N/A'], pd.NA)

# Per-column number of missing entries and of present entries.
missing_values = asthma_data.isna().sum()
valid_values = asthma_data.notna().sum()

# Display both Series together as the cell output.
missing_values, valid_values

(patientid                  0
 age                        0
 gender                     0
 ethnicity                  0
 education_level            0
 bmi                        0
 smoking                    0
 physical_activity          0
 diet_quality               0
 sleep_quality              0
 pollution_exposure         0
 pollen_exposure            0
 dust_exposure              0
 pet_allergy                0
 family_history_asthma      0
 history_of_allergies       0
 eczema                     0
 hay_fever                  0
 gastroesophageal_reflux    0
 lung_functionfev1          0
 lung_functionfvc           0
 wheezing                   0
 shortness_of_breath        0
 chest_tightness            0
 coughing                   0
 nighttime_symptoms         0
 exercise_induced           0
 diagnosis                  0
 doctor_in_charge           0
 dtype: int64,
 patientid                  2392
 age                        2392
 gender                     2392
 ethnicity      

There are no missing values.

## Doctor in charge

In [7]:
# Tally how often each distinct value occurs in the doctor_in_charge column.
asthma_data["doctor_in_charge"].value_counts()

doctor_in_charge
Dr_Confid    2392
Name: count, dtype: int64

Every record has the same placeholder value, `Dr_Confid`, so the data contain no actual information about the doctor in charge. Neither this column nor the patient ID column carries information that is useful for the data analysis, so both can be dropped.

In [10]:
# Remove the two columns that are not used in the analysis.
#
# errors="ignore" makes this cell safe to re-run: asthma_data is rebound to the
# reduced frame, so on a second execution the columns are already gone and a
# plain drop() would raise KeyError. With errors="ignore" only the labels that
# are still present are dropped.
columns_to_drop = ["patientid", "doctor_in_charge"]
asthma_data = asthma_data.drop(columns=columns_to_drop, errors="ignore")

# Show the reduced frame as the cell output.
asthma_data

Unnamed: 0,age,gender,ethnicity,education_level,bmi,smoking,physical_activity,diet_quality,sleep_quality,pollution_exposure,...,gastroesophageal_reflux,lung_functionfev1,lung_functionfvc,wheezing,shortness_of_breath,chest_tightness,coughing,nighttime_symptoms,exercise_induced,diagnosis
0,63,0,1,0,15.848744,0,0.894448,5.488696,8.701003,7.388481,...,0,1.369051,4.941206,0,0,1,0,0,1,0
1,26,1,2,2,22.757042,0,5.897329,6.341014,5.153966,1.969838,...,0,2.197767,1.702393,1,0,0,1,1,1,0
2,57,0,2,1,18.395396,0,6.739367,9.196237,6.840647,1.460593,...,0,1.698011,5.022553,1,1,1,0,1,1,0
3,40,1,2,1,38.515278,0,1.404503,5.826532,4.253036,0.581905,...,0,3.032037,2.300159,1,0,1,1,1,0,0
4,61,0,0,3,19.283802,0,4.604493,3.127048,9.625799,0.980875,...,0,3.470589,3.067944,1,1,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2387,43,1,0,2,29.059613,0,3.019854,6.119637,8.300960,2.483829,...,0,3.125249,5.166032,0,1,0,0,0,1,1
2388,18,1,0,1,20.740850,0,5.805180,4.386992,7.731192,7.733983,...,0,1.132977,5.509502,0,0,0,1,1,0,1
2389,54,0,3,2,37.079560,0,4.735169,8.214064,7.483521,2.794847,...,0,1.685962,3.346877,1,0,1,1,0,1,1
2390,46,1,0,2,23.444712,0,9.672637,7.362861,6.717272,9.448862,...,1,3.481549,1.713274,0,1,1,0,1,1,0


In [11]:
# Write the current frame to CSV, leaving out the row index.
reduced_csv = '../data/asthma_data_reduced.csv'
asthma_data.to_csv(reduced_csv, index=False)