In [1]:
# Load local libraries
import __load_libs
from src.df_io import to_csv

import pandas as pd

# Problem 2. Data Exploration

## Load data

In [2]:
# Read the raw asthma dataset from disk and preview its first rows
raw_data_path = '../data/asthma_disease_data.csv'
patients_df = pd.read_csv(raw_data_path)
patients_df.head()

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,PhysicalActivity,DietQuality,SleepQuality,...,LungFunctionFEV1,LungFunctionFVC,Wheezing,ShortnessOfBreath,ChestTightness,Coughing,NighttimeSymptoms,ExerciseInduced,Diagnosis,DoctorInCharge
0,5034,63,0,1,0,15.848744,0,0.894448,5.488696,8.701003,...,1.369051,4.941206,0,0,1,0,0,1,0,Dr_Confid
1,5035,26,1,2,2,22.757042,0,5.897329,6.341014,5.153966,...,2.197767,1.702393,1,0,0,1,1,1,0,Dr_Confid
2,5036,57,0,2,1,18.395396,0,6.739367,9.196237,6.840647,...,1.698011,5.022553,1,1,1,0,1,1,0,Dr_Confid
3,5037,40,1,2,1,38.515278,0,1.404503,5.826532,4.253036,...,3.032037,2.300159,1,0,1,1,1,0,0,Dr_Confid
4,5038,61,0,0,3,19.283802,0,4.604493,3.127048,9.625799,...,3.470589,3.067944,1,1,1,0,0,1,0,Dr_Confid


## Data structure

### Rename columns

In [3]:
# Translate the dataset's CamelCase headers into snake_case identifiers
snake_case_names = {
    'PatientID': 'patient_id',
    'Age': 'age',
    'Gender': 'gender',
    'Ethnicity': 'ethnicity',
    'EducationLevel': 'education_level',
    'BMI': 'bmi',
    'Smoking': 'smoking',
    'PhysicalActivity': 'physical_activity',
    'DietQuality': 'diet_quality',
    'SleepQuality': 'sleep_quality',
    'PollutionExposure': 'pollution_exposure',
    'PollenExposure': 'pollen_exposure',
    'DustExposure': 'dust_exposure',
    'PetAllergy': 'pet_allergy',
    'FamilyHistoryAsthma': 'family_history_asthma',
    'HistoryOfAllergies': 'history_of_allergies',
    'Eczema': 'eczema',
    'HayFever': 'hay_fever',
    'GastroesophagealReflux': 'gastroesophageal_reflux',
    'LungFunctionFEV1': 'lung_function_fev1',
    'LungFunctionFVC': 'lung_function_fvc',
    'Wheezing': 'wheezing',
    'ShortnessOfBreath': 'shortness_of_breath',
    'ChestTightness': 'chest_tightness',
    'Coughing': 'coughing',
    'NighttimeSymptoms': 'nighttime_symptoms',
    'ExerciseInduced': 'exercise_induced',
    'Diagnosis': 'diagnosis',
    'DoctorInCharge': 'doctor_in_charge',
}
patients_df = patients_df.rename(columns=snake_case_names)
patients_df.head()

Unnamed: 0,patient_id,age,gender,ethnicity,education_level,bmi,smoking,physical_activity,diet_quality,sleep_quality,...,lung_function_fev1,lung_function_fvc,wheezing,shortness_of_breath,chest_tightness,coughing,nighttime_symptoms,exercise_induced,diagnosis,doctor_in_charge
0,5034,63,0,1,0,15.848744,0,0.894448,5.488696,8.701003,...,1.369051,4.941206,0,0,1,0,0,1,0,Dr_Confid
1,5035,26,1,2,2,22.757042,0,5.897329,6.341014,5.153966,...,2.197767,1.702393,1,0,0,1,1,1,0,Dr_Confid
2,5036,57,0,2,1,18.395396,0,6.739367,9.196237,6.840647,...,1.698011,5.022553,1,1,1,0,1,1,0,Dr_Confid
3,5037,40,1,2,1,38.515278,0,1.404503,5.826532,4.253036,...,3.032037,2.300159,1,0,1,1,1,0,0,Dr_Confid
4,5038,61,0,0,3,19.283802,0,4.604493,3.127048,9.625799,...,3.470589,3.067944,1,1,1,0,0,1,0,Dr_Confid


Yep, much better! :)

### Column types

In [4]:
# Inspect the dtype pandas inferred for every column when reading the CSV
patients_df.dtypes

patient_id                   int64
age                          int64
gender                       int64
ethnicity                    int64
education_level              int64
bmi                        float64
smoking                      int64
physical_activity          float64
diet_quality               float64
sleep_quality              float64
pollution_exposure         float64
pollen_exposure            float64
dust_exposure              float64
pet_allergy                  int64
family_history_asthma        int64
history_of_allergies         int64
eczema                       int64
hay_fever                    int64
gastroesophageal_reflux      int64
lung_function_fev1         float64
lung_function_fvc          float64
wheezing                     int64
shortness_of_breath          int64
chest_tightness              int64
coughing                     int64
nighttime_symptoms           int64
exercise_induced             int64
diagnosis                    int64
doctor_in_charge    

* **OBSERVATION**: Column types look OK at first glance: every column is **numeric** except `doctor_in_charge`, which holds text.

In [5]:
# Count how often each distinct value occurs in the `doctor_in_charge` column
patients_df['doctor_in_charge'].value_counts()

doctor_in_charge
Dr_Confid    2392
Name: count, dtype: int64

### Float columns
Let's check if all the `float` columns are indeed floating-point columns.

In [6]:
# Restrict the preview to the columns pandas parsed as float64
float_column_names = patients_df.select_dtypes(include=['float64']).columns
patients_df[float_column_names].head()

Unnamed: 0,bmi,physical_activity,diet_quality,sleep_quality,pollution_exposure,pollen_exposure,dust_exposure,lung_function_fev1,lung_function_fvc
0,15.848744,0.894448,5.488696,8.701003,7.388481,2.855578,0.974339,1.369051,4.941206
1,22.757042,5.897329,6.341014,5.153966,1.969838,7.457665,6.584631,2.197767,1.702393
2,18.395396,6.739367,9.196237,6.840647,1.460593,1.448189,5.445799,1.698011,5.022553
3,38.515278,1.404503,5.826532,4.253036,0.581905,7.571845,3.965316,3.032037,2.300159
4,19.283802,4.604493,3.127048,9.625799,0.980875,3.049807,8.260605,3.470589,3.067944


* **OBSERVATION**: Yep, looks fine.

* **OBSERVATION**: No NA values. That's great!

### Describe numeric values

In [7]:
# Summary statistics, transposed so that each row describes one column
patients_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
patient_id,2392.0,6229.5,690.655244,5034.0,5631.75,6229.5,6827.25,7425.0
age,2392.0,42.13796,21.606655,5.0,23.0,42.0,61.0,79.0
gender,2392.0,0.493311,0.50006,0.0,0.0,0.0,1.0,1.0
ethnicity,2392.0,0.669732,0.98612,0.0,0.0,0.0,1.0,3.0
education_level,2392.0,1.307274,0.898242,0.0,1.0,1.0,2.0,3.0
bmi,2392.0,27.244877,7.201628,15.031803,20.968313,27.052202,33.555903,39.985611
smoking,2392.0,0.141722,0.348838,0.0,0.0,0.0,0.0,1.0
physical_activity,2392.0,5.051786,2.903574,0.00174,2.578333,5.016881,7.540234,9.995809
diet_quality,2392.0,5.022867,2.90998,0.003031,2.432043,5.115383,7.544216,9.999904
sleep_quality,2392.0,7.019012,1.732475,4.001437,5.4985,6.975839,8.52695,9.996235


### Categorical data

We have a lot of boolean variables (e.g. `smoking`, `pet_allergy`, etc.) that need to be treated as categorical data.

Furthermore, we have `ethnicity` and `education_level` which are also potential categorical variables.

In [8]:
# List the distinct codes of each candidate multi-value categorical column.
# The label says "Unique values" because the sorted values themselves are
# printed, not how many of them there are.
for column_name in ('ethnicity', 'education_level'):
    unique_values = sorted(patients_df[column_name].unique().tolist())
    print(f"Unique values for `{column_name}` column: {unique_values}")

Number of unique values for `ethnicity` column: [0, 1, 2, 3]
Number of unique values for `education_level` column: [0, 1, 2, 3]


* **OBSERVATION**: We have 2 multi-value categorical variables (`ethnicity`, `education_level`) and 15 binary variables (`smoking`, `pet_allergy`, etc).

In [9]:
# Multi-value categorical variables
patients_df.ethnicity = patients_df.ethnicity.astype("category")
patients_df.education_level = patients_df.education_level.astype("category")
# Binary categorical variables
patients_df.gender = patients_df.gender.astype("category")
patients_df.smoking = patients_df.smoking.astype("category")
patients_df.physical_activity = patients_df.physical_activity.astype("category")
patients_df.diet_quality = patients_df.diet_quality.astype("category")
patients_df.sleep_quality = patients_df.sleep_quality.astype("category")
patients_df.pollution_exposure = patients_df.pollution_exposure.astype("category")
patients_df.pollen_exposure = patients_df.pollen_exposure.astype("category")
patients_df.dust_exposure = patients_df.dust_exposure.astype("category")
patients_df.pet_allergy = patients_df.pet_allergy.astype("category")
patients_df.family_history_asthma = patients_df.family_history_asthma.astype("category")
patients_df.history_of_allergies = patients_df.history_of_allergies.astype("category")
patients_df.eczema = patients_df.eczema.astype("category")
patients_df.hay_fever = patients_df.hay_fever.astype("category")
patients_df.gastroesophageal_reflux = patients_df.gastroesophageal_reflux.astype("category")
patients_df.wheezing = patients_df.wheezing.astype("category")
patients_df.shortness_of_breath = patients_df.shortness_of_breath.astype("category")
patients_df.chest_tightness = patients_df.chest_tightness.astype("category")
patients_df.coughing = patients_df.coughing.astype("category")
patients_df.nighttime_symptoms = patients_df.nighttime_symptoms.astype("category")
patients_df.exercise_induced = patients_df.exercise_induced.astype("category")
patients_df.diagnosis = patients_df.diagnosis.astype("category")

Let's print the final data structure (variable types).

In [10]:
# Re-inspect the column dtypes after the categorical conversion
patients_df.dtypes

patient_id                    int64
age                           int64
gender                     category
ethnicity                  category
education_level            category
bmi                         float64
smoking                    category
physical_activity          category
diet_quality               category
sleep_quality              category
pollution_exposure         category
pollen_exposure            category
dust_exposure              category
pet_allergy                category
family_history_asthma      category
history_of_allergies       category
eczema                     category
hay_fever                  category
gastroesophageal_reflux    category
lung_function_fev1          float64
lung_function_fvc           float64
wheezing                   category
shortness_of_breath        category
chest_tightness            category
coughing                   category
nighttime_symptoms         category
exercise_induced           category
diagnosis                  c

In [11]:
# Summary statistics again; by default describe() covers only numeric columns,
# so columns with the category dtype are left out of this table
patients_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
patient_id,2392.0,6229.5,690.655244,5034.0,5631.75,6229.5,6827.25,7425.0
age,2392.0,42.13796,21.606655,5.0,23.0,42.0,61.0,79.0
bmi,2392.0,27.244877,7.201628,15.031803,20.968313,27.052202,33.555903,39.985611
lung_function_fev1,2392.0,2.548564,0.861809,1.000459,1.824113,2.553244,3.292897,3.999719
lung_function_fvc,2392.0,3.74127,1.303689,1.500045,2.607489,3.734982,4.864121,5.999421


## Persist output

Let's persist the current state of the dataframe to a CSV file on disk.

In [12]:
# Write the explored dataframe to disk via the project-local `to_csv` helper.
# NOTE(review): CSV is plain text and carries no dtype metadata, so the
# `category` dtypes set above presumably have to be re-applied when this file
# is loaded again; confirm how `src.df_io.to_csv` and its readers handle this.
to_csv(patients_df, '../data/02_data_explored.csv')