In [3]:
import pandas as pd
import seaborn as sns

In [4]:
# Read the raw asthma dataset from the project's data folder.
asthma_data = pd.read_csv(filepath_or_buffer="../data/asthma_disease_data.csv")
# A bare last expression makes the notebook render the frame (pandas truncates long frames).
asthma_data

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,PhysicalActivity,DietQuality,SleepQuality,...,LungFunctionFEV1,LungFunctionFVC,Wheezing,ShortnessOfBreath,ChestTightness,Coughing,NighttimeSymptoms,ExerciseInduced,Diagnosis,DoctorInCharge
0,5034,63,0,1,0,15.848744,0,0.894448,5.488696,8.701003,...,1.369051,4.941206,0,0,1,0,0,1,0,Dr_Confid
1,5035,26,1,2,2,22.757042,0,5.897329,6.341014,5.153966,...,2.197767,1.702393,1,0,0,1,1,1,0,Dr_Confid
2,5036,57,0,2,1,18.395396,0,6.739367,9.196237,6.840647,...,1.698011,5.022553,1,1,1,0,1,1,0,Dr_Confid
3,5037,40,1,2,1,38.515278,0,1.404503,5.826532,4.253036,...,3.032037,2.300159,1,0,1,1,1,0,0,Dr_Confid
4,5038,61,0,0,3,19.283802,0,4.604493,3.127048,9.625799,...,3.470589,3.067944,1,1,1,0,0,1,0,Dr_Confid
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2387,7421,43,1,0,2,29.059613,0,3.019854,6.119637,8.300960,...,3.125249,5.166032,0,1,0,0,0,1,1,Dr_Confid
2388,7422,18,1,0,1,20.740850,0,5.805180,4.386992,7.731192,...,1.132977,5.509502,0,0,0,1,1,0,1,Dr_Confid
2389,7423,54,0,3,2,37.079560,0,4.735169,8.214064,7.483521,...,1.685962,3.346877,1,0,1,1,0,1,1,Dr_Confid
2390,7424,46,1,0,2,23.444712,0,9.672637,7.362861,6.717272,...,3.481549,1.713274,0,1,1,0,1,1,0,Dr_Confid


First of all, let's rename the columns to snake case.

In [6]:
# Convert the CamelCase column names to snake_case, one step per line.
asthma_data.columns = (
    asthma_data.columns
    .str.replace('([A-Z]+)', r'_\1', regex=True)  # put "_" before every run of capital letters
    .str.lower()
    .str.strip('_')  # trim underscores from both ends (one is added before a leading capital)
)
asthma_data.columns

Index(['patient_id', 'age', 'gender', 'ethnicity', 'education_level', 'bmi',
       'smoking', 'physical_activity', 'diet_quality', 'sleep_quality',
       'pollution_exposure', 'pollen_exposure', 'dust_exposure', 'pet_allergy',
       'family_history_asthma', 'history_of_allergies', 'eczema', 'hay_fever',
       'gastroesophageal_reflux', 'lung_function_fev1', 'lung_function_fvc',
       'wheezing', 'shortness_of_breath', 'chest_tightness', 'coughing',
       'nighttime_symptoms', 'exercise_induced', 'diagnosis',
       'doctor_in_charge'],
      dtype='object')

Now let's drop the columns that give us no information or that we don't care about:

In [8]:
# Remove the two columns the analysis does not use.
asthma_data = asthma_data.drop(columns=["doctor_in_charge", "patient_id"])

I will convert the columns that contain only boolean data to boolean type, and the ethnicity and gender columns to categorical type, showing the actual names rather than the numbers that represent them. I will keep the numbers for education level because their order actually matters - a higher number means a higher education level, so statistical measures like mean, std, etc. can still be applied. I tried changing them to categorical, but then the statistical data is not displayed when using the **DataFrame.describe()** function.

In [10]:
# Human-readable labels for the integer-coded categorical columns.
ethnicities = {0: "caucasian", 1: "african american", 2: "asian", 3: "other"}
genders = {0: "male", 1: "female"}

# Label the categorical columns BEFORE the boolean pass below. gender is coded
# 0/1, so it would otherwise be cast to bool first, and the integer keys in
# `genders` would then only match through the incidental False == 0 / True == 1
# equality. rename_categories is the supported way to relabel a categorical;
# Series.replace on categorical data is deprecated in recent pandas versions.
for column, labels in {"ethnicity": ethnicities, "gender": genders}.items():
    asthma_data[column] = (
        asthma_data[column].astype("category").cat.rename_categories(labels)
    )

# Every remaining column whose unique values are all 0 or 1 is a yes/no flag.
# Categorical columns are skipped explicitly so they are never re-cast.
flag_columns = [
    column
    for column in asthma_data.columns
    if not isinstance(asthma_data[column].dtype, pd.CategoricalDtype)
    and set(asthma_data[column].unique()).issubset({0, 1})
]
asthma_data = asthma_data.astype({column: bool for column in flag_columns})

In [11]:
# Print the column dtypes and non-null counts to check the result of the conversions.
asthma_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2392 entries, 0 to 2391
Data columns (total 27 columns):
 #   Column                   Non-Null Count  Dtype   
---  ------                   --------------  -----   
 0   age                      2392 non-null   int64   
 1   gender                   2392 non-null   category
 2   ethnicity                2392 non-null   category
 3   education_level          2392 non-null   int64   
 4   bmi                      2392 non-null   float64 
 5   smoking                  2392 non-null   bool    
 6   physical_activity        2392 non-null   float64 
 7   diet_quality             2392 non-null   float64 
 8   sleep_quality            2392 non-null   float64 
 9   pollution_exposure       2392 non-null   float64 
 10  pollen_exposure          2392 non-null   float64 
 11  dust_exposure            2392 non-null   float64 
 12  pet_allergy              2392 non-null   bool    
 13  family_history_asthma    2392 non-null   bool    
 14  history_

In [12]:
# Save the cleaned frame. index=True is the pandas default, spelled out here:
# the row index is written as the first (unnamed) column of the CSV.
asthma_data.to_csv("../data/clean_asthma_data.csv", index=True)