In [1]:
# Load local libraries
import __load_libs
from src.df_io import from_csv, to_csv

# Problem 3. Data cleaning and preprocessing


In [2]:
# Load the dataset produced by the exploration step and preview its first rows.
patients_df = from_csv('../data/02_data_explored.csv')
patients_df.head(n=5)

Unnamed: 0,patient_id,age,gender,ethnicity,education_level,bmi,smoking,physical_activity,diet_quality,sleep_quality,...,lung_function_fev1,lung_function_fvc,wheezing,shortness_of_breath,chest_tightness,coughing,nighttime_symptoms,exercise_induced,diagnosis,doctor_in_charge
0,5034,63,0,1,0,15.848744,0,0.8944483090233335,5.488695584993768,8.701002733591553,...,1.369051,4.941206,0,0,1,0,0,1,0,Dr_Confid
1,5035,26,1,2,2,22.757042,0,5.897329493528446,6.341014020966575,5.153966369546168,...,2.197767,1.702393,1,0,0,1,1,1,0,Dr_Confid
2,5036,57,0,2,1,18.395396,0,6.739367010951074,9.196237204622909,6.840646602827763,...,1.698011,5.022553,1,1,1,0,1,1,0,Dr_Confid
3,5037,40,1,2,1,38.515278,0,1.4045026784207648,5.826531797560653,4.25303551230768,...,3.032037,2.300159,1,0,1,1,1,0,0,Dr_Confid
4,5038,61,0,0,3,19.283802,0,4.6044926148128855,3.127048193671432,9.625799205296698,...,3.470589,3.067944,1,1,1,0,0,1,0,Dr_Confid


## NA values

Let's check for NA values.

In [3]:
# Per-column count of missing entries (summing the boolean mask down the rows).
patients_df.isna().sum(axis=0)

patient_id                 0
age                        0
gender                     0
ethnicity                  0
education_level            0
bmi                        0
smoking                    0
physical_activity          0
diet_quality               0
sleep_quality              0
pollution_exposure         0
pollen_exposure            0
dust_exposure              0
pet_allergy                0
family_history_asthma      0
history_of_allergies       0
eczema                     0
hay_fever                  0
gastroesophageal_reflux    0
lung_function_fev1         0
lung_function_fvc          0
wheezing                   0
shortness_of_breath        0
chest_tightness            0
coughing                   0
nighttime_symptoms         0
exercise_induced           0
diagnosis                  0
doctor_in_charge           0
dtype: int64

* **OBSERVATION**: No NA values observed in the dataset. That's great!

## Duplicated values
Let's check for duplicated rows.

In [4]:
# `patient_id` is a per-row identifier, so a whole-row comparison that includes
# it can never flag two records as duplicates. Compare the remaining columns
# only. errors='ignore' keeps the cell re-runnable even when the identifier
# column is no longer present in the frame.
patients_df.drop(columns=['patient_id'], errors='ignore').duplicated().sum()

np.int64(0)

* **OBSERVATION**: No duplicates. That's good.

## Drop `doctor_in_charge`

* **OBSERVATION**: Column `doctor_in_charge` does not appear to add any value. It has a **single** value. Furthermore, that value is `Dr_Confid` (i.e. anonymized).

Let's drop it.

In [5]:
# Remove the constant `doctor_in_charge` column.
# errors='ignore' makes the cell idempotent: because `patients_df` is
# reassigned from itself, re-running the cell after the column is already
# gone would otherwise raise KeyError.
patients_df = patients_df.drop(columns=['doctor_in_charge'], errors='ignore')
patients_df.head()

Unnamed: 0,patient_id,age,gender,ethnicity,education_level,bmi,smoking,physical_activity,diet_quality,sleep_quality,...,gastroesophageal_reflux,lung_function_fev1,lung_function_fvc,wheezing,shortness_of_breath,chest_tightness,coughing,nighttime_symptoms,exercise_induced,diagnosis
0,5034,63,0,1,0,15.848744,0,0.8944483090233335,5.488695584993768,8.701002733591553,...,0,1.369051,4.941206,0,0,1,0,0,1,0
1,5035,26,1,2,2,22.757042,0,5.897329493528446,6.341014020966575,5.153966369546168,...,0,2.197767,1.702393,1,0,0,1,1,1,0
2,5036,57,0,2,1,18.395396,0,6.739367010951074,9.196237204622909,6.840646602827763,...,0,1.698011,5.022553,1,1,1,0,1,1,0
3,5037,40,1,2,1,38.515278,0,1.4045026784207648,5.826531797560653,4.25303551230768,...,0,3.032037,2.300159,1,0,1,1,1,0,0
4,5038,61,0,0,3,19.283802,0,4.6044926148128855,3.127048193671432,9.625799205296698,...,0,3.470589,3.067944,1,1,1,0,0,1,0


## Drop `patient_id`

In [6]:
# Number of distinct patient identifiers in the frame.
patients_df['patient_id'].nunique()

2392

* **OBSERVATION**: `patient_id` identifier covers all values between $5034$ and $7425$. Hmm, what about values below $5034$? Or above $7425$?
* **ACTION**: I tried to find related data (e.g. more patients). Unfortunately, I wasn't able to find any.

So, I see no point in keeping this identifier any longer. It will not be needed for training a prediction model.

In [7]:
# Remove the `patient_id` identifier; it is not used as a model feature.
# errors='ignore' makes the cell idempotent: because `patients_df` is
# reassigned from itself, re-running the cell after the column is already
# gone would otherwise raise KeyError.
patients_df = patients_df.drop(columns=['patient_id'], errors='ignore')
patients_df.head()

Unnamed: 0,age,gender,ethnicity,education_level,bmi,smoking,physical_activity,diet_quality,sleep_quality,pollution_exposure,...,gastroesophageal_reflux,lung_function_fev1,lung_function_fvc,wheezing,shortness_of_breath,chest_tightness,coughing,nighttime_symptoms,exercise_induced,diagnosis
0,63,0,1,0,15.848744,0,0.8944483090233335,5.488695584993768,8.701002733591553,7.388480566727442,...,0,1.369051,4.941206,0,0,1,0,0,1,0
1,26,1,2,2,22.757042,0,5.897329493528446,6.341014020966575,5.153966369546168,1.9698383357954967,...,0,2.197767,1.702393,1,0,0,1,1,1,0
2,57,0,2,1,18.395396,0,6.739367010951074,9.196237204622909,6.840646602827763,1.4605929608570043,...,0,1.698011,5.022553,1,1,1,0,1,1,0
3,40,1,2,1,38.515278,0,1.4045026784207648,5.826531797560653,4.25303551230768,0.5819053321460788,...,0,3.032037,2.300159,1,0,1,1,1,0,0
4,61,0,0,3,19.283802,0,4.6044926148128855,3.127048193671432,9.625799205296698,0.9808745687552378,...,0,3.470589,3.067944,1,1,1,0,0,1,0


In [8]:
# Sanity check after the two drops: the 29 loaded columns minus
# `doctor_in_charge` and `patient_id` should leave 27, with the row count unchanged.
patients_df.shape

(2392, 27)

In [9]:
# Print the per-column dtype and non-null summary of the cleaned frame.
# NOTE(review): columns such as physical_activity, diet_quality and
# sleep_quality are reported as `category`, although the previews show
# continuous-looking values for them — confirm this typing is intended.
patients_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2392 entries, 0 to 2391
Data columns (total 27 columns):
 #   Column                   Non-Null Count  Dtype   
---  ------                   --------------  -----   
 0   age                      2392 non-null   int64   
 1   gender                   2392 non-null   category
 2   ethnicity                2392 non-null   category
 3   education_level          2392 non-null   category
 4   bmi                      2392 non-null   float64 
 5   smoking                  2392 non-null   category
 6   physical_activity        2392 non-null   category
 7   diet_quality             2392 non-null   category
 8   sleep_quality            2392 non-null   category
 9   pollution_exposure       2392 non-null   category
 10  pollen_exposure          2392 non-null   category
 11  dust_exposure            2392 non-null   category
 12  pet_allergy              2392 non-null   category
 13  family_history_asthma    2392 non-null   category
 14  history_

## Persist output

We should persist the current state of the dataframe to a file on disk.

In [10]:
# Write the cleaned frame out via the project's `to_csv` helper.
# NOTE(review): plain CSV does not carry pandas dtypes such as `category`;
# presumably `from_csv` restores them on load — confirm against src.df_io.
to_csv(patients_df, '../data/03_data_cleaned.csv')