# COVID-19 - Clinical Data to assess diagnosis

# Lesson 01 - Workflow and Data Adjustments

starting all the previous tasks did before: collecting the data, preprocessing it.

In [1]:
import pandas as pd
import numpy as np

In [2]:
url = 'https://github.com/alura-cursos/covid-19-clinical/blob/main/Kaggle_Sirio_Libanes_ICU_Prediction.xlsx?raw=true'
patients = pd.read_excel(url)

patients.head()

Unnamed: 0,PATIENT_VISIT_IDENTIFIER,AGE_ABOVE65,AGE_PERCENTIL,GENDER,DISEASE GROUPING 1,DISEASE GROUPING 2,DISEASE GROUPING 3,DISEASE GROUPING 4,DISEASE GROUPING 5,DISEASE GROUPING 6,...,TEMPERATURE_DIFF,OXYGEN_SATURATION_DIFF,BLOODPRESSURE_DIASTOLIC_DIFF_REL,BLOODPRESSURE_SISTOLIC_DIFF_REL,HEART_RATE_DIFF_REL,RESPIRATORY_RATE_DIFF_REL,TEMPERATURE_DIFF_REL,OXYGEN_SATURATION_DIFF_REL,WINDOW,ICU
0,0,1,60th,0,0.0,0.0,0.0,0.0,1.0,1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0-2,0
1,0,1,60th,0,0.0,0.0,0.0,0.0,1.0,1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,2-4,0
2,0,1,60th,0,0.0,0.0,0.0,0.0,1.0,1.0,...,,,,,,,,,4-6,0
3,0,1,60th,0,0.0,0.0,0.0,0.0,1.0,1.0,...,-1.0,-1.0,,,,,-1.0,-1.0,6-12,0
4,0,1,60th,0,0.0,0.0,0.0,0.0,1.0,1.0,...,-0.238095,-0.818182,-0.389967,0.407558,-0.230462,0.096774,-0.242282,-0.814433,ABOVE_12,1


In [3]:
def fill_df_continuous_vars(df: pd.DataFrame):
    
    continuous_features_cols = df.iloc[:, 13:-2].columns

    # it has a mistake: when I group and do fillna, I need to group it again to apply fillna again
    # in the way made below some data from different patients are leaking in the ffill, because it's
    # not grouped anymore.
    filled_continuous_features = (df.groupby('PATIENT_VISIT_IDENTIFIER')
                                  [continuous_features_cols]
                                  .fillna(method='bfill')
                                  .fillna(method='ffill')
                                  .reset_index())
    
    categorical_features = df.iloc[:, :13]
    
    output = df.iloc[:, -2:]
    
    return pd.concat([categorical_features, filled_continuous_features, output], axis=1)

In [4]:
patients_filled = fill_df_continuous_vars(patients)

patients_cannot_be_used = (patients_filled
                               .query("WINDOW == '0-2' & ICU == 1")
                               ['PATIENT_VISIT_IDENTIFIER']
                               .values)

patients_cleaned = patients_filled.query('PATIENT_VISIT_IDENTIFIER not in @patients_cannot_be_used')

patients_cleaned = patients_cleaned.dropna()

patients_cleaned.describe()

Unnamed: 0,PATIENT_VISIT_IDENTIFIER,AGE_ABOVE65,GENDER,DISEASE GROUPING 1,DISEASE GROUPING 2,DISEASE GROUPING 3,DISEASE GROUPING 4,DISEASE GROUPING 5,DISEASE GROUPING 6,HTN,...,RESPIRATORY_RATE_DIFF,TEMPERATURE_DIFF,OXYGEN_SATURATION_DIFF,BLOODPRESSURE_DIASTOLIC_DIFF_REL,BLOODPRESSURE_SISTOLIC_DIFF_REL,HEART_RATE_DIFF_REL,RESPIRATORY_RATE_DIFF_REL,TEMPERATURE_DIFF_REL,OXYGEN_SATURATION_DIFF_REL,ICU
count,1760.0,1760.0,1760.0,1760.0,1760.0,1760.0,1760.0,1760.0,1760.0,1760.0,...,1760.0,1760.0,1760.0,1760.0,1760.0,1760.0,1760.0,1760.0,1760.0,1760.0
mean,192.818182,0.457386,0.380682,0.1125,0.026136,0.101705,0.021591,0.132955,0.049432,0.2125,...,-0.778209,-0.823458,-0.915593,-0.828593,-0.774857,-0.85469,-0.788155,-0.824412,-0.915552,0.201705
std,110.637724,0.498322,0.485692,0.31607,0.159586,0.302345,0.145385,0.339622,0.216829,0.409193,...,0.411909,0.276279,0.247695,0.287696,0.368311,0.240338,0.384405,0.274953,0.248093,0.401387
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0
25%,97.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0
50%,191.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0
75%,289.25,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.764706,-0.738095,-0.919192,-0.726087,-0.651942,-0.771625,-0.756272,-0.73664,-0.920103,0.0
max,384.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,0.333333,1.0,1.0,1.0,1.0,1.0,0.32966,1.0,1.0


In [5]:
def window_prepare(rows):
    # check if that patient went to UCI, if yes
    if(np.any(rows['ICU'])):
        # attribute 1 for ICU in the row which the window is 0-2
        rows.loc[rows.WINDOW == '0-2', 'ICU'] = 1
        
    return rows.loc[rows.WINDOW == '0-2']

In [6]:
patients_cleaned = \
    (patients_cleaned
        .groupby('PATIENT_VISIT_IDENTIFIER')
        .apply(window_prepare))

patients_cleaned.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,PATIENT_VISIT_IDENTIFIER,AGE_ABOVE65,AGE_PERCENTIL,GENDER,DISEASE GROUPING 1,DISEASE GROUPING 2,DISEASE GROUPING 3,DISEASE GROUPING 4,DISEASE GROUPING 5,DISEASE GROUPING 6,...,TEMPERATURE_DIFF,OXYGEN_SATURATION_DIFF,BLOODPRESSURE_DIASTOLIC_DIFF_REL,BLOODPRESSURE_SISTOLIC_DIFF_REL,HEART_RATE_DIFF_REL,RESPIRATORY_RATE_DIFF_REL,TEMPERATURE_DIFF_REL,OXYGEN_SATURATION_DIFF_REL,WINDOW,ICU
PATIENT_VISIT_IDENTIFIER,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,0,0,1,60th,0,0.0,0.0,0.0,0.0,1.0,1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0-2,1
2,10,2,0,10th,0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.0,-0.959596,-0.515528,-0.351328,-0.747001,-0.756272,-1.0,-0.961262,0-2,1
3,15,3,0,40th,1,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0-2,0
4,20,4,0,10th,0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.952381,-0.979798,-1.0,-0.883669,-0.956805,-0.870968,-0.953536,-0.980333,0-2,0
5,25,5,0,10th,0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.97619,-0.979798,-0.86087,-0.71446,-0.986481,-1.0,-0.975891,-0.980129,0-2,0


In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

np.random.seed(73246)

In [8]:
x_columns = patients_cleaned.describe().columns

y = patients_cleaned['ICU']
x = patients_cleaned[x_columns].drop('ICU', axis=1)

In [9]:
x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y)

In [10]:
dummy_model = DummyClassifier()
dummy_model.fit(x_train, y_train)

y_prediction_d = dummy_model.predict(x_test)
accuracy_score(y_test, y_prediction_d)



0.48863636363636365

In [11]:
LR_model = LogisticRegression(max_iter=10000)
LR_model.fit(x_train, y_train)

y_prediction = LR_model.predict(x_test)
accuracy_score(y_test, y_prediction)

0.7045454545454546

### A fail in module 5 was the use pf `.describe().columns` to filter the columns for the `x` variable. It may have disregarded some columns.

Below I check which columns were disregarded.

In [12]:
for column in patients.columns:    
    if column not in x_columns:
        print(column)

AGE_PERCENTIL
WINDOW


In [13]:
patients[['AGE_PERCENTIL', 'WINDOW']].head()

Unnamed: 0,AGE_PERCENTIL,WINDOW
0,60th,0-2
1,60th,2-4
2,60th,4-6
3,60th,6-12
4,60th,ABOVE_12


- `AGE_PERCENTIL` can be an important variable for the model training.
- On the other hand `WINDOW` is not important because we selected a specific window.

### Converting `AGE_PERCENTIL` to float or int

Logistic Regression model will not work if this variable is a string.

In [14]:
patients_cleaned.AGE_PERCENTIL

PATIENT_VISIT_IDENTIFIER      
0                         0             60th
2                         10            10th
3                         15            40th
4                         20            10th
5                         25            10th
                                     ...    
380                       1900          40th
381                       1905    Above 90th
382                       1910          50th
383                       1915          40th
384                       1920          50th
Name: AGE_PERCENTIL, Length: 352, dtype: object

In [15]:
patients_cleaned.AGE_PERCENTIL.astype('category').cat.codes

PATIENT_VISIT_IDENTIFIER      
0                         0       5
2                         10      0
3                         15      3
4                         20      0
5                         25      0
                                 ..
380                       1900    3
381                       1905    9
382                       1910    4
383                       1915    3
384                       1920    4
Length: 352, dtype: int8

In [16]:
patients_cleaned['AGE_PERCENTIL'] = \
    patients_cleaned.AGE_PERCENTIL.astype('category').cat.codes

In [17]:
patients_cleaned.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,PATIENT_VISIT_IDENTIFIER,AGE_ABOVE65,AGE_PERCENTIL,GENDER,DISEASE GROUPING 1,DISEASE GROUPING 2,DISEASE GROUPING 3,DISEASE GROUPING 4,DISEASE GROUPING 5,DISEASE GROUPING 6,...,TEMPERATURE_DIFF,OXYGEN_SATURATION_DIFF,BLOODPRESSURE_DIASTOLIC_DIFF_REL,BLOODPRESSURE_SISTOLIC_DIFF_REL,HEART_RATE_DIFF_REL,RESPIRATORY_RATE_DIFF_REL,TEMPERATURE_DIFF_REL,OXYGEN_SATURATION_DIFF_REL,WINDOW,ICU
PATIENT_VISIT_IDENTIFIER,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,0,0,1,5,0,0.0,0.0,0.0,0.0,1.0,1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0-2,1
2,10,2,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.0,-0.959596,-0.515528,-0.351328,-0.747001,-0.756272,-1.0,-0.961262,0-2,1
3,15,3,0,3,1,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0-2,0
4,20,4,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.952381,-0.979798,-1.0,-0.883669,-0.956805,-0.870968,-0.953536,-0.980333,0-2,0
5,25,5,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.97619,-0.979798,-0.86087,-0.71446,-0.986481,-1.0,-0.975891,-0.980129,0-2,0


### Redefining `x_columns`

In [18]:
x_columns = patients_cleaned.columns

y = patients_cleaned['ICU']
x = patients_cleaned[x_columns].drop(['ICU', 'WINDOW'], axis=1)

x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y)

### Training models again

In [19]:
dummy_model = DummyClassifier()
dummy_model.fit(x_train, y_train)

y_prediction_d = dummy_model.predict(x_test)
accuracy_score(y_test, y_prediction_d)



0.5795454545454546

In [20]:
LR_model = LogisticRegression(max_iter=10000)
LR_model.fit(x_train, y_train)

y_prediction = LR_model.predict(x_test)
accuracy_score(y_test, y_prediction)

0.6590909090909091

### Challenge 01: Convert `AGE_PERCENTIL` column into categorical data, use a different way than we already did to do that.

see scikit learn library method that can do that.

In [24]:
patients[['AGE_PERCENTIL']]

Unnamed: 0,AGE_PERCENTIL
0,60th
1,60th
2,60th
3,60th
4,60th
...,...
1920,50th
1921,50th
1922,50th
1923,50th


In [26]:
from sklearn.preprocessing import OrdinalEncoder


enc = OrdinalEncoder()
patients['AGE_PERCENTIL_ENCONDED'] = enc.fit_transform(patients[['AGE_PERCENTIL']])

patients['AGE_PERCENTIL_ENCONDED']

0       5.0
1       5.0
2       5.0
3       5.0
4       5.0
       ... 
1920    4.0
1921    4.0
1922    4.0
1923    4.0
1924    4.0
Name: AGE_PERCENTIL_ENCONDED, Length: 1925, dtype: float64

In [27]:
patients['AGE_PERCENTIL_ENCONDED'].unique()

array([5., 8., 0., 3., 6., 1., 4., 7., 2., 9.])