In [None]:
import pandas as pd
import numpy as np

## Load and Combine Data

In [None]:
# Read the two pre-processed input tables (demographics first, then raw labs).
demographics, labs = (
    pd.read_csv(csv_path)
    for csv_path in ('./processed_data/demo.csv', './processed_data/lab_raw.csv')
)

In [None]:
# Preview the first five rows of the demographics table.
demographics.head(n=5)

In [None]:
# Preview the first five rows of the labs table.
labs.head(n=5)

In [None]:
# Left join on MRN: every demographics row is kept, and the lab columns are
# NaN wherever no lab row shares that MRN.
data = pd.merge(demographics, labs, how='left', on='MRN')
data.head(n=5)

In [None]:
# Count distinct MRNs rather than rows: the left merge above yields one row
# per matching lab row, so len(data) would overstate the cohort size if any
# MRN appears more than once in either input table.
n_patients = data['MRN'].nunique()
print(f'There is a total of {n_patients} patients in this cohort.')

## Check Missingness

In [None]:
# Percentage of missing values per column.
# Observation recorded by the author: no column exceeds 80% missing, so all
# columns are kept.
data.isna().sum().mul(100).div(len(data))

## Imputing Missing Values

In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Move the MRN identifier out of the columns and into the index so that it is
# carried along as a row label instead of being treated as a feature.
data = data.set_index(keys='MRN', drop=True)
data.head(n=5)

In [None]:
# Fit a MinMaxScaler (default settings) on every column of `data`, then wrap
# the resulting array in a DataFrame that carries the original column labels
# and row index.
scaler = MinMaxScaler()
data_scaled = pd.DataFrame(
    scaler.fit_transform(data),
    index=data.index,
    columns=data.columns,
)
data_scaled.head(n=5)

In [None]:
# MICE-style imputation (IterativeImputer, default settings) on the scaled data.
# The row index and column labels are taken from `data_scaled`, the frame that
# is actually imputed, so the result stays aligned with its input even if
# `data` has since been modified by another cell (e.g. had its index reset).
imputer = IterativeImputer()
data_iterative_imputed = pd.DataFrame(
    imputer.fit_transform(data_scaled),
    index=data_scaled.index,
    columns=data_scaled.columns,
)
data_iterative_imputed.head()

In [None]:
# Move MRN from the index back into a regular column before exporting.
# The guard keeps this cell idempotent: calling reset_index() again on an
# already-reset frame would add a spurious 'index' column to the CSV.
if 'MRN' not in data_iterative_imputed.columns:
    data_iterative_imputed = data_iterative_imputed.reset_index()
data_iterative_imputed.to_csv('processed_data/demo_labs_mice_imputed_scaled.csv', index=False)

In [None]:
# Export the merged, un-imputed data with MRN as a regular column.
# The guard keeps this cell idempotent: calling reset_index() again on an
# already-reset frame would add a spurious 'index' column to the CSV.
if 'MRN' not in data.columns:
    data = data.reset_index()
data.to_csv('processed_data/demo_labs_raw_w_missing.csv', index=False)