In [275]:
import numpy as np
import pandas as pd
from scipy.stats import ttest_ind
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import seaborn as sns

In [276]:
df_admissions = pd.read_csv('Admissions.csv')
df_lab = pd.read_csv('lab.csv')
df_transfusions = pd.read_csv('transfusions.csv')

In [277]:
df_lab.head()

Unnamed: 0,admission_id,test_name,test_code,result_unit,result_value,result_date,result_time
0,4416,Albumin,ALB,g/dL,4.1,2003-08-19,02:25
1,4636,Platelet Count,PLT,E9/L,295.0,2012-02-05,01:42
2,8041,Hemoglobin,RBC,g/L,165.4,2004-04-18,08:05
3,7384,Chloride plasma,CLPL,mmol/L,193.0,2000-04-04,14:55
4,2604,Albumin,ALB,g/dL,6.0,2008-02-01,11:30


In [278]:
# first left join on admissions
left = df_admissions
right = df_lab
df_temp = pd.merge(left, right, how='left', on=['admission_id'])

In [279]:
# final left join on admissions
left = df_temp
right = df_transfusions
df_all = pd.merge(left, right, how='left', on=['admission_id'])

In [280]:
df_all.columns

Index(['admission_id', 'patient_id', 'admission_date', 'admission_time',
       'discharge_date', 'discharge_time', 'hospital', 'age', 'sex',
       'charlson_comorbidity_index', 'lap_score', 'test_name', 'test_code',
       'result_unit', 'result_value', 'result_date', 'result_time',
       'issue_date', 'issue_time', 'rbc_transfusion', 'platelet_transfusion',
       'plasma_transfusion'],
      dtype='object')

In [281]:
len(df_all)

5231

In [282]:
most_fequent_age = df_all['age'].value_counts().index.tolist()[0]
least_fequent_age = df_all['age'].value_counts().index.tolist()[-1]
age_median = df_all['age'].median()
age_mean = df_all['age'].mean()
most_fequent_charlson = df_all['charlson_comorbidity_index'].value_counts().index.tolist()[0]
least_fequent_charlson = df_all['charlson_comorbidity_index'].value_counts().index.tolist()[-1]


In [283]:
age_median

55.0

In [284]:
# imputation for age
for index, row in df_all.iterrows():
    if pd.isna(df_all.iloc[index, 7]):
        df_all.iloc[index, 7] = age_median

# imputation for charlson index
for index, row in df_all.iterrows():
    if pd.isna(df_all.iloc[index, 9]):
        df_all.iloc[index, 9] = most_fequent_charlson

In [285]:
# converting sex to 0 and 1
for index, row in df_all.iterrows():
    if df_all.iloc[index, 8] == "M":
        df_all.iloc[index, 8] = 0
    else:
        df_all.iloc[index, 8] = 1

In [286]:
df_all['rbc_transfusion'].head()

0      NaN
1      NaN
2    False
3      NaN
4      NaN
Name: rbc_transfusion, dtype: object

In [287]:
# separate those who have done rbc transfusion and those who have not
rbc_trans_true = df_all[df_all['rbc_transfusion'] == True]
rbc_trans_false = df_all[df_all['rbc_transfusion'] == False]

In [288]:
# t-test for age
ttest_ind(rbc_trans_false['age'], rbc_trans_true['age'])

Ttest_indResult(statistic=-7.610584618334025, pvalue=9.539871889987221e-14)

In [289]:

# t-test for sex
ttest_ind(rbc_trans_false['sex'], rbc_trans_true['sex'])

Ttest_indResult(statistic=1.1686222184905088, pvalue=0.24298101758275373)

In [290]:
df_all.columns

Index(['admission_id', 'patient_id', 'admission_date', 'admission_time',
       'discharge_date', 'discharge_time', 'hospital', 'age', 'sex',
       'charlson_comorbidity_index', 'lap_score', 'test_name', 'test_code',
       'result_unit', 'result_value', 'result_date', 'result_time',
       'issue_date', 'issue_time', 'rbc_transfusion', 'platelet_transfusion',
       'plasma_transfusion'],
      dtype='object')

In [291]:
# converting categorical variables, "hospital" and "sex", to dummy variables
df_all = pd.concat([df_all, pd.get_dummies(df_all['hospital'], prefix='hosp', drop_first=True)], axis=1)
df_all = pd.concat([df_all, pd.get_dummies(df_all['sex'], prefix='sex', drop_first=True)], axis=1)

In [292]:
df_all_no_null = df_all.dropna()

In [293]:
df_all_no_null['result_value']

2         1.8
23      100.8
32      159.6
52      102.2
80      131.3
        ...  
5177      1.4
5181      1.1
5187    169.1
5189    161.9
5191      1.8
Name: result_value, Length: 415, dtype: float64

In [294]:
df_all_no_null.columns

Index(['admission_id', 'patient_id', 'admission_date', 'admission_time',
       'discharge_date', 'discharge_time', 'hospital', 'age', 'sex',
       'charlson_comorbidity_index', 'lap_score', 'test_name', 'test_code',
       'result_unit', 'result_value', 'result_date', 'result_time',
       'issue_date', 'issue_time', 'rbc_transfusion', 'platelet_transfusion',
       'plasma_transfusion', 'hosp_St. Joseph's Health Centre',
       'hosp_St. Michael's Hospital', 'hosp_Sunnybrook Health Sciences Centre',
       'hosp_Toronto Western Hospital', 'sex_1'],
      dtype='object')

In [295]:
X = df_all_no_null[["age", "sex_1", 
                    "hosp_St. Joseph's Health Centre",
                    "hosp_St. Michael's Hospital", "hosp_Sunnybrook Health Sciences Centre",
                    "hosp_Toronto Western Hospital"]]
y = df_all_no_null['result_value']

In [298]:
X_scaled = X.copy()
scaler_x = StandardScaler().fit(X)
X_scaled = scaler_x.transform(X)

In [299]:
y = np.array(y).reshape(-1, 1)
scaler_y = StandardScaler().fit(y)
y_scaled = scaler_y.transform(y)

In [300]:
# Create train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [301]:
# fit the regression model to the data
lm = LinearRegression().fit(X_train, y_train)

In [302]:
y_test_arr = list(np.array(y_test))
len(y_test_arr)

125

In [303]:
y_pred = lm.predict(X_test)

In [304]:
r2_score(y_test, y_pred)

0.16258130518279335

In [305]:
df_all.columns

Index(['admission_id', 'patient_id', 'admission_date', 'admission_time',
       'discharge_date', 'discharge_time', 'hospital', 'age', 'sex',
       'charlson_comorbidity_index', 'lap_score', 'test_name', 'test_code',
       'result_unit', 'result_value', 'result_date', 'result_time',
       'issue_date', 'issue_time', 'rbc_transfusion', 'platelet_transfusion',
       'plasma_transfusion', 'hosp_St. Joseph's Health Centre',
       'hosp_St. Michael's Hospital', 'hosp_Sunnybrook Health Sciences Centre',
       'hosp_Toronto Western Hospital', 'sex_1'],
      dtype='object')

In [306]:
df_all.dtypes

admission_id                                int64
patient_id                                  int64
admission_date                             object
admission_time                             object
discharge_date                             object
discharge_time                             object
hospital                                   object
age                                       float64
sex                                        object
charlson_comorbidity_index                 object
lap_score                                   int64
test_name                                  object
test_code                                  object
result_unit                                object
result_value                              float64
result_date                                object
result_time                                object
issue_date                                 object
issue_time                                 object
rbc_transfusion                            object


In [310]:
df_all_no_null['admission_date'].head()

2     2002-08-24
23    2002-07-04
32    2010-09-27
52    2010-11-05
80    2011-04-19
Name: admission_date, dtype: object

In [309]:
df_all_no_null['admission_time'].head()

2     14:28
23    08:11
32    10:37
52    15:31
80    11:11
Name: admission_time, dtype: object

In [311]:
df_all_no_null['admission_dt'] = df_all_no_null['admission_date'] + " " + df_all_no_null['admission_time']
df_all_no_null['discharge_dt'] = df_all_no_null['discharge_date'] + " " + df_all_no_null['discharge_time']

In [312]:
df_all_no_null['admission_dt'].head()

2     2002-08-24 14:28
23    2002-07-04 08:11
32    2010-09-27 10:37
52    2010-11-05 15:31
80    2011-04-19 11:11
Name: admission_dt, dtype: object

In [313]:
df_all_no_null['admission_dt'] = pd.to_datetime(df_all_no_null['admission_dt'])
df_all_no_null['discharge_dt'] = pd.to_datetime(df_all_no_null['discharge_dt'])

In [315]:
df_all_no_null['admission_dt'].head()

2    2002-08-24 14:28:00
23   2002-07-04 08:11:00
32   2010-09-27 10:37:00
52   2010-11-05 15:31:00
80   2011-04-19 11:11:00
Name: admission_dt, dtype: datetime64[ns]

In [None]:
df_all_no_null['length_of_stay'] = pd.Timedelta(df_all_no_null['discharge_dt'] - df_all_no_null['admission_dt']).hours()

In [None]:
sns.scatterplot(x=df_all_no_null['length_of_stay'], y=df_all_no_null['charlston_comorbidity_index'])

In [None]:
sns.scatterplot(data=df_all_no_null, x='length_of_stay', y=d'charlston_comorbidity_index', hue="ages", style="age")