In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import GradientBoostingRegressor
import pickle

from los_preprocess import los_preprocess
from redshift_read_table import read_table

In [2]:
# Cohort to score: this CSV supplies the admission (hadm_id) and patient
# (subject_id) identifiers that the SQL queries further down filter on.
sample_set = pd.read_csv(
    "/Users/abhishek/Documents/GitHub/Factspan/factihealth-data-science/Abhishek/LOS/app_hadm_ids.csv"
)

# De-duplicate each identifier column and freeze it as a tuple
# (hadm_id first, then subject_id).
hadm_id, subject_id = (
    tuple(sample_set[id_column].unique())
    for id_column in ("hadm_id", "subject_id")
)

In [3]:
def _sql_in_list(ids):
    """Render ``ids`` as a parenthesised, comma-separated SQL list.

    Formatting a tuple straight into the query is fragile:
    a one-element tuple renders as ``(1,)`` (trailing comma), and NumPy >= 2
    scalars render as ``np.int64(1)`` because tuple formatting uses each
    element's repr. Coercing every value through ``int`` guarantees the
    rendered text contains plain integer literals only.

    Args:
        ids: iterable of integer-like identifiers.

    Returns:
        A string such as ``"(1, 2, 3)"`` suitable for ``WHERE col IN {}``.

    Raises:
        ValueError: if ``ids`` is empty (an empty ``IN ()`` list is a syntax
            error in most SQL dialects), or if a value cannot be converted
            to ``int``.
    """
    literals = [str(int(value)) for value in ids]
    if not literals:
        raise ValueError("cannot build an SQL IN list from an empty id collection")
    return "({})".format(", ".join(literals))


# Render each id collection once; three of the queries share the hadm_id list.
_hadm_id_list = _sql_in_list(hadm_id)
_subject_id_list = _sql_in_list(subject_id)

# Admissions rows for the selected hospital admissions.
admission_query = """
SELECT *
FROM mimic.admissions
WHERE hadm_id in {}
""".format(_hadm_id_list)

# Patient rows for the selected subjects.
patient_query = """
SELECT *
FROM mimic.patients
WHERE subject_id in {}
""".format(_subject_id_list)

# ICD diagnosis rows for the selected hospital admissions.
diagnoses_icd_query = """
SELECT *
FROM mimic.diagnosis_icd
WHERE hadm_id in {}
""".format(_hadm_id_list)

# ICU stay rows for the selected hospital admissions.
icustays_query = """
SELECT *
FROM mimic.icustays
WHERE hadm_id in {}
""".format(_hadm_id_list)

In [4]:
# Run the four queries in a fixed order and bind each result:
#   df          - primary admissions information
#   df_pat      - patient-specific info such as gender
#   df_diagcode - diagnoses for each hospital admission
#   df_icu      - Intensive Care Unit (ICU) stays for each hospital admission
df, df_pat, df_diagcode, df_icu = (
    read_table(query)
    for query in (
        admission_query,
        patient_query,
        diagnoses_icd_query,
        icustays_query,
    )
)

In [5]:
# Run the project preprocessing over the four raw tables. The call returns
# four values, unpacked in order: the hadm_id Series, the cleaned feature
# frame, and two LOS summary values (median and mean, judging by the names —
# confirm against los_preprocess).
(df_hadm_id, df_clean, actual_median_los, actual_mean_los) = los_preprocess(
    df=df,
    df_pat=df_pat,
    df_diagcode=df_diagcode,
    df_icu=df_icu,
)

(1/5) Completed dataframe imports
(2/5) Completed ADMISSIONS df cleanup and feature engineering.
(3/5) Completed DIAGNOSES_ICD df cleanup and feature engineering.
(4/5) Completed PATIENT df cleanup and feature engineering.
(5/5) Completed ICUSTAYS.csv cleanup and feature engineering.
Data Preprocessing complete.
98
98


In [6]:
# Display the preprocessed frame: the `los` column plus the engineered
# diagnosis-count and one-hot indicator columns.
df_clean

Unnamed: 0,los,blood,circulatory,congenital,digestive,endocrine,genitourinary,infectious,injury,mental,...,RACE_WHITE,AGE_middle_adult,AGE_senior,AGE_young_adult,MAR_DIVORCED,MAR_MARRIED,MAR_SINGLE,MAR_WIDOWED,Neuro Surgical Intensive Care Unit (Neuro SICU),MAR_UNKNOWN (DEFAULT)
0,3.420139,0,2,0,1,3,0,0,2,0,...,0,0,1,0,0,1,0,0,0,0
2,4.336806,1,0,0,1,2,0,0,0,0,...,1,0,1,0,0,1,0,0,0,0
4,0.944444,0,0,0,1,0,0,0,2,0,...,1,0,0,1,0,0,1,0,0,0
6,6.252083,2,2,0,1,4,0,0,8,0,...,0,0,1,0,0,1,0,0,0,0
8,3.986111,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190,0.848611,0,3,0,0,1,1,0,3,0,...,1,1,0,0,0,1,0,0,0,0
192,15.322917,0,2,0,1,0,0,0,0,1,...,1,1,0,0,0,1,0,0,0,0
194,5.944444,1,0,0,0,2,1,0,2,0,...,1,1,0,0,0,1,0,0,0,0
196,4.004861,1,2,0,0,3,0,1,3,0,...,0,0,1,0,0,0,1,0,0,0


In [7]:
# Display the admission ids returned by los_preprocess alongside df_clean.
df_hadm_id

0      26238645
2      26342274
4      28832353
6      23248569
8      21801660
         ...   
190    26654047
192    22826818
194    29252948
196    25042148
200    28400535
Name: hadm_id, Length: 98, dtype: int64

In [8]:
# Load the previously trained length-of-stay model from its pickle file.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load model files that come from a trusted source.
# NOTE(review): the path is absolute and machine-specific, so this cell only
# runs on the original author's machine as written.
with open('/Users/abhishek/Documents/GitHub/Factspan/factihealth-data-science/Abhishek/LOS/los_model_xgboost.pkl', 'rb') as file:
    model = pickle.load(file)


In [9]:
# Feature columns handed to the model, in a fixed, explicit order.
# NOTE(review): presumably this matches the column order used when the model
# was trained — confirm against the training code before reordering.
FEATURE_COLUMNS = [
    # Diagnosis-category columns
    'blood', 'circulatory', 'congenital', 'digestive', 'endocrine',
    'genitourinary', 'infectious', 'injury', 'mental', 'misc', 'muscular',
    'neoplasms', 'nervous', 'pregnancy', 'prenatal', 'respiratory', 'skin',
    # Patient gender
    'gender',
    # Care-unit columns
    'Neuro Intermediate',
    'Neuro Surgical Intensive Care Unit (Neuro SICU)',
    'Other-ICU',
    # Admission-type columns
    'ADM_ELECTIVE', 'ADM_EMERGENCY', 'ADM_OBSERVATION',
    'ADM_SURGICAL SAME DAY ADMISSION',
    # Insurance columns
    'INS_Medicaid', 'INS_Medicare', 'INS_Other',
    # Race columns
    'RACE_ASIAN', 'RACE_BLACK/AFRICAN AMERICAN', 'RACE_HISPANIC/LATINO',
    'RACE_OTHER/UNKNOWN', 'RACE_WHITE',
    # Age-group columns
    'AGE_middle_adult', 'AGE_senior', 'AGE_young_adult',
    # Marital-status columns
    'MAR_DIVORCED', 'MAR_MARRIED', 'MAR_SINGLE', 'MAR_UNKNOWN (DEFAULT)',
    'MAR_WIDOWED',
]

# Keep only the model inputs; `los` and any other columns are left out.
df_input = df_clean[FEATURE_COLUMNS]

In [10]:
# Row count of the model input (one row per admission to be scored).
df_input.shape[0]

98

In [11]:
# Score every row of df_input with the loaded model; the result is a NumPy
# array with one prediction per input row.
predictions = model.predict(df_input)


In [12]:
# Sanity check before combining ids with predictions: show the container
# types first, then the lengths (which should be equal).
print(type(df_hadm_id), type(predictions), sep="\n")
print(len(df_hadm_id), len(predictions), sep="\n")

<class 'pandas.core.series.Series'>
<class 'numpy.ndarray'>
98
98


In [13]:
# Column mapping for the output table: admission id -> predicted LOS.
data = dict(
    hadm_id=df_hadm_id,
    predicted_los=predictions,
)

In [14]:
# Build the output table. Because `hadm_id` is a Series, the frame takes its
# index from that Series (0, 2, 4, ...); the prediction array has no index of
# its own and is placed positionally, so its length must match.
los_predictions_df = pd.DataFrame(data)


In [15]:
# Round each prediction to the nearest whole number (presumably days — confirm
# the LOS unit). The column stays floating point (e.g. 3.0), and the
# assignment overwrites the unrounded values in place.
los_predictions_df['predicted_los'] = los_predictions_df['predicted_los'].round(0)

In [16]:
# Export the predictions to a CSV in the current working directory, without
# the DataFrame index.
# NOTE(review): rows are written as-is; no de-duplication is applied before
# the export.
los_predictions_df.to_csv("los_predictions_df.csv", index=False)

In [17]:
# Display the table with fully duplicated rows removed. The result is not
# assigned, so los_predictions_df itself is left unchanged.
los_predictions_df.drop_duplicates()

Unnamed: 0,hadm_id,predicted_los
0,26238645,3.0
2,26342274,5.0
4,28832353,1.0
6,23248569,15.0
8,21801660,3.0
...,...,...
190,26654047,3.0
192,22826818,3.0
194,29252948,3.0
196,25042148,5.0
