# Imports


In [1]:
import os
import pandas as pd
import pickle
import time
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import psycopg2
import time
import shap
import warnings
pd.set_option('display.max_columns', None)
# Suppress Matplotlib warnings
warnings.filterwarnings("ignore")

In [2]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [3]:
demo_data = pd.read_csv('Data/Admissions_Data.csv')
hadm_id_list = tuple(demo_data['hadm_id'].unique())
len(hadm_id_list)

150

In [4]:
demo_data.head()

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,transtime,admission_type,admission_location,discharge_location,marital_status,race
0,12202842,26238645,2/2/2024,2/5/2024,,SURGICAL SAME DAY ADMISSION,PHYSICIAN REFERRAL,SKILLED NURSING FACILITY,MARRIED,OTHER
1,13700862,26342274,2/2/2024,2/6/2024,,ELECTIVE,PHYSICIAN REFERRAL,HOME HEALTH CARE,MARRIED,WHITE
2,12827707,28832353,2/2/2024,2/3/2024,,DIRECT OBSERVATION,PHYSICIAN REFERRAL,,SINGLE,WHITE
3,13662342,23248569,2/2/2024,2/8/2024,,ELECTIVE,PHYSICIAN REFERRAL,HOME,MARRIED,ASIAN - CHINESE
4,17467543,20514821,2/3/2024,2/7/2024,,SURGICAL SAME DAY ADMISSION,PHYSICIAN REFERRAL,HOME HEALTH CARE,SINGLE,ASIAN - CHINESE


In [5]:
def db_connection(query):
    # Database connection parameters
    db_name = 'factihealth'   # Database name
    db_user = 'fh_user'  # Username
    db_password = 'Facti@874'  # Password
    db_host = 'redshift-cluster-factihealth.cuzgotkwtow6.ap-south-1.redshift.amazonaws.com'  # Cluster endpoint
    db_port = 5439  # Port
    # Connect to the database
    try:
        conn = psycopg2.connect(
            dbname=db_name,
            user=db_user,
            password=db_password,
            host=db_host,
            port=db_port
        )
        print("Connected to the database successfully")
        # Create a cursor object
        cur = conn.cursor()
        # Execute a query
        cur.execute(query)
        rows = cur.fetchall()
        
        # Get column names from the cursor description
        col_names = [desc[0] for desc in cur.description]
        
        # Create a DataFrame from the fetched rows and column names
        df = pd.DataFrame(rows, columns=col_names)
        # Close the cursor and connection
        cur.close()
        conn.close()
        
        return df
    except Exception as e:
        print(f"Database connection failed due to {e}")
        
# Get Table Data based on table name
def get_db_data(table_name, patients_ids=()):
    ''' Get Data based on the table name provided '''
    # query = f"""SELECT * FROM factihealth.mimic.{table_name} 
    #             WHERE subject_id in {patients_ids}
    #             """
    query = f"SELECT * FROM factihealth.mimic.{table_name}"
    if patients_ids:
        query += f" WHERE hadm_id IN {patients_ids}"
#     print(query)
    df = db_connection(query)
    return df

In [6]:
admissions_df = get_db_data('admissions', hadm_id_list)
admissions_df['date'] = admissions_df['admittime'].dt.date
# admissions_df = admissions_df[admissions_df['hadm_id'].isin(hadm_id_list)]
admissions_df.shape

Connected to the database successfully


(300, 17)

In [7]:
diagnosis_icd_df = get_db_data('diagnosis_icd', hadm_id_list)
diagnosis_icd_df.shape

Connected to the database successfully


(1530, 5)

In [8]:
icustays_df = get_db_data('icustays', hadm_id_list)
icustays_df.shape

Connected to the database successfully


(29, 8)

In [9]:
chartevents_df = get_db_data('chartevents', hadm_id_list)
chartevents_df.shape

Connected to the database successfully


(98641, 11)

In [10]:
patients_df = get_db_data('patients')
patients_df.shape

Connected to the database successfully


(299712, 6)

In [11]:
def load_data():
    ## Admissions
    admissions_df = get_db_data('admissions', hadm_id_list)
    admissions_df['date'] = admissions_df['admittime'].dt.date
    
    ## Diagnosis ICD
    diagnosis_icd_df = get_db_data('diagnosis_icd', hadm_id_list)
    
    ## ICU Stays
    icustays_df = get_db_data('icustays', hadm_id_list)
    ## Chart Events
    chartevents_df = get_db_data('chartevents', hadm_id_list)
    ## Patient Data
    patients_df = get_db_data('patients')
    
    return admissions_df, diagnosis_icd_df, icustays_df, chartevents_df, patients_df
    

In [12]:
admissions_df, diagnosis_icd_df, icustays_df, chartevents_df, patients_df = load_data()

Connected to the database successfully
Connected to the database successfully
Connected to the database successfully
Connected to the database successfully
Connected to the database successfully


In [13]:
admissions_df.shape, diagnosis_icd_df.shape, icustays_df.shape, chartevents_df.shape, patients_df.shape

((300, 17), (1530, 5), (29, 8), (98641, 11), (299712, 6))

In [14]:


def categorize_race(admin_diag):
    ## Recategorize Race
    admin_diag['race'] = np.where(admin_diag['race'].isin(['AMERICAN INDIAN/ALASKA NATIVE']), 'AMERICAN INDIAN/ALASKA NATIVE',
                            np.where(admin_diag['race'].isin(['ASIAN', 'ASIAN - ASIAN INDIAN', 'ASIAN - CHINESE',
                                                                           'ASIAN - KOREAN', 'ASIAN - SOUTH EAST ASIAN']),
                                     'ASIAN',
                            np.where(admin_diag['race'].isin(['BLACK/AFRICAN', 'BLACK/AFRICAN AMERICAN', 
                                                                           'BLACK/CAPE VERDEAN', 'BLACK/CARIBBEAN ISLAND']),
                                     'BLACK/AFRICAN',
                            np.where(admin_diag['race'].isin(['HISPANIC OR LATINO', 'HISPANIC/LATINO - CENTRAL AMERICAN',
                                                                           'HISPANIC/LATINO - COLUMBIAN', 'HISPANIC/LATINO - CUBAN',
                                                                           'HISPANIC/LATINO - DOMINICAN', 'HISPANIC/LATINO - GUATEMALAN', 
                                                                           'HISPANIC/LATINO - HONDURAN', 'HISPANIC/LATINO - MEXICAN', 
                                                                           'HISPANIC/LATINO - PUERTO RICAN','HISPANIC/LATINO - SALVADORAN']), 
                                     'HISPANIC OR LATINO',
                            np.where(admin_diag['race'].isin(['NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER']), 
                                     'NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER',
                            np.where(admin_diag['race'].isin(['MULTIPLE RACE/race', 'OTHER', 
                                                                      'PATIENT DECLINED TO ANSWER', 'UNABLE TO OBTAIN', 'UNKNOWN']), 
                                     'OTHER',
                            np.where(admin_diag['race'].isin(['PORTUGUESE']), 'PORTUGUESE',
                            np.where(admin_diag['race'].isin(['SOUTH AMERICAN']), 'SOUTH AMERICAN',
                            np.where(admin_diag['race'].isin(['WHITE', 'WHITE - BRAZILIAN', 'WHITE - EASTERN EUROPEAN',
                                                                           'WHITE - OTHER EUROPEAN', 'WHITE - RUSSIAN']), 
                                     'WHITE',
                                     np.nan)))))))))
    

    return admin_diag



# Function to categorize ICD codes
def categorize_icd(icd):
    try:
        if 1 <= int(icd) <= 139:
            return 'Infectious and Parasitic Diseases'
        elif 140 <= int(icd) <= 239:
            return 'Neoplasms'
        elif 240 <= int(icd) <= 279:
            return 'Endocrine, Nutritional, and Metabolic Diseases'
        elif 280 <= int(icd) <= 289:
            return 'Diseases of the Blood and Blood-forming Organs'
        elif 290 <= int(icd) <= 319:
            return 'Mental Disorders'
        elif 320 <= int(icd) <= 389:
            return 'Diseases of the Nervous System and Sense Organs'
        elif 390 <= int(icd) <= 459:
            return 'Diseases of the Circulatory System'
        elif 460 <= int(icd) <= 519:
            return 'Diseases of the Respiratory System'
        elif 520 <= int(icd) <= 579:
            return 'Diseases of the Digestive System'
        elif 580 <= int(icd) <= 629:
            return 'Diseases of the Genitourinary System'
        elif 630 <= int(icd) <= 679:
            return 'Complications of Pregnancy, Childbirth, and the Puerperium'
        elif 680 <= int(icd) <= 709:
            return 'Diseases of the Skin and Subcutaneous Tissue'
        elif 710 <= int(icd) <= 739:
            return 'Diseases of the Musculoskeletal System and Connective Tissue'
        elif 740 <= int(icd) <= 759:
            return 'Congenital Anomalies'
        elif 760 <= int(icd) <= 779:
            return 'Certain Conditions Originating in the Perinatal Period'
        elif 780 <= int(icd) <= 799:
            return 'Symptoms, Signs, and Ill-defined Conditions'
        elif 800 <= int(icd) <= 999:
            return 'Injury and Poisoning'
        
    except ValueError:
        if icd[0] == 'A' or icd[0] == 'B':
            return 'Infectious and Parasitic Diseases'
        elif icd[0] == 'C':
            return 'Neoplasms'
        elif icd[0] == 'D':
            return 'Diseases of the Blood and Blood-forming Organs'
        elif icd[0] == 'E':
            return 'External causes of injury and supplemental classification'
        elif icd[0] == 'F':
            return 'Mental Disorders'
        elif icd[0] == 'G':
            return 'Diseases of the Nervous System and Sense Organs'
        elif icd[0] == 'H':
            return 'Diseases of the eye, adnexa and mastoid process'
        elif icd[0] == 'I':
            return 'Diseases of the Circulatory System'
        elif icd[0] == 'J':
            return 'Diseases of the Respiratory System'
        elif icd[0] == 'K':
            return 'Diseases of the Digestive System'
        elif icd[0] == 'L':
            return 'Diseases of the Skin and Subcutaneous Tissue'
        elif icd[0] == 'M':
            return 'Diseases of the Musculoskeletal System and Connective Tissue'
        elif icd[0] == 'N':
            return 'Diseases of the Genitourinary System'
        elif icd[0] == 'O':
            return 'Complications of Pregnancy, Childbirth, and the Puerperium'
        elif icd[0] == 'P':
            return 'Certain Conditions Originating in the Perinatal Period'
        elif icd[0] == 'Q':
            return 'Congenital Anomalies'
        elif icd[0] == 'R':
            return 'Symptoms, Signs, and Ill-defined Conditions'
        elif icd[0] == 'S' or icd[0] == 'T' :
            return 'Injury and Poisoning'
        elif icd[0] == 'U':
            return 'Codes for special purposes'
        elif icd[0] in ['V', 'W', 'X' ,'Y']:
            return 'External causes of injury and supplemental classification'
        elif icd[0] == 'Z':
            return 'Factors influencing health status and contact with health services'
  


In [15]:
diagnosis_icd_df['icd'] = diagnosis_icd_df.icd_code.str[:3]
diagnosis_icd_df['diagnosis'] = diagnosis_icd_df['icd'].apply(categorize_icd)
diagnosis_icd_df = diagnosis_icd_df[['subject_id', 'hadm_id', 'diagnosis']]

In [16]:
## Merge Diagnosis Data with Diagnosis Code(ICD)
patients_df = patients_df[['subject_id', 'gender', 'anchor_age']]
pat_diagn_data = pd.merge(diagnosis_icd_df, patients_df, on='subject_id', how='left')
pat_diagn_data.drop_duplicates(inplace=True)

In [17]:
admin_diag = pd.merge(admissions_df, pat_diagn_data, on=['subject_id','hadm_id'], 
                          how='left')

## Select the required Columns
admin_diag = admin_diag[['subject_id', 'hadm_id', 'admittime','admission_type',
                         'admission_location','discharge_location', 'insurance', 
                         'language', 'marital_status', 'race',
                         'gender', 'anchor_age', 'diagnosis']]

In [18]:
## Convert Time to Date
admin_diag['admittime'] = pd.to_datetime(admin_diag['admittime'])
admin_diag['admitdate'] = admin_diag['admittime'].dt.date

# Sort Values on the admin time
admin_diag.sort_values(['subject_id', 'admittime'], inplace=True)

## Get the time difference
admin_diag['admitdate_diff'] = admin_diag.groupby('subject_id')['admittime'].diff()

# Replace 0s with NaN
admin_diag['admitdate_diff'] = admin_diag['admitdate_diff'].replace(pd.Timedelta(0), np.nan)

# Forward fill NaN values
admin_diag['admitdate_diff'] = admin_diag['admitdate_diff'].fillna(method='ffill')


In [19]:
# Define boolean conditions for each range
admin_diag['<30'] = admin_diag['admitdate_diff'] <= pd.Timedelta(days=30)
admin_diag['<60'] = (admin_diag['admitdate_diff'] <= pd.Timedelta(days=60))
admin_diag['>365'] = admin_diag['admitdate_diff'] > pd.Timedelta(days=365)

# Convert boolean values to integers (0 and 1)
admin_diag['<30'] = admin_diag['<30'].astype(int)
admin_diag['<60'] = admin_diag['<60'].astype(int)
admin_diag['>365'] = admin_diag['>365'].astype(int)
## Convert to Days
admin_diag['admitdate_diff'] = admin_diag['admitdate_diff'].dt.days

In [20]:
## Recategorize Admission Type
conditions = admin_diag['admission_type'].isin(['EW EMER.', 'URGENT', 'DIRECT EMER.'])
admin_diag['admission_type'] = np.where(conditions, 'Emergency', admin_diag['admission_type'])

conditions = admin_diag['admission_type'].isin(['OBSERVATION ADMIT', 'EU OBSERVATION', 'DIRECT OBSERVATION', 'AMBULATORY OBSERVATION'])
admin_diag['admission_type'] = np.where(conditions, 'Observation', admin_diag['admission_type'])

conditions = admin_diag['admission_type'].isin(['SURGICAL SAME DAY ADMISSION'])
admin_diag['admission_type'] = np.where(conditions, 'Surgical Same Day Admission', admin_diag['admission_type'])

conditions = admin_diag['admission_type'].isin(['ELECTIVE'])
admin_diag['admission_type'] = np.where(conditions, 'Elective', admin_diag['admission_type'])

# =============================================================================
#     CATEGPROIZE RACES
# =============================================================================
admin_diag = categorize_race(admin_diag)

In [21]:
# =============================================================================
#     Merge Admission Data with Chart events
# =============================================================================
master_merged = pd.merge(admin_diag, chartevents_df, on=['hadm_id','subject_id'], how='left')
master_merged = master_merged.fillna(0)

## Select the required columns
columns= ['subject_id', 'hadm_id', 'admittime', 'admission_type', 'admission_location',
          'insurance', 'language', 'marital_status', 'race','diagnosis', 'gender',
          'anchor_age','admitdate', 'admitdate_diff','valuenum', 'valueuom', 
          '<30', '<60', '>365']
master_merged = master_merged[columns]


master_merged.subject_id.nunique()
master_merged.hadm_id.nunique()

150

In [22]:
## Explode the columns which have the same values

explode_colomns = ['anchor_age', 'admission_type','gender','admission_location', 
                   'insurance', 'language', 'marital_status', 'race', 'diagnosis', 'admitdate_diff', '<30', '<60', '>365']
master_merged_df = master_merged.groupby(['subject_id', 'hadm_id','valuenum', 'valueuom'])[explode_colomns].agg(list).reset_index()

## Convert 'Nan' to NaN
master_merged_df['race'] = master_merged_df['race'].replace('nan', np.nan)

In [23]:
# Apply the lambda function to each specified columnto get the unique value in the list
## This is to ensure same column have the unique value. eg. [EMERGENCY ROOM, EMERGENCY ROOM,....] to EMERGENCY ROOM
## Diagnosis is been considered because it may or may not have unique values.

for col in explode_colomns:
    if col!= 'diagnosis':
        master_merged_df[col] = master_merged_df[col].apply(lambda x: x[0] if isinstance(x, list) and len(x) > 0 else None)

## Convert 'Nan' to NaN
master_merged_df['race'] = master_merged_df['race'].replace('nan', np.nan)

In [24]:
# =============================================================================
#     Create Dummies
# =============================================================================
# Perform one-hot encoding for nominal categorical variables
cat_columns=['admission_type', 'admission_location', 'insurance', 'language',
             'marital_status', 'race', 'gender', 'valueuom']

master_data = pd.get_dummies(master_merged_df, columns=cat_columns)

dummy_df = pd.get_dummies(master_merged_df['diagnosis'].explode(), prefix='diagnosis')
master_data = pd.concat([master_merged_df, dummy_df.groupby(level=0).max()], axis=1)
master_data.drop(columns=['diagnosis'], inplace=True)
print(master_data.shape)

(8941, 35)


In [25]:

cat_columns=['valueuom', 'admission_type', 'gender', 'admission_location', 
         'insurance', 'language', 'marital_status', 'race'  ]


master_data = pd.get_dummies(master_data, columns=cat_columns, drop_first=True)
print(master_data.shape)
master_data.dropna(inplace=True)
print(master_data.shape)

(8941, 80)
(8941, 80)


In [26]:
# =============================================================================
#     Create the exact same no of columns as present in the Model Building
# =============================================================================
all_columns = ['subject_id', 'hadm_id', 'valuenum', 'anchor_age', 'admitdate_diff',
               '<30', '<60', '>365',
               'diagnosis_Certain Conditions Originating in the Perinatal Period',
               'diagnosis_Complications of Pregnancy, Childbirth, and the Puerperium',
               'diagnosis_Congenital Anomalies',
               'diagnosis_Diseases of the Blood and Blood-forming Organs',
               'diagnosis_Diseases of the Circulatory System',
               'diagnosis_Diseases of the Digestive System',
               'diagnosis_Diseases of the Genitourinary System',
               'diagnosis_Diseases of the Musculoskeletal System and Connective Tissue',
               'diagnosis_Diseases of the Nervous System and Sense Organs',
               'diagnosis_Diseases of the Respiratory System',
               'diagnosis_Diseases of the Skin and Subcutaneous Tissue',
               'diagnosis_Diseases of the eye, adnexa and mastoid process',
               'diagnosis_Endocrine, Nutritional, and Metabolic Diseases',
               'diagnosis_External causes of injury and supplemental classification',
               'diagnosis_Factors influencing health status and contact with health services',
               'diagnosis_Infectious and Parasitic Diseases',
               'diagnosis_Injury and Poisoning', 'diagnosis_Mental Disorders',
               'diagnosis_Neoplasms',
               'diagnosis_Symptoms, Signs, and Ill-defined Conditions', 'valueuom_bpm',
               'valueuom_insp/min', 'valueuom_kg', 'valueuom_mg/dL', 'valueuom_mmHg',
               'valueuom_units', 'valueuom_°F', 'admission_type_Emergency',
               'admission_type_Observation',
               'admission_type_Surgical Same Day Admission', 'gender_M',
               'admission_location_CLINIC REFERRAL',
               'admission_location_EMERGENCY ROOM',
               'admission_location_INFORMATION NOT AVAILABLE',
               'admission_location_INTERNAL TRANSFER TO OR FROM PSYCH',
               'admission_location_PACU', 'admission_location_PHYSICIAN REFERRAL',
               'admission_location_PROCEDURE SITE',
               'admission_location_TRANSFER FROM HOSPITAL',
               'admission_location_TRANSFER FROM SKILLED NURSING FACILITY',
               'admission_location_WALK-IN/SELF REFERRAL', 'insurance_Medicare',
               'insurance_Other', 'language_ENGLISH', 'marital_status_MARRIED',
               'marital_status_SINGLE', 'marital_status_WIDOWED', 'race_ASIAN',
               'race_BLACK/AFRICAN', 'race_HISPANIC OR LATINO',
               'race_NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER', 'race_OTHER',
               'race_PORTUGUESE', 'race_SOUTH AMERICAN', 'race_WHITE']

master_columns = master_data.columns

In [27]:
master_merged_df.shape, master_data.shape

((8941, 17), (8941, 80))

In [28]:
# Filter out columns from all_columns that are not present in master_columns
missing_columns = [column for column in all_columns if column not in master_columns]
# Fill missing columns with False
for column in missing_columns:
    master_data[column] = False

## Rearrange the columns
master_data = master_data[all_columns]
demo_data_df = master_data.drop(['subject_id', 'hadm_id', 'admitdate_diff',
                                '<30', '<60', '>365'], axis=1)

# demo_data.drop_duplicates(inplace=True)

In [29]:
master_merged_df.shape, master_data.shape, demo_data_df.shape

((8941, 17), (8941, 63), (8941, 57))

In [30]:

def load_model(master_data):
    # Load the pickled model
    
    forecasts = pd.DataFrame()
    with open('Models/readmission_model.pkl', 'rb') as file:
        loaded_model = pickle.load(file)
        
    for key, model_path in loaded_model.items():
        forecasts[key] = model_path.predict(master_data)
        
    return forecasts
 
forecasts = load_model(demo_data_df)
forecasts

Unnamed: 0,30_days,60_days,365_days
0,0,0,1
1,0,0,0
2,0,0,0
3,0,0,0
4,0,0,1
...,...,...,...
8936,0,0,0
8937,0,0,0
8938,0,0,0
8939,0,0,0


In [31]:
forecasts.shape, demo_data.shape

((8941, 3), (150, 10))

In [37]:
master_merged_df.columns

Index(['subject_id', 'hadm_id', 'valuenum', 'valueuom', 'anchor_age',
       'admission_type', 'gender', 'admission_location', 'insurance',
       'language', 'marital_status', 'race', 'diagnosis', 'admitdate_diff',
       '<30', '<60', '>365'],
      dtype='object')

In [32]:
final = pd.concat([master_merged_df[['subject_id', 'hadm_id', 'anchor_age',
                                             'admission_type', 'gender', 'admission_location',
                                             'marital_status', 'race']], 
                           forecasts], axis=1)
final.head(10)

Unnamed: 0,subject_id,hadm_id,anchor_age,admission_type,gender,admission_location,marital_status,race,30_days,60_days,365_days
0,10169750,21487351,31,Emergency,F,PHYSICIAN REFERRAL,MARRIED,WHITE,0,0,1
1,10297112,23661596,67,Emergency,M,PHYSICIAN REFERRAL,MARRIED,WHITE,0,0,0
2,10297112,24851763,67,Surgical Same Day Admission,M,PHYSICIAN REFERRAL,MARRIED,WHITE,0,0,0
3,10297112,24851763,67,Surgical Same Day Admission,M,PHYSICIAN REFERRAL,MARRIED,WHITE,0,0,0
4,10297112,24851763,67,Surgical Same Day Admission,M,PHYSICIAN REFERRAL,MARRIED,WHITE,0,0,1
5,10297112,24851763,67,Surgical Same Day Admission,M,PHYSICIAN REFERRAL,MARRIED,WHITE,0,0,1
6,10297112,24851763,67,Surgical Same Day Admission,M,PHYSICIAN REFERRAL,MARRIED,WHITE,0,0,1
7,10297112,24851763,67,Surgical Same Day Admission,M,PHYSICIAN REFERRAL,MARRIED,WHITE,0,0,1
8,10297112,24851763,67,Surgical Same Day Admission,M,PHYSICIAN REFERRAL,MARRIED,WHITE,0,0,0
9,10297112,24851763,67,Surgical Same Day Admission,M,PHYSICIAN REFERRAL,MARRIED,WHITE,0,0,1


In [33]:
final.shape

(8941, 11)

In [66]:
df = final.groupby(['subject_id', 'hadm_id', 'anchor_age',
                        'admission_type', 'gender', 'admission_location',
                        'marital_status', 'race'])[['30_days','60_days','365_days']].max().reset_index()
df.head()

Unnamed: 0,subject_id,hadm_id,anchor_age,admission_type,gender,admission_location,marital_status,race,30_days,60_days,365_days
0,10169750,21487351,31,Emergency,F,PHYSICIAN REFERRAL,MARRIED,WHITE,0,0,1
1,10297112,23661596,67,Emergency,M,PHYSICIAN REFERRAL,MARRIED,WHITE,0,0,0
2,10297112,24851763,67,Surgical Same Day Admission,M,PHYSICIAN REFERRAL,MARRIED,WHITE,0,0,1
3,10355373,23079531,60,Observation,M,PHYSICIAN REFERRAL,MARRIED,OTHER,1,1,0
4,10367532,23086146,60,Emergency,F,TRANSFER FROM HOSPITAL,MARRIED,WHITE,0,0,0


In [67]:
df.shape

(150, 11)

In [101]:
final_demo_st = pd.merge(demo_data, df, on=['hadm_id', 'admission_location', 'marital_status'], how='left')
final_demo_st = final_demo_st[['subject_id_x', 'hadm_id', 'dischtime', 'anchor_age', 'admission_type_y', 'race_y', 'gender',
                               'admission_location', 'marital_status', '30_days', '60_days', '365_days']]
final_demo_st.rename(columns={'admission_type_y': 'admission_type',
                              'subject_id_x': 'subject_id',
                              'race_y': 'race'}, inplace=True)

final_demo_st.head()

Unnamed: 0,subject_id,hadm_id,dischtime,anchor_age,admission_type,race,gender,admission_location,marital_status,30_days,60_days,365_days
0,12202842,26238645,2/5/2024,72,Surgical Same Day Admission,OTHER,F,PHYSICIAN REFERRAL,MARRIED,0,0,0
1,13700862,26342274,2/6/2024,57,Elective,WHITE,F,PHYSICIAN REFERRAL,MARRIED,0,0,0
2,12827707,28832353,2/3/2024,33,Observation,WHITE,M,PHYSICIAN REFERRAL,SINGLE,0,0,0
3,13662342,23248569,2/8/2024,60,Elective,ASIAN,M,PHYSICIAN REFERRAL,MARRIED,0,1,0
4,17467543,20514821,2/7/2024,38,Surgical Same Day Admission,ASIAN,F,PHYSICIAN REFERRAL,SINGLE,0,0,0


In [108]:
final_demo_st['dischtime'] = pd.to_datetime(final_demo_st['dischtime']).dt.strftime('%d/%m/%Y')
final_demo_st.head()


Unnamed: 0,subject_id,hadm_id,dischtime,anchor_age,admission_type,race,gender,admission_location,marital_status,30_days,60_days,365_days
0,12202842,26238645,05/02/2024,72,Surgical Same Day Admission,OTHER,F,PHYSICIAN REFERRAL,MARRIED,0,0,0
1,13700862,26342274,06/02/2024,57,Elective,WHITE,F,PHYSICIAN REFERRAL,MARRIED,0,0,0
2,12827707,28832353,03/02/2024,33,Observation,WHITE,M,PHYSICIAN REFERRAL,SINGLE,0,0,0
3,13662342,23248569,08/02/2024,60,Elective,ASIAN,M,PHYSICIAN REFERRAL,MARRIED,0,1,0
4,17467543,20514821,07/02/2024,38,Surgical Same Day Admission,ASIAN,F,PHYSICIAN REFERRAL,SINGLE,0,0,0


In [109]:
final_demo_st.shape

(150, 12)

In [110]:
final_demo_st.to_csv('Data/Readmission_static.csv', index=False)

In [111]:
final_demo_st.hadm_id.nunique(), df.hadm_id.nunique(), demo_data.hadm_id.nunique(),

(150, 150, 150)