# Master's Thesis Analysis

Predicting discharge times of patients??

#### Author: Derek Lee

## Load Requisite Libraries and authenticate to local Postgres database

In [1]:
import pandas as pd
import psycopg2
import sqlalchemy
import time
import psutil
import numpy as np
import multiprocessing as mp

In [2]:
# Check the number of cores and memory usage
num_cores = mp.cpu_count()
print("This kernel has ",num_cores,"cores and you can find the information regarding the memory usage:",psutil.virtual_memory())

This kernel has  8 cores and you can find the information regarding the memory usage: svmem(total=17179869184, available=5131923456, percent=70.1, used=7739101184, free=88588288, active=5046460416, inactive=4780748800, wired=2692640768)


In [3]:
%load_ext sql
from sqlalchemy import create_engine

In [4]:
# Format
# engine = create_engine('dialect+driver://username:password@host:port/database')
# Example format
engine = create_engine('postgresql://postgres:postgres@localhost:5432/mimic')

In [5]:
import json
json_file = open("secrets.json")
variables = json.load(json_file)
json_file.close()

import os
os.environ['POSTGRES_USERNAME'] = variables['POSTGRES_USERNAME']
os.environ['POSTGRES_PASSWORD'] = variables['POSTGRES_PASSWORD']
os.environ['POSTGRES_ADDRESS'] = variables['POSTGRES_ADDRESS']
os.environ['POSTGRES_PORT'] = variables['POSTGRES_PORT']
os.environ['POSTGRES_DBNAME'] = variables['POSTGRES_DBNAME']

In [6]:
# A long string that contains the necessary Postgres login information 
postgres_str = ('postgresql://{username}:{password}@{ipaddress}:{port}/{dbname}'
                .format(username=os.getenv('POSTGRES_USERNAME'),
                        password=os.getenv('POSTGRES_PASSWORD'),
                        ipaddress=os.getenv('POSTGRES_ADDRESS'),
                        port=os.getenv('POSTGRES_PORT'),
                        dbname=os.getenv('POSTGRES_DBNAME'))) 
# Create the connection 
engine = create_engine(postgres_str)

-------------------------------------

### Note: Large Dataset:
Running below will take several minutes because it's a dataset with approx 20 million rows

In [9]:
from datetime import datetime 

start_time = datetime.now() 

import time

data = pd.read_sql('''SELECT * FROM public."thesis_data";''', con=engine)


# data = pd.read_sql('''SELECT * FROM public."thesis_data";''', con=engine, chunksize=1000000)


print('Time elapsed (hh:mm:ss.ms) {}'.format(datetime.now() - start_time))

Time elapsed (hh:mm:ss.ms) 0:20:41.655368


In [13]:
print('data Memory usage by column...')
print(data.memory_usage(index=False, deep=True) / data.shape[0])

data Memory usage by column...
gender                  66.000000
dob                      8.000000
dod                      8.000000
dod_hosp                 8.000000
dod_ssn                  8.000000
expire_flag              8.000000
row_id                   8.000000
subject_id               8.000000
hadm_id                  8.000000
admittime                8.000000
dischtime                8.000000
deathtime                8.000000
admission_type          65.816016
admission_location      80.732111
discharge_location      73.982317
insurance               64.564373
language                57.104031
religion                67.837940
marital_status          62.594770
ethnicity               66.687081
edregtime                8.000000
edouttime                8.000000
diagnosis               78.798154
hospital_expire_flag     8.000000
has_chartevents_data     8.000000
dtype: float64


In [14]:
data.head()

Unnamed: 0,gender,dob,dod,dod_hosp,dod_ssn,expire_flag,row_id,subject_id,hadm_id,admittime,...,insurance,language,religion,marital_status,ethnicity,edregtime,edouttime,diagnosis,hospital_expire_flag,has_chartevents_data
0,M,2088-08-07,2170-10-19,2170-10-19,2170-10-19,1,351,275,129886,2170-10-06 03:09:00,...,Medicare,RUSS,NOT SPECIFIED,MARRIED,WHITE,2170-10-05 17:56:00,2170-10-06 04:27:00,UROSEPSIS-CHANGE IN MENTAL STATUS,1,1
1,M,2088-08-07,2170-10-19,2170-10-19,2170-10-19,1,351,275,129886,2170-10-06 03:09:00,...,Medicare,RUSS,NOT SPECIFIED,MARRIED,WHITE,2170-10-05 17:56:00,2170-10-06 04:27:00,UROSEPSIS-CHANGE IN MENTAL STATUS,1,1
2,M,2088-08-07,2170-10-19,2170-10-19,2170-10-19,1,351,275,129886,2170-10-06 03:09:00,...,Medicare,RUSS,NOT SPECIFIED,MARRIED,WHITE,2170-10-05 17:56:00,2170-10-06 04:27:00,UROSEPSIS-CHANGE IN MENTAL STATUS,1,1
3,M,2088-08-07,2170-10-19,2170-10-19,2170-10-19,1,351,275,129886,2170-10-06 03:09:00,...,Medicare,RUSS,NOT SPECIFIED,MARRIED,WHITE,2170-10-05 17:56:00,2170-10-06 04:27:00,UROSEPSIS-CHANGE IN MENTAL STATUS,1,1
4,M,2088-08-07,2170-10-19,2170-10-19,2170-10-19,1,351,275,129886,2170-10-06 03:09:00,...,Medicare,RUSS,NOT SPECIFIED,MARRIED,WHITE,2170-10-05 17:56:00,2170-10-06 04:27:00,UROSEPSIS-CHANGE IN MENTAL STATUS,1,1


In [15]:
data.shape

(21197220, 25)

In [16]:
data.columns

Index(['gender', 'dob', 'dod', 'dod_hosp', 'dod_ssn', 'expire_flag', 'row_id',
       'subject_id', 'hadm_id', 'admittime', 'dischtime', 'deathtime',
       'admission_type', 'admission_location', 'discharge_location',
       'insurance', 'language', 'religion', 'marital_status', 'ethnicity',
       'edregtime', 'edouttime', 'diagnosis', 'hospital_expire_flag',
       'has_chartevents_data'],
      dtype='object')

In [18]:
minidata = data.sample(10)
minidata

Unnamed: 0,gender,dob,dod,dod_hosp,dod_ssn,expire_flag,row_id,subject_id,hadm_id,admittime,...,insurance,language,religion,marital_status,ethnicity,edregtime,edouttime,diagnosis,hospital_expire_flag,has_chartevents_data
11293353,M,2104-03-07,2165-07-06,2165-07-06,NaT,1,48324,66508,131118,2165-02-25 11:40:00,...,Private,ENGL,PROTESTANT QUAKER,MARRIED,WHITE,NaT,NaT,MULTIPLE MYELOMA\BONE MARROW TRANSPLANT,1,1
18673882,M,2091-07-24,NaT,NaT,NaT,0,56090,91031,135028,2159-11-07 15:06:00,...,Medicare,ENGL,PROTESTANT QUAKER,MARRIED,WHITE,NaT,NaT,ETOH CIRRHOSIS,0,1
16829378,M,2111-09-19,2138-08-19,2138-08-19,2138-08-19,1,51822,77471,118886,2137-04-13 13:54:00,...,Medicaid,CAMB,NOT SPECIFIED,SINGLE,ASIAN - CAMBODIAN,NaT,NaT,FEVER,0,1
12443886,M,2104-03-07,2165-07-06,2165-07-06,NaT,1,48324,66508,131118,2165-02-25 11:40:00,...,Private,ENGL,PROTESTANT QUAKER,MARRIED,WHITE,NaT,NaT,MULTIPLE MYELOMA\BONE MARROW TRANSPLANT,1,1
17092023,F,2115-11-10,2176-05-15,2176-05-15,2176-05-15,1,56186,91333,112508,2176-02-27 17:39:00,...,Medicare,ENGL,PROTESTANT QUAKER,DIVORCED,WHITE,NaT,NaT,RECURRENT LEUKEMIA,1,1
12717967,M,2104-03-07,2165-07-06,2165-07-06,NaT,1,48324,66508,131118,2165-02-25 11:40:00,...,Private,ENGL,PROTESTANT QUAKER,MARRIED,WHITE,NaT,NaT,MULTIPLE MYELOMA\BONE MARROW TRANSPLANT,1,1
11353363,M,2104-03-07,2165-07-06,2165-07-06,NaT,1,48324,66508,131118,2165-02-25 11:40:00,...,Private,ENGL,PROTESTANT QUAKER,MARRIED,WHITE,NaT,NaT,MULTIPLE MYELOMA\BONE MARROW TRANSPLANT,1,1
16575817,M,2055-06-23,NaT,NaT,NaT,0,51243,75779,123505,2128-08-06 14:01:00,...,Medicare,ENGL,CATHOLIC,MARRIED,WHITE,2128-08-06 10:23:00,2128-08-06 15:31:00,CONGESTIVE HEART FAILURE,0,1
1514341,F,2141-05-14,NaT,NaT,NaT,0,15247,12411,138791,2188-12-23 23:25:00,...,Medicare,ENGL,PROTESTANT QUAKER,SINGLE,BLACK/AFRICAN AMERICAN,2188-12-23 17:56:00,2188-12-24 00:15:00,ANEMIA,0,1
5159906,M,2052-08-20,NaT,NaT,NaT,0,33347,27280,106466,2105-12-22 12:27:00,...,Private,ENGL,NOT SPECIFIED,MARRIED,WHITE,2105-12-22 10:31:00,2105-12-22 14:19:00,SEPSIS,0,1


-----------------------------------------------------------------------

In [25]:
minidata.dtypes

gender                          object
dob                     datetime64[ns]
dod                     datetime64[ns]
dod_hosp                datetime64[ns]
dod_ssn                 datetime64[ns]
expire_flag                      int64
row_id                           int64
subject_id                       int64
hadm_id                          int64
admittime               datetime64[ns]
dischtime               datetime64[ns]
deathtime               datetime64[ns]
admission_type                  object
admission_location              object
discharge_location              object
insurance                       object
language                        object
religion                        object
marital_status                  object
ethnicity                       object
edregtime               datetime64[ns]
edouttime               datetime64[ns]
diagnosis                       object
hospital_expire_flag             int64
has_chartevents_data             int64
dtype: object

In [34]:
len(data) - (data['expire_flag']==1).sum()

8742260

In [27]:
minidata['edouttime'].isna().sum()

7

In [24]:
minidata['diff'] = minidata['edouttime'] - minidata['edregtime'].dt.minute

TypeError: cannot subtract float64-dtype from DatetimeArray

Our target variable is the time it takes from when a person is ready for discharge to when they are actually discharged. This needs to be engineered from our data.

## Exploratory Data Analysis

Now we want to make a correlation matrix of our data