# Master's Thesis Analysis

Predicting discharge times of patients??

#### Author: Derek Lee

## Load Requisite Libraries and authenticate to local Postgres database

In [1]:
import pandas as pd
import psycopg2
import sqlalchemy
import time
import psutil
import numpy as np
import multiprocessing as mp

In [None]:
# Check the number of cores and memory usage
num_cores = mp.cpu_count()
print("This kernel has ",num_cores,"cores and you can find the information regarding the memory usage:",psutil.virtual_memory())

In [2]:
%load_ext sql
from sqlalchemy import create_engine

In [3]:
# Format
# engine = create_engine('dialect+driver://username:password@host:port/database')
# Example format
engine = create_engine('postgresql://postgres:postgres@localhost:5432/mimic')

In [4]:
import json
json_file = open("secrets.json")
variables = json.load(json_file)
json_file.close()

import os
os.environ['POSTGRES_USERNAME'] = variables['POSTGRES_USERNAME']
os.environ['POSTGRES_PASSWORD'] = variables['POSTGRES_PASSWORD']
os.environ['POSTGRES_ADDRESS'] = variables['POSTGRES_ADDRESS']
os.environ['POSTGRES_PORT'] = variables['POSTGRES_PORT']
os.environ['POSTGRES_DBNAME'] = variables['POSTGRES_DBNAME']

In [5]:
# A long string that contains the necessary Postgres login information 
postgres_str = ('postgresql://{username}:{password}@{ipaddress}:{port}/{dbname}'
                .format(username=os.getenv('POSTGRES_USERNAME'),
                        password=os.getenv('POSTGRES_PASSWORD'),
                        ipaddress=os.getenv('POSTGRES_ADDRESS'),
                        port=os.getenv('POSTGRES_PORT'),
                        dbname=os.getenv('POSTGRES_DBNAME'))) 
# Create the connection 
engine = create_engine(postgres_str)

-------------------------------------

### Note: Large Dataset:
Running below will take about 5 minutes because it's a dataset with approx 20 million rows

In [None]:
from datetime import datetime 

start_time = datetime.now() 

import time

data = pd.read_sql('''SELECT * FROM public."thesis_data";''', con=engine, chunksize = 1000)


# data = pd.read_sql('''SELECT * FROM public."thesis_data";''', con=engine, chunksize=1000000)


print('Time elapsed (hh:mm:ss.ms) {}'.format(datetime.now() - start_time))

In [None]:
data

In [6]:
from datetime import datetime 

start_time = datetime.now() 

i = 0
dflist = []
dfs=pd.DataFrame()
sql = '''SELECT * FROM public."thesis_data"'''
for chunk in pd.read_sql_query(sql , engine, chunksize=100):
    if i <101:
        dflist.append(chunk)
        i+=1
    else:
        break
        
dfs = pd.concat(dflist, ignore_index=True)

print('Time elapsed (hh:mm:ss.ms) {}'.format(datetime.now() - start_time))

Time elapsed (hh:mm:ss.ms) 0:05:28.074764


In [7]:
dfs.sample(100)

Unnamed: 0,gender,dob,dod,dod_hosp,dod_ssn,expire_flag,row_id,subject_id,hadm_id,admittime,...,cpt_number,cpt_suffix,sectionheader,subsectionheader,description,transfertime,prev_service,curr_service,seq_num,icd9_code
666,M,2047-04-04,2135-02-08 00:00:00,2135-02-08 00:00:00,2135-02-08 00:00:00,1,19,21,109451,2134-09-11 12:17:00,...,90937,,Medicine,Dialysis,,2134-09-11 12:18:39,,CMED,11,9904
8407,F,2117-08-07,2142-08-30 00:00:00,2142-08-30 00:00:00,2142-08-30 00:00:00,1,129,109,126055,2141-10-13 23:10:00,...,99233,,Evaluation and management,Hospital inpatient services,,2141-10-27 10:57:57,MED,SURG,1,5411
5371,M,2076-05-13,2133-09-30 00:00:00,,2133-09-30 00:00:00,1,42,41,101757,2132-12-31 10:30:00,...,31624,,Surgery,Respiratory system,,2133-01-12 22:44:32,NMED,MED,6,3326
3814,M,2076-05-13,2133-09-30 00:00:00,,2133-09-30 00:00:00,1,42,41,101757,2132-12-31 10:30:00,...,99232,,Evaluation and management,Hospital inpatient services,,2133-01-12 15:51:03,MED,NMED,3,4311
8806,F,2117-08-07,2142-08-30 00:00:00,2142-08-30 00:00:00,2142-08-30 00:00:00,1,129,109,126055,2141-10-13 23:10:00,...,99254,,Evaluation and management,Consultations,,2141-10-27 10:57:57,MED,SURG,6,9904
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5306,M,2076-05-13,2133-09-30 00:00:00,,2133-09-30 00:00:00,1,42,41,101757,2132-12-31 10:30:00,...,99232,,Evaluation and management,Hospital inpatient services,,2133-01-12 15:51:03,MED,NMED,5,9671
3563,M,2076-05-13,2133-09-30 00:00:00,,2133-09-30 00:00:00,1,42,41,101757,2132-12-31 10:30:00,...,99232,,Evaluation and management,Hospital inpatient services,,2133-01-12 22:44:32,NMED,MED,1,0124
486,M,2047-04-04,2135-02-08 00:00:00,2135-02-08 00:00:00,2135-02-08 00:00:00,1,19,21,109451,2134-09-11 12:17:00,...,99291,,Evaluation and management,Critical care services,,2134-09-11 12:18:39,,CMED,5,0042
2225,M,2076-05-13,2133-09-30 00:00:00,,2133-09-30 00:00:00,1,42,41,101757,2132-12-31 10:30:00,...,99291,,Evaluation and management,Critical care services,,2133-01-10 12:57:06,NSURG,MED,10,9904


In [None]:
data.columns

In [None]:
minidata = data.sample(10000)
minidata

-----------------------------------------------------------------------

In [None]:
minidata.dtypes

In [None]:
len(data) - (data['expire_flag']==1).sum()

In [None]:
minidata['edouttime'].isna().sum()

In [None]:
minidata['diff'] = minidata['edouttime'] - minidata['edregtime'].dt.minute

Our target variable is the time it takes from when a person is ready for discharge to when they are actually discharged. This needs to be engineered from our data.

## Exploratory Data Analysis

Now we want to make a correlation matrix of our data