# Master's Thesis Analysis

Predicting patient discharge times.

#### Author: Derek Lee

## Load requisite libraries and authenticate to the local Postgres database

In [6]:
import pandas as pd
import psycopg2
import sqlalchemy
import time
import psutil
import numpy as np
import multiprocessing as mp

In [7]:
# Report how many CPU cores this kernel sees, plus a snapshot of system memory
num_cores = mp.cpu_count()
print(
    "This kernel has ",
    num_cores,
    "cores and you can find the information regarding the memory usage:",
    psutil.virtual_memory(),
)

This kernel has  8 cores and you can find the information regarding the memory usage: svmem(total=17179869184, available=6146654208, percent=64.2, used=9423228928, free=62337024, active=6117199872, inactive=6080483328, wired=3306029056)


In [2]:
%load_ext sql
from sqlalchemy import create_engine

In [3]:
# SQLAlchemy connection URL format:
#   dialect+driver://username:password@host:port/database
# Example (placeholder values only; never put real credentials in the notebook):
#   postgresql://<username>:<password>@localhost:5432/mimic
# No engine is created in this cell: the engine used by the rest of the
# notebook is built further down from the settings read from secrets.json,
# so a hardcoded username/password is not needed here.

In [4]:
import json
import os

# Read the Postgres connection settings from secrets.json.
# The context manager closes the file even if json.load raises.
with open("secrets.json") as json_file:
    variables = json.load(json_file)

# Export each setting as an environment variable so the connection cell can
# read it back. os.environ only accepts str values, so every value is cast
# (a port stored as a JSON number would otherwise raise TypeError).
for _setting in ('POSTGRES_USERNAME',
                 'POSTGRES_PASSWORD',
                 'POSTGRES_ADDRESS',
                 'POSTGRES_PORT',
                 'POSTGRES_DBNAME'):
    os.environ[_setting] = str(variables[_setting])

In [5]:
from urllib.parse import quote

# Build the Postgres connection URL from the POSTGRES_* environment variables.
# os.environ[...] raises KeyError for a missing setting instead of letting the
# literal text "None" end up inside the URL.
# The username and password are percent-encoded so that reserved characters
# such as '@', ':' or '/' cannot be mistaken for URL delimiters.
postgres_str = ('postgresql://{username}:{password}@{ipaddress}:{port}/{dbname}'
                .format(username=quote(os.environ['POSTGRES_USERNAME'], safe=''),
                        password=quote(os.environ['POSTGRES_PASSWORD'], safe=''),
                        ipaddress=os.environ['POSTGRES_ADDRESS'],
                        port=os.environ['POSTGRES_PORT'],
                        dbname=os.environ['POSTGRES_DBNAME']))
# Create the SQLAlchemy engine from the URL
engine = create_engine(postgres_str)

-------------------------------------

### Note: Large Dataset
Running the cell below will take several minutes because the dataset has approximately 20 million rows.

In [8]:
# Load every row and column of public."thesis_data" into a DataFrame via the engine
data = pd.read_sql(
    sql='''SELECT * FROM public."thesis_data";''',
    con=engine,
)

In [9]:
# Preview the first five rows of the loaded table
data.head(n=5)

Unnamed: 0,gender,dob,dod,dod_hosp,dod_ssn,expire_flag,row_id,subject_id,hadm_id,admittime,...,insurance,language,religion,marital_status,ethnicity,edregtime,edouttime,diagnosis,hospital_expire_flag,has_chartevents_data
0,M,2088-08-07,2170-10-19,2170-10-19,2170-10-19,1,351,275,129886,2170-10-06 03:09:00,...,Medicare,RUSS,NOT SPECIFIED,MARRIED,WHITE,2170-10-05 17:56:00,2170-10-06 04:27:00,UROSEPSIS-CHANGE IN MENTAL STATUS,1,1
1,M,2088-08-07,2170-10-19,2170-10-19,2170-10-19,1,351,275,129886,2170-10-06 03:09:00,...,Medicare,RUSS,NOT SPECIFIED,MARRIED,WHITE,2170-10-05 17:56:00,2170-10-06 04:27:00,UROSEPSIS-CHANGE IN MENTAL STATUS,1,1
2,M,2088-08-07,2170-10-19,2170-10-19,2170-10-19,1,351,275,129886,2170-10-06 03:09:00,...,Medicare,RUSS,NOT SPECIFIED,MARRIED,WHITE,2170-10-05 17:56:00,2170-10-06 04:27:00,UROSEPSIS-CHANGE IN MENTAL STATUS,1,1
3,M,2088-08-07,2170-10-19,2170-10-19,2170-10-19,1,351,275,129886,2170-10-06 03:09:00,...,Medicare,RUSS,NOT SPECIFIED,MARRIED,WHITE,2170-10-05 17:56:00,2170-10-06 04:27:00,UROSEPSIS-CHANGE IN MENTAL STATUS,1,1
4,M,2088-08-07,2170-10-19,2170-10-19,2170-10-19,1,351,275,129886,2170-10-06 03:09:00,...,Medicare,RUSS,NOT SPECIFIED,MARRIED,WHITE,2170-10-05 17:56:00,2170-10-06 04:27:00,UROSEPSIS-CHANGE IN MENTAL STATUS,1,1


In [11]:
# Show the (row count, column count) of the loaded DataFrame
data.shape

(21197220, 25)

-----------------------------------------------------------------------

## Exploratory Data Analysis

Our target variable is