In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline
import statistics
import matplotlib.pyplot as plt

In [2]:
feature_table = pd.read_csv('./data/feature_with_label_all_48H.csv')

In [3]:
patients_only = feature_table.loc[feature_table['bin_num'] == 1]

### SQL Database

In [4]:
import sqlite3 # library for working with sqlite database
conn = sqlite3.connect("./data/MIMIC.db") # Create a connection to the on-disk database

In [5]:
pd.read_sql("SELECT * FROM sqlite_master where type='table'", conn)

Unnamed: 0,type,name,tbl_name,rootpage,sql
0,table,CHARTEVENTS,CHARTEVENTS,2,"CREATE TABLE CHARTEVENTS(\n ""ROW_ID"" TEXT,\n ..."
1,table,PROCEDUREEVENTS_MV,PROCEDUREEVENTS_MV,9120960,"CREATE TABLE ""PROCEDUREEVENTS_MV"" (\n""index"" I..."
2,table,CALLOUT,CALLOUT,9132354,"CREATE TABLE ""CALLOUT"" (\n""index"" INTEGER,\n ..."
3,table,D_CPT,D_CPT,9133903,"CREATE TABLE ""D_CPT"" (\n""index"" INTEGER,\n ""R..."
4,table,D_ITEMS,D_ITEMS,9133909,"CREATE TABLE ""D_ITEMS"" (\n""index"" INTEGER,\n ..."
5,table,CAREGIVERS,CAREGIVERS,9134171,"CREATE TABLE ""CAREGIVERS"" (\n""index"" INTEGER,\..."
6,table,MICROBIOLOGYEVENTS,MICROBIOLOGYEVENTS,9134246,"CREATE TABLE ""MICROBIOLOGYEVENTS"" (\n""index"" I..."
7,table,LABEVENTS,LABEVENTS,9153054,"CREATE TABLE ""LABEVENTS"" (\n""index"" INTEGER,\n..."
8,table,INPUTEVENTS_CV,INPUTEVENTS_CV,9708019,"CREATE TABLE ""INPUTEVENTS_CV"" (\n""index"" INTEG..."
9,table,ADMISSIONS,ADMISSIONS,10291267,"CREATE TABLE ""ADMISSIONS"" (\n""index"" INTEGER,\..."


In [6]:
admissions_df = pd.read_sql("""SELECT *
                            FROM admissions
                            """, conn)

In [7]:
patients_df = pd.read_sql("""SELECT *
                            FROM patients
                            """, conn)

In [8]:
# Join admissions and patient
patient_admissions_df = pd.read_sql("""SELECT *
            FROM admissions 
            LEFT JOIN patients ON admissions.SUBJECT_ID = patients.SUBJECT_ID""", conn)

In [9]:
patient_admissions_df

Unnamed: 0,index,ROW_ID,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,...,HAS_CHARTEVENTS_DATA,index.1,ROW_ID.1,SUBJECT_ID.1,GENDER,DOB,DOD,DOD_HOSP,DOD_SSN,EXPIRE_FLAG
0,0,21,22,165315,2196-04-09 12:26:00,2196-04-10 15:54:00,,EMERGENCY,EMERGENCY ROOM ADMIT,DISC-TRAN CANCER/CHLDRN H,...,1,240,19,22,F,2131-05-07 00:00:00,,,,0
1,1,22,23,152223,2153-09-03 07:15:00,2153-09-08 19:10:00,,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME HEALTH CARE,...,1,241,20,23,M,2082-07-17 00:00:00,,,,0
2,2,23,23,124321,2157-10-18 19:34:00,2157-10-25 14:00:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME HEALTH CARE,...,1,241,20,23,M,2082-07-17 00:00:00,,,,0
3,3,24,24,161859,2139-06-06 16:14:00,2139-06-09 12:48:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME,...,1,242,21,24,M,2100-05-31 00:00:00,,,,0
4,4,25,25,129635,2160-11-02 02:06:00,2160-11-05 14:55:00,,EMERGENCY,EMERGENCY ROOM ADMIT,HOME,...,1,243,22,25,M,2101-11-21 00:00:00,,,,0
5,5,26,26,197661,2126-05-06 15:16:00,2126-05-13 15:00:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME,...,1,244,23,26,M,2054-05-04 00:00:00,2128-02-25 00:00:00,,2128-02-25 00:00:00,1
6,6,27,27,134931,2191-11-30 22:16:00,2191-12-03 14:45:00,,NEWBORN,PHYS REFERRAL/NORMAL DELI,HOME,...,1,245,24,27,F,2191-11-30 00:00:00,,,,0
7,7,28,28,162569,2177-09-01 07:15:00,2177-09-06 16:00:00,,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME HEALTH CARE,...,1,246,25,28,M,2103-04-15 00:00:00,,,,0
8,8,29,30,104557,2172-10-14 14:17:00,2172-10-19 14:37:00,,URGENT,TRANSFER FROM HOSP/EXTRAM,HOME HEALTH CARE,...,1,247,26,30,M,1872-10-14 00:00:00,,,,0
9,9,30,31,128652,2108-08-22 23:27:00,2108-08-30 15:00:00,2108-08-30 15:00:00,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,DEAD/EXPIRED,...,1,248,27,31,M,2036-05-17 00:00:00,2108-08-30 00:00:00,2108-08-30 00:00:00,2108-08-30 00:00:00,1


In [10]:
# Pull the admissions that we have in our 48 hour table
used_patients_admissions = pd.DataFrame(patient_admissions_df.loc[patient_admissions_df['HADM_ID'].isin(patients_only['HADM_ID'].values)])

In [11]:
percent_admissions_used = used_patients_admissions.shape[0]/admissions_df.shape[0] * 100
percent_admissions_used

92.14426207270753

In [12]:
# Number of patients in our 48 hour table
number_patients = used_patients_admissions['SUBJECT_ID'].iloc[:,1].value_counts().shape[0]

In [13]:
number_patients

43175

In [14]:
# Create a patient df with only the patients in our 48 hour table
used_patients = patients_df.loc[patients_df['SUBJECT_ID'].isin(used_patients_admissions['SUBJECT_ID'].iloc[:,1].values)]

In [15]:
percent_male = (used_patients['GENDER'].value_counts()[0] / number_patients) * 100
percent_male

56.18760856977417

In [19]:
# Make an age column (age at admit)
used_patients_admissions['DOB'] = pd.to_datetime(used_patients_admissions['DOB'])
used_patients_admissions['ADMITTIME'] = pd.to_datetime(used_patients_admissions['ADMITTIME'])
used_patients_admissions['AGE'] = ((used_patients_admissions['ADMITTIME'] - used_patients_admissions['DOB']) / 365)

OverflowError: Overflow in int64 addition

In [None]:
used_patients_admissions[used_patients_admissions['AGE'] >= 16]['AGE'].describe()

In [None]:
# Subject IDs of patients over 16
patients_16over = used_patients_admissions[used_patients_admissions['AGE'] >= 16]['SUBJECT_ID'].iloc[:,1].values

In [None]:
# HADM_IDs of patients over 16 years of age
HADM_ID_16_over = used_patients_admissions.loc[used_patients_admissions['SUBJECT_ID'].iloc[:,1].isin(patients_16over)]['HADM_ID'].values

In [None]:
# How many patients do we have that are over 16 years old?
num_patients_16_over = used_patients_admissions[used_patients_admissions['AGE'] >= 16]['SUBJECT_ID'].iloc[:,1].value_counts().shape[0]

In [None]:
num_patients_16_over

In [None]:
# Take only the HADM_IDs from the 48 hour table of patients over 16
feature_table[feature_table['HADM_ID'].isin(HADM_ID_16_over)]
