In [24]:
import math
import numpy as np
import scipy
import scipy.stats as ss
import sklearn.cluster as skc
import matplotlib.pyplot as plt
import pandas
import time
from time import mktime
from datetime import datetime

# Turn off pandas' chained-assignment warning so cell output stays readable.
# (The value this option normally carries is 'warn'.)
pandas.set_option('mode.chained_assignment', None)

# Read the five source tables; each CSV is loaded into its own DataFrame,
# in the same order as the names on the left-hand side.
admissions, patients, diagnoses_icd, d_icd_diagnoses, services = [
    pandas.read_csv(csv_path)
    for csv_path in (
        'data/ADMISSIONS.csv',
        'data/PATIENTS.csv',
        'data/DIAGNOSES_ICD.csv',
        'data/D_ICD_DIAGNOSES.csv',
        'data/SERVICES.csv',
    )
]

In [25]:
# Join the tables into one frame: admissions is left-joined with patients on
# SUBJECT_ID, and that result is left-joined with services on HADM_ID.
data = pandas.merge(
    pandas.merge(admissions, patients, how='left', on='SUBJECT_ID'),
    services,
    how='left',
    on='HADM_ID',
)

# Remove the columns that are not carried forward, then give the surviving
# subject-id column (suffixed '_x' by the second merge) its plain name back.
data = data.drop(['SUBJECT_ID_y', 'PREV_SERVICE', 'ROW_ID_x'], axis=1)
data = data.rename(columns={'SUBJECT_ID_x': 'SUBJECT_ID'})

# Independent copy of the merged frame, taken before any derived columns are
# added to `data`.  NOTE(review): not used in the cells visible here.
data_diagnoses_sparse = data.copy()

In [26]:
#parses date input of the form "YYYY-MM-DD HH:MM:SS"
#converts it to a datetime
def parse_time(time_str):
    """Convert a "YYYY-MM-DD HH:MM:SS" string into a naive ``datetime``.

    The string is parsed directly with ``datetime.strptime`` rather than
    being converted to an epoch timestamp and back.  The epoch round-trip
    made the result depend on the machine's local timezone/DST rules and
    could fail for dates outside the platform's ``time_t`` range.

    Parameters
    ----------
    time_str : str or null
        Timestamp text, or a missing value (``None`` / NaN).

    Returns
    -------
    datetime.datetime
        The parsed timestamp; missing input is returned unchanged.

    Raises
    ------
    ValueError
        If ``time_str`` is not null and does not match the expected format.
    """
    # Missing values pass straight through so callers can detect them.
    if pandas.isnull(time_str):
        return time_str
    return datetime.strptime(time_str, "%Y-%m-%d %H:%M:%S")
    
def sec_between(x,y):
    """Return the number of seconds from timestamp ``x`` to timestamp ``y``.

    Both arguments are passed through ``parse_time``.  The result is
    positive when ``y`` is later than ``x``.

    Parameters
    ----------
    x, y : str or null
        Start and end timestamps in the format ``parse_time`` accepts.

    Returns
    -------
    float
        ``(y - x)`` in seconds, or NaN when either timestamp is missing.
    """
    end = parse_time(y)
    start = parse_time(x)
    # parse_time hands missing values back unchanged; subtracting those would
    # raise TypeError, so report the gap as missing instead.
    if pandas.isnull(start) or pandas.isnull(end):
        return float('nan')
    return (end - start).total_seconds()

def calc_staylength(row):
    """Seconds elapsed between a row's ADMITTIME and its DISCHTIME.

    ``row`` is anything indexable by the keys 'ADMITTIME' and 'DISCHTIME'
    (e.g. a DataFrame row); the value comes from ``sec_between``.
    """
    admitted = row['ADMITTIME']
    discharged = row['DISCHTIME']
    return sec_between(admitted, discharged)

def calc_age(row):
    """Seconds elapsed between a row's DOB and its ADMITTIME.

    Note that the result is expressed in seconds (as returned by
    ``sec_between``), not in years.
    """
    born = row['DOB']
    admitted = row['ADMITTIME']
    return sec_between(born, admitted)

In [None]:
# Replace the index with a fresh 0..n-1 range, discarding the old one.
data.reset_index(inplace=True, drop=True)

# Per-row stay length, as returned by calc_staylength (seconds).
data['STAYLENGTH'] = data.apply(calc_staylength, axis=1)

# Encode gender numerically: 'F' -> 0, 'M' -> 1; any other value becomes NaN.
data['GENDER'] = data['GENDER'].map(dict(F=0, M=1))

In [None]:
# NEXTVISIT: seconds from this row's ADMITTIME to the ADMITTIME of the next
# row, when the next row belongs to the same SUBJECT_ID.  Rows with no such
# successor (including the last row) get a large sentinel instead.
#
# The column is built from plain Python lists and assigned once.  This avoids
# DataFrame.ix (removed in pandas 1.0), does not depend on the index labels
# being 0..n-1, and leaves an empty frame empty.
sec_in_day = (60*60*24)
large_offset = 10000

subject_ids = list(data['SUBJECT_ID'])
admit_times = list(data['ADMITTIME'])
n_rows = len(subject_ids)

# Sentinel meaning "no later admission for this subject" (10000 days, in seconds).
no_next_visit = float(sec_in_day * large_offset)

next_visit = []
for i in range(n_rows):
    followed_by_same_subject = (
        i + 1 < n_rows and subject_ids[i] == subject_ids[i + 1]
    )
    if followed_by_same_subject:
        next_visit.append(sec_between(admit_times[i], admit_times[i + 1]))
    else:
        next_visit.append(no_next_visit)

data['NEXTVISIT'] = np.asarray(next_visit, dtype=float)

# Convert STAYLENGTH from seconds to days (NEXTVISIT stays in seconds).
# NOTE: this line is not idempotent -- re-running the cell divides again.
data['STAYLENGTH'] = data['STAYLENGTH'] / sec_in_day

In [None]:
# Persist the assembled frame as UTF-8 CSV in the working directory.
data.to_csv(path_or_buf='concatenated_data.csv', encoding='utf-8')