In [8]:
import math
import numpy as np
import scipy
import scipy.stats as ss
import sklearn.cluster as skc
import matplotlib.pyplot as plt
import pandas
import time
from time import mktime
from datetime import datetime

#suppresses pandas indexing warning for cleaner output
#don't worry, I know what I'm doing :)
pandas.options.mode.chained_assignment = None  # default='warn'

admissions_import = pandas.read_csv('ADMISSIONS.csv')
patients_import = pandas.read_csv('PATIENTS.csv')

In [9]:
admissions = admissions_import.copy()
patients = patients_import.copy()
data = pandas.merge(admissions,patients,on='SUBJECT_ID',how='left')

In [10]:
#parses date input of the form "YYYY-MM-DD HH:MM:SS"
#converts it to a datetime
def parse_time(time_str):
    if (pandas.notnull(time_str)):
        temp_time = time.strptime(time_str, "%Y-%m-%d %H:%M:%S")
        return datetime.fromtimestamp(mktime(temp_time))
    else:
        return time_str
    
def sec_between(x,y):
    return (parse_time(y) - parse_time(x)).total_seconds()

def calc_staylength(row):
    return sec_between(row['ADMITTIME'],row['DISCHTIME'])

def calc_age(row):
    return sec_between(row['DOB'],row['ADMITTIME'])

def check_old(row):
    return 1 if(int((row['DOB'])[:4]) < 2000) else 0

In [11]:
data['ISOLD'] = data.apply(lambda row: check_old(row),axis=1)
data = data[data.ISOLD == 0]
data.reset_index(drop=True,inplace=True)
data['STAYLENGTH'] = data.apply(lambda row: calc_staylength(row),axis=1)
data['AGE'] = data.apply(lambda row: calc_age(row),axis=1)
data['GENDER'] = data['GENDER'].map({'F':0,'M':1})
data = data[['SUBJECT_ID','HADM_ID','ADMITTIME','AGE','STAYLENGTH','GENDER','EXPIRE_FLAG']]

In [16]:
features = data.copy()
features['NEXTVISIT']=np.nan

sec_in_day = (60*60*24)
large_offset = 10000

for i in range(len(features)-1):
    if(features.ix[i,'SUBJECT_ID'] == features.ix[i+1,'SUBJECT_ID']):
        features.ix[i,'NEXTVISIT'] = sec_between(features.ix[i,'ADMITTIME'],features.ix[i+1,'ADMITTIME'])
    else:
        features.ix[i,'NEXTVISIT'] = sec_in_day * large_offset
features.ix[len(features)-1,'NEXTVISIT'] = sec_in_day * large_offset
        
features = features[['AGE','STAYLENGTH','GENDER','EXPIRE_FLAG','NEXTVISIT']]

features['STAYLENGTH'] = features['STAYLENGTH'] / sec_in_day
features['AGE'] = features['AGE'] / sec_in_day
features['NEXTVISIT'] = features['NEXTVISIT'] / sec_in_day

In [15]:
features.shape

(56360, 5)

In [17]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

train_data = features[['AGE','STAYLENGTH','GENDER','EXPIRE_FLAG']]
staylength = features[['NEXTVISIT']]

def predict(train_data,predict,final):
    pred = LinearRegression()
    pred = pred.fit(train_data, staylength)
    print(list(train_data))
    print(pred.coef_)
        
predict(train_data,staylength,True)

['AGE', 'STAYLENGTH', 'GENDER', 'EXPIRE_FLAG']
[[ -3.04191846e-02  -9.59083297e+00   7.60880319e+01  -1.15029008e+03]]
