In [123]:
#encoding=utf-8
import sys
import os
import time
import numpy as np
import theano
import theano.typed_list
import theano.tensor as T
import matplotlib
import matplotlib.pyplot as plt
from sklearn import cross_validation
from IPython.display import Image
from IPython.display import SVG

%matplotlib inline

In [2]:
_LEARNING_RATE = float(os.environ.get('LEARNING_RATE', '0.005'))

In [3]:
patient_to_records = {}
disease_set = set()
infile = open("../data/patient_visit_record.csv","rb")
for row in infile:    
    row = row.strip().decode('utf-8')
    items = row.split(',')
    if len(items) != 3: continue
    pid = int(items[0])
    disease = items[1]
    visit_date_time = items[2]
    #unknown data format exception
    if u"0001-01-01" in visit_date_time:
        continue
    visit_date = visit_date_time.split()[0]
    visit_time = visit_date_time.split()[1]
    visit_date_format = time.strptime(visit_date, "%Y-%m-%d")
    patient_to_records.setdefault(pid,{}).setdefault(visit_date_format,[]).append(disease)
    disease_set.add(disease)
print "patient size: "+str(len(patient_to_records.keys()))

patient size: 141007


In [4]:
disease_to_index = {}
index_to_disease = {}
disease_index = 0
for disease in disease_set:
    disease_to_index[disease] = disease_index
    index_to_disease[disease_index] = disease
    disease_index += 1
disease_size = len(disease_to_index.keys())
print "disease size: "+str(disease_size)

disease size: 4006


In [5]:
datas = []
targets = []
MIN_VISIT_NUM = 2
#one interval_unit represents k days, here we define k = 7 which is a week
interval_unit = 7 
for patient in patient_to_records.keys():
    records = []
    records_sorted = sorted(patient_to_records[patient].items(),key=lambda p:p[0],reverse=False)
    pre_visit_date = None
    for record in records_sorted:
        visit_date = record[0]
        visit_day_index = (visit_date[0] - 2012)*365 + (visit_date[7])
        diseases = record[1]
        disease_index_list = [disease_to_index[disease] for disease in diseases] 
        visit_index_unitize = int(visit_day_index / interval_unit)
        records.append(disease_index_list)
        pre_visit_date = visit_date
    if len(records) < MIN_VISIT_NUM: continue
    datas.append(records[:-1])
    targets.append(records[1:])

In [6]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(datas, targets, test_size=0.1, random_state=0)
print "train size: "+str(len(X_train))
print "test size: "+str(len(X_test))

train size: 125703
test size: 13967


In [131]:
class RNNTheano:
    def __init__(self, feature_dim, hidden_dim=100, bptt_truncate=4):
        # Assign instance variables
        self.feature_dim = feature_dim
        self.hidden_dim = hidden_dim
        self.bptt_truncate = bptt_truncate
        # Randomly initialize the network parameters
        U = np.random.uniform(-np.sqrt(1./feature_dim), np.sqrt(1./feature_dim), (hidden_dim, feature_dim))
        V = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (feature_dim, hidden_dim))
        W = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (hidden_dim, hidden_dim))
        # Theano: Created shared variables
        self.U = theano.shared(name='U', value=U.astype(theano.config.floatX))
        self.V = theano.shared(name='V', value=V.astype(theano.config.floatX))
        self.W = theano.shared(name='W', value=W.astype(theano.config.floatX)) 
        # We store the Theano graph here
        self.theano = {}
        self.__theano_build__()
    
    def __theano_build__(self): 
        U, V, W = self.U, self.V, self.W
        x = T.imatrix('x')
        y = T.imatrix('y')
        
        def forward_prop_step(x_t, s_t_prev, U, V, W):
            s_t = T.tanh(T.sum(U[:,x_t],axis=1) + W.dot(s_t_prev))
            o_t = T.nnet.softmax(V.dot(s_t))[0]
            return [o_t, s_t]
        
        [o,s], updates = theano.scan(
            forward_prop_step,
            sequences=x,
            outputs_info=[None, dict(initial=T.zeros(self.hidden_dim))],
            non_sequences=[U, V, W],
            truncate_gradient=self.bptt_truncate,
            strict=True
        )
        
        o_error, updates = theano.scan(
            lambda y_t,o_t: T.sum(-1 * T.log(o_t[y_t])),
            sequences=[y,o]
        )

        self.forward_propagation = theano.function([x], o)
        self.ce_error = theano.function([x,y], o_error)

In [132]:
model = RNNTheano(disease_size, hidden_dim=100)

In [133]:
print X_train[23]
print y_train[23]
model.ce_error(X_train[23],[[1, 3], [4, 5]])

[[3830], [2635], [339]]
[[2635], [339], [1946, 3212]]


array([ 16.59121603,  16.58902021])