In [1]:
import sys
import numpy as np
import pandas as pd
from pickle import dump
import re

In [2]:
def loadS(dataDirectory):
    """Read the state sequences stored in ``<dataDirectory>/S.txt``.

    Every line of the file is a comma-separated record. The first field is
    skipped; each remaining field is a ``time:state`` pair of integers.
    Pairs whose time is negative are ignored.

    Parameters:
        dataDirectory: path of the directory that contains ``S.txt``.

    Returns:
        A tuple ``(S, obs_jumps, T)`` of NumPy arrays where
        ``S`` holds the state of every kept pair (all lines concatenated),
        ``obs_jumps`` holds, for every kept pair, its time minus the time of
        the previous kept pair on the same line (0 is used as the previous
        time for the first kept pair of a line), and
        ``T`` holds the number of kept pairs on each line.

    Raises:
        ValueError: if a field is not of the form ``int:int``.
    """
    S = []
    obs_jumps = []
    T = []
    with open(dataDirectory+'/S.txt') as sFile:
        for line in sFile:
            tval = 0
            prevtime = 0
            # Strip only the line terminator. Dropping the last character
            # unconditionally would eat a digit of the last state when the
            # final line of the file has no trailing newline.
            for pair in line.rstrip('\r\n').split(',')[1:]:
                time,sval = pair.split(':')
                time = int(time)
                sval = int(sval)
                if time >= 0:
                    tval += 1
                    obs_jumps.append(time-prevtime)
                    prevtime = time
                    S.append(sval)
            T.append(tval)
    # The with-block has already closed the file; no explicit close() needed.
    return np.asarray(S), np.asarray(obs_jumps), np.asarray(T)

In [3]:
def loadAnchors(dataDirectory):
    """Build the list of anchor feature indices for every comorbidity.

    ``<dataDirectory>/fid.txt`` holds one feature identifier per line
    (presumably ICD-9 codes, going by the file and variable names); the
    0-based line number of an identifier is its index.

    ``<dataDirectory>/anchor_icd9.csv`` holds one comorbidity per line: its
    name followed by comma-separated patterns. Each pattern is applied with
    ``re.search`` to every identifier, and the index of every identifier it
    matches is recorded for that comorbidity.

    Parameters:
        dataDirectory: path of the directory that contains both files.

    Returns:
        A tuple ``(anchors, comorbidityNames)``. ``anchors`` is a list of
        ``(lineNumber, [featureIndex, ...])`` tuples, one per line of the
        anchor file; ``comorbidityNames`` is the list of first fields.

    Raises:
        re.error: if a pattern is not a valid regular expression.
    """
    icd9Map = {}
    with open(dataDirectory+'/fid.txt') as mapFile:
        for i,icd9 in enumerate(mapFile):
            icd9Map[icd9.strip()] = i
    comorbidityNames = []
    anchors = []
    with open(dataDirectory+'/anchor_icd9.csv') as anchorFile:
        for i,line in enumerate(anchorFile):
            text = line.strip().split(',')
            comorbidityNames.append(text[0])
            comorbAnchors = []
            for codeStr in text[1:]:
                if not codeStr:
                    # An empty field (e.g. from a trailing comma) is an empty
                    # pattern, which would match every identifier.
                    continue
                pattern = re.compile(codeStr)
                for key,index in icd9Map.items():
                    # Record the index of the identifier that matched. Looking
                    # the index up by the matched substring fails with
                    # KeyError (or hits another identifier) whenever the
                    # pattern covers only part of the identifier.
                    if pattern.search(key) is not None:
                        comorbAnchors.append(index)
            anchors.append((i,comorbAnchors))
    return anchors,comorbidityNames

In [4]:
# Input and output directories, set together so they are easy to retarget.
# 'small_sample' is the alternative output directory that was left disabled.
dataDirectory, outDirectory = 'test_small', 'small_random_sample'

In [5]:
# Read the parameter files and the state sequences from dataDirectory.
Q = np.loadtxt('/'.join([dataDirectory, 'Q.txt']))
pi = np.loadtxt('/'.join([dataDirectory, 'pi.txt']))
S, obs_jumps, T = loadS(dataDirectory)
# B0 has shape (K, M)
B0 = np.loadtxt('/'.join([dataDirectory, 'piB.txt']))
# B has shape (K, M)
B = np.loadtxt('/'.join([dataDirectory, 'B.txt']))
Z = np.loadtxt('/'.join([dataDirectory, 'Z.txt']))
L = np.loadtxt('/'.join([dataDirectory, 'L.txt']))
anchors, comorbidityNames = loadAnchors(dataDirectory)

# Sizes: nObs entries in S, N sequences (entries in T), M entries in pi,
# and Z is K-by-D.
nObs, N, M = S.shape[0], T.shape[0], pi.shape[0]
K, D = Z.shape

# Offset of each sequence's first entry inside S: the running total of the
# sequence lengths shifted right by one, with 0 for the first sequence.
zeroIndices = np.roll(np.cumsum(T), 1)
zeroIndices[0] = 0

In [6]:
#DES Random inputs
# Each quantity is drawn right after its own seed (ranSeed, ranSeed+1, ...),
# so the order of the seed/draw pairs below must not be changed.
ranSeed = 144

# Permute L in place along its first axis.
np.random.seed(ranSeed)
np.random.shuffle(L)

# Z (K, D): uniform draws from [0, 1).
np.random.seed(ranSeed + 1)
Z = np.random.rand(K, D)

# B (K, M): uniform draws from [0, 1).
np.random.seed(ranSeed + 2)
B = np.random.rand(K, M)

# B0 (K, M): uniform draws from [0, 1), each row sorted ascending.
np.random.seed(ranSeed + 3)
B0 = np.sort(np.random.rand(K, M), axis=1)

# pi: M uniform draws scaled by (1 - 0.001*M) and shifted by 0.001*M,
# then normalised so the entries sum to 1.
np.random.seed(ranSeed + 4)
pi = np.random.rand(M)*(1-M*0.001)+0.001*M
pi /= pi.sum()

np.random.seed(ranSeed + 5)
Qvals = np.random.rand(M - 1)
# NOTE(review): Qvals is drawn but nothing in this cell writes it into Q,
# so Q is an all-zero M-by-M matrix here -- confirm this is intended.
Q = np.zeros([M, M])

In [7]:
# Seed NumPy's global generator so the sampling below is repeatable. The
# result depends on the exact order of the random draws in this cell.
np.random.seed(39393)
#Compute X
# X has one row per entry of S and one column per k in range(K). Every cell
# starts at 0 and is switched to 1 by the sampling below.
X = np.zeros((nObs,K))
# Rows at zeroIndices are the first entry of each sequence. Cell (n, k) of
# that slice is a Bernoulli draw with success probability
# B0[k, S[zeroIndices[n]]].
X[zeroIndices] = np.random.binomial(n=1,p=B0[:,S[zeroIndices]].T)
for k in range(K):
    for n in range(N):
        n0 = zeroIndices[n]  # row of the first entry of sequence n
        if X[n0,k] == 1:
            # Already 1 at the first entry: keep it 1 for all T[n] rows.
            X[zeroIndices[n]:(zeroIndices[n]+T[n]),k] = 1
        else:
            # changed[t-1] is S at step t minus S at step t-1 of this sequence.
            changed = np.diff(S[zeroIndices[n]:(zeroIndices[n]+T[n])])
            for t in range(1,T[n]):
                # Only steps where S rose by exactly 1 are candidates. `and`
                # short-circuits, so a uniform number is drawn only for those
                # steps; it is compared against B[k, S[n0+t]].
                if changed[t-1]==1 and np.random.rand()<B[k,S[n0+t]]:
                        # Switch on from step t to the end of the sequence,
                        # then stop scanning this sequence.
                        X[(n0+t):(zeroIndices[n]+T[n]),k] = 1
                        break

#Compute O
# Per-row probability of each of the D columns: one minus the product of
# (1 - L) and, over k, of (1 - X[:, k] * Z[k, :]).
# (A linear alternative, X.dot(Z), was left disabled.)
pO = 1. - (1. - L) * np.prod(1. - X[:, :, None] * Z[None, :, :], axis=1)
# One Bernoulli draw per (row, column).
OCount = np.random.binomial(n=1, p=pO)
# Replace each drawn 1 by its 0-based column index and each 0 by -1.
O = OCount * np.arange(1, D + 1) - 1
# Order every row descending, so the indices come first and the -1 fill last.
O = -np.sort(-O, axis=1)

#Compute pi from actual start times
# Count how often each state is the first entry of a sequence and normalise.
# minlength=M keeps one entry per state even when the highest-numbered
# states never occur as a first entry; a bare bincount would return a
# shorter vector in that case, no longer matching the M-sized parameters.
pi = np.bincount(S[zeroIndices], minlength=M).astype(float)
pi /= pi.sum()

#Clean up types
# Store the states as 32-bit integers and the 0/1 matrix X as 8-bit integers.
S = S.astype(np.int32)
X = X.astype(np.int8)

In [8]:
#Write pickled files
# Each object is written to its own file, <outDirectory>/<name>.pkl.
# variables and names are parallel lists and must stay in the same order.
variables = [Q,pi,S,T,obs_jumps,B0,B,X,Z,L,O,anchors,comorbidityNames]
names = ['Q','pi','S','T','obs_jumps','B0','B','X','Z','L','O','anchors','comorbidityNames']
for var,name in zip(variables,names):
    # The with-block closes the file even if dump() raises part-way through,
    # so a failed write does not leak an open handle.
    with open(outDirectory+'/'+name+'.pkl','wb') as outfile:
        dump(var,outfile)