In [9]:
# import modules for math and distributions
from math import exp
import numpy as np
from scipy.stats import gamma, norm, uniform, bernoulli

In [10]:
# define inverse logit function
def logit_1(x): return(exp(x)/(1 + exp(x)))

In [11]:
# Initial set up according to HD2012
T = 40 # time periods
k = 5 # check-up times
theta = [-0.405, 0.0205, -0.00405]
gam = [-3, 0.05, -1.5, 0.1]

In [12]:
# define lists for holding A, L, U and Y
A = np.zeros(T + 1) # A[-1] holds the value for A in t = -1
L = np.zeros(T)
U = np.zeros(T)
Y = np.zeros(T + 1)
eps = np.zeros(T)
lam = np.zeros(T) # prob of failure at each time period
delta = np.zeros(T)

# set the first value of U, U[0], to a 
# randomly generated value from a uniform
# distribution a measure of general health
U[0] = uniform.rvs()
eps[0] = norm.rvs(0, 20)
L[0] = gamma.ppf(U[0], 3, 154)

# set A[-1] to 0
A[-1] = 0
x = logit_1(theta[0] + theta[2] * (L[0] - 500))
A[0] = bernoulli.rvs(x, size=1)

In [13]:
if A[0] == 1: Ts = 0

In [14]:
lam[0] = logit_1(gam[0] + gam[2] * A[0])

In [15]:
if lam[0] >= U[0]:
    Y[1] = 1
else:
    Y[1] = 0

In [16]:
# loop for each individual
for t in range(1, T):
    if Y[t] == 0:
        delta[t] = norm.rvs(0, 0.05)
        U[t] = min(1, max(0, U[t-1] + delta[t]))
        if t % k != 0:
            L[t] = L[t-1]
            A[t] = A[t-1]
        else:
            eps[1] = norm.rvs(100 * (U[t] - 2), 50)
            L[t] = max(0, L[t-1] + 150 * A[t-k] * (A[t-k-1]) + eps[t])
            if A[t-1] == 0:
                A[t] = bernoulli.rvs(logit_1(theta[0] + theta[1] * t + theta[2] * (L[t] - 500)), size=1)
            else:
                A[t] = 1
            if A[t] == 1 and A[t-k] == 0: Ts = t
        lam[t] = logit_1(gam[0] + gam[1] * ((1 - A[t]) * t + A[t] * Ts) + gam[2] * A[t] + gam[3] * A[t] *(t - Ts))
        if (1 - np.prod(1 - lam)) >= U[0]:
            Y[t + 1] = 1
        else:
            Y[t + 1] = 0
    else:
        break

     

In [109]:
# make a function that does all of the above for an individual
def sim(T, k, gam, theta, patid=0):
    
    # define lists for holding A, L, U and Y
    A = np.zeros(T + 1) # A[-1] holds the value for A in t = -1
    L = np.zeros(T)
    U = np.zeros(T)
    Y = np.zeros(T + 1)
    eps = np.zeros(T)
    lam = np.zeros(T) # prob of failure at each time period
    delta = np.zeros(T)

    # set the first value of U, U[0], to a 
    # randomly generated value from a uniform
    # distribution a measure of general health
    U[0] = uniform.rvs()
    eps[0] = norm.rvs(0, 20)
    L[0] = gamma.ppf(U[0], 3, scale=154) + eps[0]

    # set A[-1] to 0
    A[-1] = 0
    x = logit_1(theta[0] + theta[2] * (L[0] - 500))
    A[0] = bernoulli.rvs(x, size=1)
    
    if A[0] == 1:
        Ts = 0 
    else:
        Ts = -1
    
    lam[0] = logit_1(gam[0] + gam[2] * A[0])
    
    if lam[0] >= U[0]:
        Y[1] = 1
    else:
        Y[1] = 0
    # loop through each time period - stop when patient is dead or t = T + 1
    for t in range(1, T):
        if Y[t] == 0:
            delta[t] = norm.rvs(0, 0.05)
            U[t] = min(1, max(0, U[t-1] + delta[t]))
            if t % k != 0:
                L[t] = L[t-1]
                A[t] = A[t-1]
            else:
                eps[t] = norm.rvs(100 * (U[t] - 2), 50)
                L[t] = max(0, L[t-1] + 150 * A[t-k] * (1-A[t-k-1]) + eps[t])
                if A[t-1] == 0:
                    A[t] = bernoulli.rvs(logit_1(theta[0] + theta[1] * t + theta[2] * (L[t] - 500)), size=1)
                else:
                    A[t] = 1
                if A[t] == 1 and A[t-k] == 0: 
                    Ts = t
            lam[t] = logit_1(gam[0] + gam[1] * ((1 - A[t]) * t + A[t] * Ts) + gam[2] * A[t] + gam[3] * A[t] *(t - Ts))
            if (1 - np.prod(1 - lam)) >= U[0]:
                Y[t + 1] = 1
            else:
                Y[t + 1] = 0
        else:
            break
    
    # we only need the data before death, so whatever value t is before the end of the
    # above loop - change this to numpy array and transpose.
    Y = np.ndarray.tolist(Y[1:(t+1)])
    U = np.ndarray.tolist(U[0:t])
    L = np.ndarray.tolist(L[0:t])
    A = np.ndarray.tolist(A[0:t])
    Ts = [Ts]*t
    
    df = np.vstack((Y, L, U, A, Ts))
    df = pd.DataFrame(df.T, columns=['Y', 'L', 'U', 'A', 'Ts'])
    df['Y'] = df['Y'].astype(int)
    df['A'] = df['A'].astype(int)
    df['patid'] = patid
    df.index.name = 'visit'
    return df.reset_index()

    # return({"Y":Y, "U":U, "L":L, "A":A, "Ts":Ts, "time":[time for time in range(t)]})

In [110]:
df = sim(T, k, gam, theta)
df
#np.vstack((dd['L'], dd['A']))

Unnamed: 0,visit,Y,L,U,A,Ts,patid
0,0,0,750.176523,0.880309,0,5.0,0
1,1,0,750.176523,0.862189,0,5.0,0
2,2,0,750.176523,0.876366,0,5.0,0
3,3,0,750.176523,0.817421,0,5.0,0
4,4,0,750.176523,0.797387,0,5.0,0
5,5,0,586.681846,0.820989,1,5.0,0
6,6,0,586.681846,0.852558,1,5.0,0
7,7,0,586.681846,0.924991,1,5.0,0
8,8,0,586.681846,0.895919,1,5.0,0
9,9,0,586.681846,0.873577,1,5.0,0


In [125]:
# use sim function to make a pandas DF for n patients
import pandas as pd
n = 1000
frames = [sim(T, k, gam, theta, patid=i) for i in range(n)]
df = pd.concat(frames)

In [126]:
# make new variables for the logit regression
# including an intercept
df["d1"] = df[["visit", "Ts"]].min(axis = 1)
df["time_Ts"] = df["visit"] - df["Ts"]
df["d3"] = np.maximum(df["time_Ts"], 0)
df["intercept"] = 1.0
df = df.set_index(['patid', 'visit'])
df = df.sortlevel()

In [130]:
df.loc[(3,slice(None)),:]

Unnamed: 0_level_0,Unnamed: 1_level_0,Y,L,U,A,Ts,d1,time_Ts,d3,intercept
patid,visit,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
3,0,0,466.96329,0.563086,0,10.0,0.0,-10.0,0.0,1.0
3,1,0,466.96329,0.484985,0,10.0,1.0,-9.0,0.0,1.0
3,2,0,466.96329,0.425368,0,10.0,2.0,-8.0,0.0,1.0
3,3,0,466.96329,0.401282,0,10.0,3.0,-7.0,0.0,1.0
3,4,0,466.96329,0.464106,0,10.0,4.0,-6.0,0.0,1.0
3,5,0,312.701939,0.497195,0,10.0,5.0,-5.0,0.0,1.0
3,6,0,312.701939,0.423606,0,10.0,6.0,-4.0,0.0,1.0
3,7,0,312.701939,0.452718,0,10.0,7.0,-3.0,0.0,1.0
3,8,0,312.701939,0.396068,0,10.0,8.0,-2.0,0.0,1.0
3,9,0,312.701939,0.407734,0,10.0,9.0,-1.0,0.0,1.0


In [131]:
# try a logistic model
import statsmodels.api as sm
reg = ["intercept", "d1", "A", "d3"]
logit = sm.Logit(df['Y'], df[reg])
result = logit.fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.179804
         Iterations 8


0,1,2,3
Dep. Variable:,Y,No. Observations:,19308.0
Model:,Logit,Df Residuals:,19304.0
Method:,MLE,Df Model:,3.0
Date:,"Wed, 28 Jun 2017",Pseudo R-squ.:,0.09813
Time:,15:40:09,Log-Likelihood:,-3471.6
converged:,True,LL-Null:,-3849.4
,,LLR p-value:,1.9230000000000002e-163

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
intercept,-4.0469,0.103,-39.442,0.000,-4.248 -3.846
d1,-0.0116,0.007,-1.667,0.096,-0.025 0.002
A,-0.0051,0.105,-0.048,0.961,-0.211 0.201
d3,0.1092,0.004,25.578,0.000,0.101 0.118


In [160]:
# next step is to do this 100 times, store results 
# use the average from the gamma parameters, and calculate
# their standard deviations

In [None]:
# work out stabilized IPTW
# start by creating an ID variable in sim function
# should then be possible to group by individual and
# calcualte weights
# pandas function shift


In [146]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y
