In [1]:
# import modules for math and distributions
from math import exp
import numpy as np
from scipy.stats import gamma, norm, uniform, bernoulli
import pandas as pd
import statsmodels.api as sm
import rpy2
import patsy

In [2]:
# load and try out R magic
%load_ext rpy2.ipython
%R X=c(1,4,5,7); sd(X); mean(X)

array([ 4.25])

In [3]:
# define inverse logit function
def logit_1(x): return(exp(x)/(1 + exp(x)))

In [4]:
# Initial set up according to HD2012
T = 40 # time periods
k = 5 # check-up times
theta = [-0.405, 0.0205, -0.00405]
gam = [-3, 0.05, -1.5, 0.1]

In [5]:
# make a function that does all of the above for an individual
def sim(T, k, gam, theta, patid=0):
    
    # define lists for holding A, L, U and Y
    A = np.zeros(T + 2) # A[-1] (last value) holds A in t = -1
    L = np.zeros(T+1)
    U = np.zeros(T+1)
    Y = np.zeros(T + 2)
    eps = np.zeros(T+1)
    lam = np.zeros(T+1) # prob of failure at each time period
    delta = np.zeros(T+1)

    # set the first value of U, U[0], to a 
    # randomly generated value from a uniform
    # distribution a measure of general health
    U[0] = uniform.rvs()
    eps[0] = norm.rvs(0, 20)
    L[0] = gamma.ppf(U[0], 3, scale=154) + eps[0]

    # set A[-1] to 0: held in last value of A
    A[-1] = 0
    
    # set A[0]
    A[0] = bernoulli.rvs(logit_1(theta[0] + theta[2] * (L[0] - 500)), size=1)
    
    if A[0] == 1:
        Ts = 0 
    else:
        Ts = -1
    
    lam[0] = logit_1(gam[0] + gam[2] * A[0])
    
    if lam[0] >= U[0]:
        Y[1] = 1
    else:
        Y[1] = 0
    # loop through each time period - stop when patient is dead or t = T + 1
    for t in range(1, T+1):
        if Y[t] == 0:
            delta[t] = norm.rvs(0, 0.05)
            U[t] = min(1, max(0, U[t-1] + delta[t]))
            if t % k != 0:
                L[t] = L[t-1]
                A[t] = A[t-1]
            else:
                eps[t] = norm.rvs(100 * (U[t] - 2), 50)
                L[t] = max(0, L[t-1] + 150 * A[t-k] * (1-A[t-k-1]) + eps[t])
                if A[t-1] == 0:
                    A[t] = bernoulli.rvs(logit_1(theta[0] + theta[1] * t + theta[2] * (L[t] - 500)), size=1)
                else:
                    A[t] = 1
                if A[t] == 1 and A[t-k] == 0: 
                    Ts = t
            ########################################
            # This is a check for debugging purposes
            # Comment it before the next push
            if Ts == -1:
                if A[t]:
                    print('There is an error...')
            ########################################
            lam[t] = logit_1(gam[0] + gam[1] * ((1 - A[t]) * t + A[t] * Ts) + gam[2] * A[t] + gam[3] * A[t] *(t - Ts))
            if (1 - np.prod(1 - lam)) >= U[0]:
                Y[t + 1] = 1
            else:
                Y[t+1] = 0
        else:
            break
    
    #we only need the data before death, so whatever value t is before the end of the
    #above loop - change this to numpy array and transpose.
    Y = np.ndarray.tolist(Y[1:(t+1)])
    U = np.ndarray.tolist(U[0:t])
    L = np.ndarray.tolist(L[0:t])
    A = np.ndarray.tolist(A[0:t])
    Ts = [Ts]*t

    df = np.vstack((Y, L, U, A, Ts))
    df = pd.DataFrame(df.T, columns=['Y', 'L', 'U', 'A', 'Ts'])
    df['Y'] = df['Y'].astype(int)
    df['A'] = df['A'].astype(int)
    df['patid'] = patid
    df.index.name = 'visit'
    return df.reset_index()

In [6]:
# use sim function to make a pandas DF for n patients
def get_sim_data(T, k, gam, theta, n = 1000):

    # get data for each of 1000 patients
    frames = [sim(T, k, gam, theta, patid=i) for i in range(n)]
    df = pd.concat(frames)
    # make new variables for the logit regression
    # including an intercept
    df["d1"] = (1-df['A'])*df['visit'] + df['A']*df['Ts']
    df["d3"] = df['A']*(df['visit']-df['Ts'])
    # df["time_Ts"] = df["visit"] - df["Ts"]
    df = df.set_index(['patid', 'visit'])
    df = df.sortlevel()
    
    #create two new variables 
    df["L_100"] = df["L"]/100
    def func(x):
        x["Lav_100"] = x["L"].mean()/100
        return x

    df = df.groupby(level="patid").apply(func)
    
    # get the previous value of A, and set first value of A_1 per patient to zero.
    df['A_1'] = df.groupby(level="patid")['A'].shift(1)
    df['A_1'] = df['A_1'].fillna(0)

    return(df)

In [7]:
#df.loc[(2,slice(None)),:] # gets patient 3
# df2.groupby('A')['A_1'].value_counts()

In [8]:
def get_weights(df):
    
    # only data when A is not 1 yet
    df["As"] = df.groupby(level="patid")['A'].cumsum()
    df2 = df[df["As"] <= 1].copy(deep=True)

    #numerator
    f = "A ~ 1"
    y, X = patsy.dmatrices(f, df2.reset_index(), return_type = "dataframe")
    n_logit = sm.Logit(y, X, missing="raise")
    n_result = n_logit.fit(maxiter=100)
    df2["pn"] = n_result.predict(X)
    
    #numerator
    f = "A ~ 1 + L"
    y, X = patsy.dmatrices(f, df2.reset_index(), return_type = "dataframe")
    d_logit = sm.Logit(y, X, missing="raise")
    d_result = d_logit.fit(maxiter=100)
    df2["pd"] = d_result.predict(X)

    # if A == 0, change probabilities to 1 - prob
    df2['pn2'] = np.where(df2['A']==0, (1 - df2["pn"]), df2["pn"])
    df2['pd2'] = np.where(df2['A']==0, (1 - df2["pd"]), df2["pd"])

    # construct stabilized weights, don't forget to group by
    df2['cpn'] = df2.groupby(df2.index.get_level_values(0))['pn2'].cumprod()
    df2['cpd'] = df2.groupby(df2.index.get_level_values(0))['pd2'].cumprod()
    df2['sw'] = df2['cpn']/df2['cpd']

    #combine df and df2
    df["sw"] = np.nan
    df.loc[df2.index, "sw"] = df2["sw"]
    df["sw"] = df["sw"].fillna(method="pad")
    
    return(df)


In [28]:
df = get_sim_data(T, k, gam, theta, n = 1000)
df = get_weights(df)
df

Optimization terminated successfully.
         Current function value: 0.431794
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.365093
         Iterations 7


Unnamed: 0_level_0,Unnamed: 1_level_0,Y,L,U,A,Ts,d1,d3,L_100,Lav_100,A_1,As,sw
patid,visit,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,0,0,461.150834,0.579033,1,0.0,0.0,0.0,4.611508,3.411770,0.0,1,0.958044
0,1,0,461.150834,0.548569,1,0.0,0.0,1.0,4.611508,3.411770,1.0,2,0.958044
0,2,0,461.150834,0.519851,1,0.0,0.0,2.0,4.611508,3.411770,1.0,3,0.958044
0,3,0,461.150834,0.487445,1,0.0,0.0,3.0,4.611508,3.411770,1.0,4,0.958044
0,4,0,461.150834,0.472798,1,0.0,0.0,4.0,4.611508,3.411770,1.0,5,0.958044
0,5,0,455.657270,0.466515,1,0.0,0.0,5.0,4.556573,3.411770,1.0,6,0.958044
0,6,0,455.657270,0.491367,1,0.0,0.0,6.0,4.556573,3.411770,1.0,7,0.958044
0,7,0,455.657270,0.495106,1,0.0,0.0,7.0,4.556573,3.411770,1.0,8,0.958044
0,8,0,455.657270,0.486030,1,0.0,0.0,8.0,4.556573,3.411770,1.0,9,0.958044
0,9,0,455.657270,0.465983,1,0.0,0.0,9.0,4.556573,3.411770,1.0,10,0.958044


In [40]:
%Rpush df
%R co = coef(glm(Y ~ 1 + d1 + A + d3, data=df, weights=sw, family=binomial(link = "logit")))
%Rpull co
co.round(2)

array([-2.49,  0.07, -2.39,  0.11])

In [9]:
# MC 
B = 20 # replications
MC1 = np.zeros((B, 4))
MC2 = np.zeros((B, 5))
MC3 = np.zeros((B, 5))
MC4 = np.zeros((B, 4))

for i in range(B):
    
    df = get_sim_data(T, k, gam, theta, n = 1000)
    df = get_weights(df)
    
    f = "Y ~ 1 + d1 + A + d3"
    y, X = patsy.dmatrices(f, df.reset_index(), return_type = "dataframe")
    mod = sm.Logit(y, X, missing="raise")
    MC1[i] = mod.fit().params.values

    f = "Y ~ 1 + d1 + A + d3 + L_100"
    y, X = patsy.dmatrices(f, df.reset_index(), return_type = "dataframe")
    mod = sm.Logit(y, X, missing="raise")
    MC2[i] = mod.fit().params.values

    f = "Y ~ 1 + d1 + A + d3 + Lav_100"
    y, X = patsy.dmatrices(f, df.reset_index(), return_type = "dataframe")
    mod = sm.Logit(y, X, missing="raise")
    MC3[i] = mod.fit().params.values

Optimization terminated successfully.
         Current function value: 0.411402
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.361198
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.186492
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.143035
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.150410
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.430230
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.369121
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.177570
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.137751
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.143787
  

In [20]:
print(MC1.mean(axis = 0))
print(MC1.std(axis = 0))
print(MC2.mean(axis = 0))
print(MC2.std(axis = 0))
print(MC3.mean(axis = 0))
print(MC3.std(axis = 0))

[-3.79505054  0.00917723 -0.36793037  0.11162895]
[ 0.09769398  0.00668455  0.10991209  0.00419201]
[-0.26919027  0.07358619 -1.86286228  0.0949171  -0.97318192]
[ 0.09807207  0.00719151  0.12189588  0.00594114  0.02214489]
[-0.28823776  0.22986858 -2.31606455  0.28333585 -1.36335958]
[ 0.140675    0.00986114  0.12962284  0.0069124   0.0453915 ]
