In [1]:
# import modules for math and distributions
from math import exp
import numpy as np
from scipy.stats import gamma, norm, uniform, bernoulli
import pandas as pd
import statsmodels.api as sm
import rpy2
import patsy

  from pandas.core import datetools


In [2]:
# Load the rpy2 IPython extension (it provides the %R / %%R magics used in
# this notebook) and smoke-test it with a one-line R snippet.
%load_ext rpy2.ipython
# Three R statements run in sequence; the recorded cell output (4.25) is the
# value of the last one, mean(X).
%R X=c(1,4,5,7); sd(X); mean(X)

array([ 4.25])

In [3]:
# define inverse logit function
def logit_1(x):
    """Return the inverse logit (logistic function) of a scalar ``x``.

    Mathematically this is ``exp(x) / (1 + exp(x))``.  The expression is
    evaluated in a sign-dependent form so that ``exp`` is only ever called
    with a non-positive argument; the naive form raises ``OverflowError``
    once ``x`` exceeds roughly 709.

    Parameters
    ----------
    x : float
        Value on the log-odds scale.

    Returns
    -------
    float
        Probability in the closed interval [0, 1].
    """
    if x >= 0:
        # exp(-x) <= 1 here, so it cannot overflow.
        return 1 / (1 + exp(-x))
    # x < 0: exp(x) < 1 and may underflow harmlessly to 0.0.
    z = exp(x)
    return z / (1 + z)

In [4]:
# Initial set up according to HD2012
# These module-level values are passed to sim() / get_sim_data() below.
T = 40 # number of time periods simulated per patient
k = 5 # check-up spacing: sim() only updates L and A when t % k == 0
# theta: coefficients of the logistic model sim() uses to draw A
# (intercept, coefficient on t, coefficient on L - 500)
theta = [-0.405, 0.0205, -0.00405]
# gam: coefficients of the logistic model sim() uses for the per-period
# failure probability (intercept, coefficient on (1 - A)*t + A*Ts,
# coefficient on A, coefficient on A*(t - Ts))
gam = [-3, 0.05, -1.5, 0.1]

In [5]:
# make a function that does all of the above for an individual
def sim(T, k, gam, theta, patid=0):
    """Simulate the visit-by-visit history of a single patient.

    Parameters
    ----------
    T : int
        Number of time periods to simulate; must be at least 1.
    k : int
        Check-up spacing.  L and A are only updated at periods ``t`` with
        ``t % k == 0``; in between they are carried forward unchanged.
    gam : sequence of 4 floats
        Coefficients of the logistic model for the per-period failure
        probability ``lam[t]``: intercept, coefficient on
        ``(1 - A) * t + A * Ts``, coefficient on ``A`` and coefficient on
        ``A * (t - Ts)``.
    theta : sequence of 3 floats
        Coefficients of the logistic model used to draw A: intercept,
        coefficient on ``t`` and coefficient on ``L - 500``.
    patid : int, optional
        Identifier written to the ``patid`` column of the result.

    Returns
    -------
    pandas.DataFrame
        One row per recorded visit with columns ``visit``, ``Y``, ``L``,
        ``U``, ``A``, ``Ts`` and ``patid``.  Row ``i`` holds ``L[i]``,
        ``U[i]``, ``A[i]`` and the outcome ``Y[i + 1]`` of that period.
        Rows stop after the first ``Y == 1`` and never exceed ``T`` rows.
        ``Ts`` is constant within the frame: the period at which A switched
        to 1 during the simulation, or -1 if it never did.

    Raises
    ------
    ValueError
        If ``T`` is smaller than 1 (there would be no period to simulate).
    """
    if T < 1:
        # Without this guard the loop below never runs and the slicing code
        # after it fails with an opaque NameError on ``t``.
        raise ValueError("T must be at least 1")

    # define arrays for holding A, L, U and Y
    A = -1*np.ones(T + 2) # A[-1] (last value) holds A in t = -1
    L = np.zeros(T+1)
    U = np.zeros(T+1)
    Y = -1*np.ones(T + 2) # Y[t + 1] is the outcome of period t
    eps = np.zeros(T+1)
    lam = np.zeros(T+1) # prob of failure at each time period
    delta = np.zeros(T+1)

    # set the first value of U, U[0], to a
    # randomly generated value from a uniform
    # distribution a measure of general health
    U[0] = uniform.rvs()
    eps[0] = norm.rvs(0, 20)
    # NOTE: unlike the check-up update in the loop, L[0] is not floored at 0.
    L[0] = gamma.ppf(U[0], 3, scale=154) + eps[0]
    # L[0] = max(0, gamma.ppf(U[0], 3, scale=154) + eps[0])

    # set A[-1] to 0: held in last value of A
    A[-1] = 0
    Y[0] = 0

    # set A[0]; rvs(size=1) returns a 1-element array, so take the scalar
    # explicitly instead of relying on NumPy's deprecated implicit
    # array-to-scalar conversion on item assignment.
    A[0] = bernoulli.rvs(logit_1(theta[0] + theta[2] * (L[0] - 500)), size=1)[0]

    # Ts records the period at which A switched to 1 (-1 = not yet)
    if A[0] == 1:
        Ts = 0
    else:
        Ts = -1

    lam[0] = logit_1(gam[0] + gam[2] * A[0])

    # failure in the first period is decided by comparing lam[0] with U[0]
    if lam[0] >= U[0]:
        Y[1] = 1
    else:
        Y[1] = 0
    # loop through each time period - stop when patient is dead or t = T + 1
    for t in range(1, T+1):
        if Y[t] == 0:
            # U follows a random walk clipped to [0, 1]
            delta[t] = norm.rvs(0, 0.05)
            U[t] = min(1, max(0, U[t-1] + delta[t]))
            if t % k != 0:
                # between check-ups L and A are carried forward
                L[t] = L[t-1]
                A[t] = A[t-1]
            else:
                eps[t] = norm.rvs(100 * (U[t] - 2), 50)
                # +150 only when A switched to 1 at the previous check-up;
                # for t == k the index t-k-1 is -1, i.e. the A[-1] = 0 slot
                L[t] = max(0, L[t-1] + 150 * A[t-k] * (1-A[t-k-1]) + eps[t])
                if A[t-1] == 0:
                    A[t] = bernoulli.rvs(logit_1(theta[0] + theta[1] * t + theta[2] * (L[t] - 500)), size=1)[0]
                else:
                    # once A is 1 it stays 1
                    A[t] = 1
                if A[t] == 1 and A[t-k] == 0:
                    Ts = t
            ########################################
            # This is a check for debugging purposes
            # Comment it before the next push
            if Ts == -1:
                if A[t]:
                    print('There is an error...')
            ########################################
            lam[t] = logit_1(gam[0] + gam[1] * ((1 - A[t]) * t + A[t] * Ts) + gam[2] * A[t] + gam[3] * A[t] *(t - Ts))
            # lam entries beyond t are still 0 and contribute a factor of 1,
            # so this is the cumulative failure probability up to period t
            if (1 - np.prod(1 - lam)) >= U[0]:
                Y[t + 1] = 1
            else:
                Y[t+1] = 0
        else:
            break

    #we only need the data before death, so whatever value t is before the end of the
    #above loop - change this to numpy array and transpose.
    Y = np.ndarray.tolist(Y[1:(t+1)])
    U = np.ndarray.tolist(U[0:t])
    L = np.ndarray.tolist(L[0:t])
    A = np.ndarray.tolist(A[0:t])
    Ts = [Ts]*t

    df = np.vstack((Y, L, U, A, Ts))
    df = pd.DataFrame(df.T, columns=['Y', 'L', 'U', 'A', 'Ts'])
    # vstack yields floats; restore integer dtype for the binary columns
    df['Y'] = df['Y'].astype(int)
    df['A'] = df['A'].astype(int)
    df['patid'] = patid
    df.index.name = 'visit'
    return df.reset_index()

In [6]:
# use sim function to make a pandas DF for n patients
def get_sim_data(T, k, gam, theta, n = 1000):
    """Simulate ``n`` patients with ``sim`` and add the derived covariates.

    Parameters
    ----------
    T, k, gam, theta
        Passed unchanged to ``sim`` for every patient.
    n : int, optional
        Number of patients to simulate; patient ids are ``0 .. n-1``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``(patid, visit)`` and sorted on that index.  In addition
        to the columns produced by ``sim`` it contains

        * ``d1``      -- ``(1 - A) * visit + A * Ts``
        * ``d3``      -- ``A * (visit - Ts)``
        * ``L_100``   -- ``L / 100``
        * ``Lav_100`` -- per-patient mean of ``L``, divided by 100
        * ``A_1``     -- previous visit's ``A`` within the patient, 0 at the
          patient's first visit
    """
    # get data for each of the n patients
    frames = [sim(T, k, gam, theta, patid=i) for i in range(n)]
    df = pd.concat(frames)
    # make new variables for the logit regression
    df["d1"] = (1-df['A'])*df['visit'] + df['A']*df['Ts']
    df["d3"] = df['A']*(df['visit']-df['Ts'])
    # df["time_Ts"] = df["visit"] - df["Ts"]
    df = df.set_index(['patid', 'visit'])
    df = df.sort_index()

    #create two new variables
    df["L_100"] = df["L"]/100
    # transform() broadcasts the per-patient mean back onto the original
    # index.  The former groupby(...).apply(func) produced the same values
    # but, on pandas versions where group_keys defaults to True, also
    # prepended a second "patid" index level that broke the groupby below.
    df["Lav_100"] = df.groupby(level="patid")["L"].transform("mean")/100

    # get the previous value of A, and set first value of A_1 per patient to zero.
    df['A_1'] = df.groupby(level="patid")['A'].shift(1)
    df['A_1'] = df['A_1'].fillna(0)

    return(df)

In [7]:
#df.loc[(2,slice(None)),:] # gets patient 3
# df2.groupby('A')['A_1'].value_counts()

In [8]:
def get_weights(df):
    """Add stabilized inverse-probability weights ``sw`` to ``df``.

    Two logistic models for ``A`` are fitted on the rows up to and including
    each patient's first ``A == 1``: an intercept-only model (numerator) and
    a model in ``L`` (denominator).  The weight at a visit is the ratio of
    the within-patient cumulative products of the probabilities of the
    observed ``A`` under the two models.  Visits after the first ``A == 1``
    reuse that patient's last computed weight.

    Parameters
    ----------
    df : pandas.DataFrame
        Must carry a MultiIndex whose first level is named ``patid`` and
        contain the columns ``A`` and ``L``.  Rows are assumed to be ordered
        by visit within patient, as returned by ``get_sim_data``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; it is modified in place and
        gains the columns ``As`` (running count of ``A == 1`` within the
        patient) and ``sw``.
    """
    # keep rows up to and including the first visit with A == 1
    df["As"] = df.groupby(level="patid")['A'].cumsum()
    df2 = df[df["As"] <= 1].copy(deep=True)

    #numerator
    f = "A ~ 1"
    y, X = patsy.dmatrices(f, df2.reset_index(), return_type = "dataframe")
    n_logit = sm.Logit(y, X, missing="raise")
    n_result = n_logit.fit(disp=0, maxiter=100)
    df2["pn"] = n_result.predict()

    #denominator
    f = "A ~ L"
    y, X = patsy.dmatrices(f, df2.reset_index(), return_type = "dataframe")
    d_logit = sm.Logit(y, X, missing="raise")
    d_result = d_logit.fit(disp=0, maxiter=100)
    df2["pd"] = d_result.predict()

    # if A == 0, change probabilities to 1 - prob
    df2['pn2'] = np.where(df2['A']==0, (1 - df2["pn"]), df2["pn"])
    df2['pd2'] = np.where(df2['A']==0, (1 - df2["pd"]), df2["pd"])

    # construct stabilized weights, don't forget to group by
    df2['cpn'] = df2.groupby(level=0)['pn2'].cumprod()
    df2['cpd'] = df2.groupby(level=0)['pd2'].cumprod()
    df2['sw'] = df2['cpn']/df2['cpd']

    #combine df and df2
    df["sw"] = np.nan
    df.loc[df2.index, "sw"] = df2["sw"]
    # Carry the last computed weight forward within each patient.  Grouping
    # keeps a weight from ever leaking into the next patient's rows, and
    # ffill() replaces the deprecated fillna(method="pad") spelling.
    df["sw"] = df.groupby(level="patid")["sw"].ffill()

    return(df)


In [9]:
# Simulate one data set of 1000 patients and attach the weights computed by
# get_weights (it adds the columns As and sw).
df = get_sim_data(T, k, gam, theta, n = 1000)
df = get_weights(df)
# Bare last expression: displays the whole (patid, visit)-indexed frame.
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Y,L,U,A,Ts,d1,d3,L_100,Lav_100,A_1,As,sw
patid,visit,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,0,0,383.314624,0.369823,0,-1.0,0.0,0.0,3.833146,3.291834,0.0,0,1.064680
0,1,0,383.314624,0.348572,0,-1.0,1.0,0.0,3.833146,3.291834,0.0,0,1.133543
0,2,0,383.314624,0.347133,0,-1.0,2.0,0.0,3.833146,3.291834,0.0,0,1.206860
0,3,0,383.314624,0.368752,0,-1.0,3.0,0.0,3.833146,3.291834,0.0,0,1.284920
0,4,0,383.314624,0.414795,0,-1.0,4.0,0.0,3.833146,3.291834,0.0,0,1.368028
0,5,0,238.964646,0.427428,0,-1.0,5.0,0.0,2.389646,3.291834,0.0,0,1.777708
0,6,0,238.964646,0.409380,0,-1.0,6.0,0.0,2.389646,3.291834,0.0,0,2.310074
0,7,1,238.964646,0.402440,0,-1.0,7.0,0.0,2.389646,3.291834,0.0,0,3.001867
1,0,0,808.096058,0.901819,0,20.0,0.0,-0.0,8.080961,4.297513,0.0,0,0.867932
1,1,0,808.096058,0.999558,0,20.0,1.0,-0.0,8.080961,4.297513,0.0,0,0.753306


In [10]:
# Turn the (patid, visit) index back into ordinary columns; the R cell below
# refers to patid and visit as columns of tmp.
tmp = df.reset_index()
tmp.head()

Unnamed: 0,patid,visit,Y,L,U,A,Ts,d1,d3,L_100,Lav_100,A_1,As,sw
0,0,0,0,383.314624,0.369823,0,-1.0,0.0,0.0,3.833146,3.291834,0.0,0,1.06468
1,0,1,0,383.314624,0.348572,0,-1.0,1.0,0.0,3.833146,3.291834,0.0,0,1.133543
2,0,2,0,383.314624,0.347133,0,-1.0,2.0,0.0,3.833146,3.291834,0.0,0,1.20686
3,0,3,0,383.314624,0.368752,0,-1.0,3.0,0.0,3.833146,3.291834,0.0,0,1.28492
4,0,4,0,383.314624,0.414795,0,-1.0,4.0,0.0,3.833146,3.291834,0.0,0,1.368028


In [11]:
%%R -i tmp
# Fit the weighted outcome model in R with the ipw and survey packages;
# `tmp` is the pandas frame built in the previous cell.
library(ipw)
library(survey)
# Weights for exposure A: intercept-only numerator model, denominator model
# in L, computed per patid over visit.
# NOTE(review): type = "first" presumably restricts the exposure model to
# visits up to the first A == 1 -- confirm against the ipwtm documentation.
ipwsw <- ipwtm(exposure = A, family = "binomial", link = 'logit',
               numerator = ~ 1, denominator = ~ L,
               id = patid, timevar = visit,
               type = "first", data = tmp)
tmp$ipwsw <- ipwsw$ipw.weights
# Fit Y on d1, A and d3 with the ipw weights supplied as survey weights.
desipw <- svydesign(ids = ~ 1, data = tmp, weights = ~ ipwsw)
mdl <- svyglm(Y ~ d1 + A + d3, design = desipw, family = quasibinomial())
co <- coef(mdl)
print(summary(mdl))




Attaching package: ‘survey’



    dotchart





Call:
svyglm(formula = Y ~ d1 + A + d3, design = desipw, family = quasibinomial())

Survey design:
svydesign(ids = ~1, data = tmp, weights = ~ipwsw)

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) -2.59167    0.10613 -24.420  < 2e-16 ***
d1           0.11466    0.01685   6.806 1.04e-11 ***
A           -2.43241    0.20997 -11.585  < 2e-16 ***
d3           0.10343    0.01092   9.475  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for quasibinomial family taken to be 1.032988)

Number of Fisher Scoring iterations: 6



In [12]:
# MC 
# Monte Carlo study: simulate a fresh data set B times and store the fitted
# coefficients of the logit model Y ~ 1 + d1 + A + d3, one row per replication.
B = 50 # replications
MC1 = np.zeros((B, 4)) # one column per coefficient of the model above
# MC2 = np.zeros((B, 5))
# MC3 = np.zeros((B, 5))
# MC4 = np.zeros((B, 4))

for i in range(B):
    
    # fresh simulated data set for this replication
    df = get_sim_data(T, k, gam, theta, n = 1000)
    df = get_weights(df)
    
    # NOTE(review): get_weights adds the column `sw`, but it is not passed to
    # the fit below -- sm.Logit here is an unweighted regression.  Confirm
    # whether an unweighted model is intended.
    f = "Y ~ 1 + d1 + A + d3"
    y, X = patsy.dmatrices(f, df.reset_index(), return_type = "dataframe")
    mod = sm.Logit(y, X, missing="raise")
    MC1[i] = mod.fit(disp=0).params.values

#     f = "Y ~ 1 + d1 + A + d3 + L_100"
#     y, X = patsy.dmatrices(f, df.reset_index(), return_type = "dataframe")
#     mod = sm.Logit(y, X, missing="raise")
#     MC2[i] = mod.fit().params.values

#     f = "Y ~ 1 + d1 + A + d3 + Lav_100"
#     y, X = patsy.dmatrices(f, df.reset_index(), return_type = "dataframe")
#     mod = sm.Logit(y, X, missing="raise")
#     MC3[i] = mod.fit().params.values

In [13]:
# Column-wise mean and standard deviation of the B coefficient vectors in MC1.
# Columns presumably follow the formula order (Intercept, d1, A, d3) -- verify
# against the design-matrix columns produced by patsy.
print(MC1.mean(axis = 0))
print(MC1.std(axis = 0))
# print(MC2.mean(axis = 0))
# print(MC2.std(axis = 0))
# print(MC3.mean(axis = 0))
# print(MC3.std(axis = 0))

[-3.77654063  0.00826751 -0.40855474  0.11317261]
[ 0.096241    0.00660266  0.09496487  0.00515728]
