In [1]:
# import modules for math and distributions
from math import exp
import numpy as np
from scipy.stats import gamma, norm, uniform, bernoulli
import pandas as pd
import statsmodels.api as sm
import rpy2

In [2]:
# load and try out R magic
%load_ext rpy2.ipython
%R X=c(1,4,5,7); sd(X); mean(X)

array([ 4.25])

In [3]:
# define inverse logit function
def logit_1(x): return(exp(x)/(1 + exp(x)))

In [4]:
# Initial set up according to HD2012
T = 40 # time periods
k = 5 # check-up times
theta = [-0.405, 0.0205, -0.00405]
gam = [-3, 0.05, -1.5, 0.1]

In [20]:
# make a function that does all of the above for an individual
def sim(T, k, gam, theta, patid=0):
    
    # define lists for holding A, L, U and Y
    A = np.zeros(T + 2) # A[-1] (last value) holds A in t = -1
    L = np.zeros(T+1)
    U = np.zeros(T+1)
    Y = np.zeros(T + 2)
    eps = np.zeros(T+1)
    lam = np.zeros(T+1) # prob of failure at each time period
    delta = np.zeros(T+1)

    # set the first value of U, U[0], to a 
    # randomly generated value from a uniform
    # distribution a measure of general health
    U[0] = uniform.rvs()
    eps[0] = norm.rvs(0, 20)
    L[0] = gamma.ppf(U[0], 3, scale=154) + eps[0]

    # set A[-1] to 0: held in last value of A
    A[-1] = 0
    
    # set A[0]
    A[0] = bernoulli.rvs(logit_1(theta[0] + theta[2] * (L[0] - 500)), size=1)
    
    if A[0] == 1:
        Ts = 0 
    else:
        Ts = T + 20
    
    lam[0] = logit_1(gam[0] + gam[2] * A[0])
    
    if lam[0] >= U[0]:
        Y[1] = 1
    else:
        Y[1] = 0
    # loop through each time period - stop when patient is dead or t = T + 1
    for t in range(1, T+1):
        if Y[t] == 0:
            delta[t] = norm.rvs(0, 0.05)
            U[t] = min(1, max(0, U[t-1] + delta[t]))
            if t % k != 0:
                L[t] = L[t-1]
                A[t] = A[t-1]
            else:
                eps[t] = norm.rvs(100 * (U[t] - 2), 50)
                L[t] = max(0, L[t-1] + 150 * A[t-k] * (1-A[t-k-1]) + eps[t])
                if A[t-1] == 0:
                    A[t] = bernoulli.rvs(logit_1(theta[0] + theta[1] * t + theta[2] * (L[t] - 500)), size=1)
                else:
                    A[t] = 1
                if A[t] == 1 and A[t-k] == 0: 
                    Ts = t
            lam[t] = logit_1(gam[0] + gam[1] * ((1 - A[t]) * t + A[t] * Ts) + gam[2] * A[t] + gam[3] * A[t] *(t - Ts))
            if (1 - np.prod(1 - lam)) >= U[0]:
                Y[t + 1] = 1
            else:
                Y[t+1] = 0
        else:
            break
    
    #we only need the data before death, so whatever value t is before the end of the
    #above loop - change this to numpy array and transpose.
    Y = np.ndarray.tolist(Y[1:(t+1)])
    U = np.ndarray.tolist(U[0:t])
    L = np.ndarray.tolist(L[0:t])
    A = np.ndarray.tolist(A[0:t])
    Ts = [Ts]*t

    df = np.vstack((Y, L, U, A, Ts))
    df = pd.DataFrame(df.T, columns=['Y', 'L', 'U', 'A', 'Ts'])
    df['Y'] = df['Y'].astype(int)
    df['A'] = df['A'].astype(int)
    df['patid'] = patid
    df.index.name = 'visit'
    return df.reset_index()

In [21]:
# use sim function to make a pandas DF for n patients
import pandas as pd
n = 1000
frames = [sim(T, k, gam, theta, patid=i) for i in range(n)]
df = pd.concat(frames)

In [22]:
# make new variables for the logit regression
# including an intercept
df["d1"] = df[["visit", "Ts"]].min(axis = 1)
df["time_Ts"] = df["visit"] - df["Ts"]
df["d3"] = np.maximum(df["time_Ts"], 0)
df["intercept"] = 1.0
df = df.set_index(['patid', 'visit'])
df = df.sortlevel()

In [8]:
#df.loc[(2,slice(None)),:] # gets patient 3

In [23]:
# get the previous value of A, and set first value of A_1 per patient to zero.
df['A_1'] = df.groupby(level="patid")['A'].shift(1)
df['A_1'] = df['A_1'].fillna(0)

In [24]:
# logistic regression model to get parameters to calcualte weights
# for both the denominator (d) and the numerator (n)
df["As"] = df.groupby(level="patid")['A'].cumsum()
df2 = df[df["As"] <= 1].copy(deep=True)
# df2.groupby('A')['A_1'].value_counts()

#numerator
n_reg = ["intercept"]
n_logit = sm.Logit(df2['A'], df2[n_reg])
n_result = n_logit.fit(maxiter=100)
n_result.summary()

# denominator
d_reg = ["intercept", "L"]
d_logit = sm.Logit(df2['A'], df2[d_reg])
d_result = d_logit.fit(maxiter=100)
d_result.summary()

# numerator and denominator probabailities
df2["pn"] = n_result.predict(df2[n_reg])
df2["pd"] = d_result.predict(df2[d_reg])

# if A == 0, change probabilities to 1 - prob
df2['pn2'] = np.where(df2['A']==0, (1 - df2["pn"]), df2["pn"])
df2['pd2'] = np.where(df2['A']==0, (1 - df2["pd"]), df2["pd"])

# construct stabilized weights, don't forget to group by
df2['cpn'] = df2.groupby(df2.index.get_level_values(0))['pn2'].cumprod()
df2['cpd'] = df2.groupby(df2.index.get_level_values(0))['pd2'].cumprod()
df2['sw'] = df2['cpn']/df2['cpd']

#combine df and df2
df["sw"] = np.nan
df["sw2"] = np.nan
df.loc[df2.index, "sw"] = df2["sw"]
df["sw"] = df["sw"].fillna(method="pad")

# try terminal weight
def func(x):
    x["sw2"] = x.iloc[-1]["sw"]
    return x

df = df.groupby(level="patid").apply(func)
df.loc[(4,slice(None)),:]

KeyError: "['visit'] not in index"

In [11]:
# pushes variables from python to rpy2
%Rpush df
%R head(df)

Unnamed: 0,Y,L,U,A,Ts,d1,time_Ts,d3,intercept,A_1,As,sw,sw2
"(0, 0)",0,330.385845,0.378685,0,5.0,0.0,-5.0,0.0,1.0,0.0,0,1.128333,0.692056
"(0, 1)",0,330.385845,0.518255,0,5.0,1.0,-4.0,0.0,1.0,0.0,0,1.273135,0.692056
"(0, 2)",0,330.385845,0.514588,0,5.0,2.0,-3.0,0.0,1.0,0.0,0,1.43652,0.692056
"(0, 3)",0,330.385845,0.513617,0,5.0,3.0,-2.0,0.0,1.0,0.0,0,1.620872,0.692056
"(0, 4)",0,330.385845,0.498619,0,5.0,4.0,-1.0,0.0,1.0,0.0,0,1.828884,0.692056
"(0, 5)",0,204.66354,0.452771,1,5.0,5.0,0.0,0.0,1.0,0.0,1,0.692056,0.692056


In [12]:
%R mod = glm(Y ~ d1 + A + d3, data =df, family = binomial("logit"))
%R mod_sw = glm(Y ~ d1 + A + d3, weights = sw, data =df, family = binomial("logit"))
%R mod_sw2 = glm(Y ~ d1 + A + d3, weights = sw2, data =df, family = binomial("logit"))
%Rpull mod
%Rpull mod_sw
%Rpull mod_sw2

In [13]:
print(mod)
print(mod_sw)
print(mod_sw2)



Call:  glm(formula = Y ~ d1 + A + d3, family = binomial(logit), data = df)



Coefficients:

(Intercept)           d1            A           d3  

  -3.714803     0.004844    -0.312654     0.109301  



Degrees of Freedom: 19023 Total (i.e. Null);  19020 Residual

Null Deviance:	    7669 

Residual Deviance: 7036 	AIC: 7044



Call:  glm(formula = Y ~ d1 + A + d3, family = binomial(logit), data = df, 

    weights = sw)



Coefficients:

(Intercept)           d1            A           d3  

   -2.51496      0.06424     -2.32724      0.10165  



Degrees of Freedom: 19023 Total (i.e. Null);  19020 Residual

Null Deviance:	    8577 

Residual Deviance: 7858 	AIC: 7764



Call:  glm(formula = Y ~ d1 + A + d3, family = binomial(logit), data = df, 

    weights = sw2)



Coefficients:

(Intercept)           d1            A           d3  

   -2.70494      0.06885     -2.15977      0.10207  



Degrees of Freedom: 19023 Total (i.e. Null);  19020 Residual

Null D