In [1]:
# import modules for math and distributions
from math import exp
import numpy as np
from scipy.stats import gamma, norm, uniform, bernoulli
import pandas as pd
import statsmodels.api as sm
import rpy2
import patsy

In [2]:
# load and try out R magic
%load_ext rpy2.ipython
%R X=c(1,4,5,7); sd(X); mean(X)

array([ 4.25])

In [3]:
# define inverse logit function
def logit_1(x): return(exp(x)/(1 + exp(x)))

In [4]:
# Initial set up according to HD2012
T = 40 # time periods
k = 5 # check-up times
theta = [-0.405, 0.0205, -0.00405]
gam = [-3, 0.05, -1.5, 0.1]

In [5]:
# make a function that does all of the above for an individual
def sim(T, k, gam, theta, patid=0):
    
    # define lists for holding A, L, U and Y
    A = np.zeros(T + 2) # A[-1] (last value) holds A in t = -1
    L = np.zeros(T+1)
    U = np.zeros(T+1)
    Y = np.zeros(T + 2)
    eps = np.zeros(T+1)
    lam = np.zeros(T+1) # prob of failure at each time period
    delta = np.zeros(T+1)

    # set the first value of U, U[0], to a 
    # randomly generated value from a uniform
    # distribution a measure of general health
    U[0] = uniform.rvs()
    eps[0] = norm.rvs(0, 20)
    L[0] = gamma.ppf(U[0], 3, scale=154) + eps[0]

    # set A[-1] to 0: held in last value of A
    A[-1] = 0
    
    # set A[0]
    A[0] = bernoulli.rvs(logit_1(theta[0] + theta[2] * (L[0] - 500)), size=1)
    
    if A[0] == 1:
        Ts = 0 
    else:
        Ts = -1
    
    lam[0] = logit_1(gam[0] + gam[2] * A[0])
    
    if lam[0] >= U[0]:
        Y[1] = 1
    else:
        Y[1] = 0
    # loop through each time period - stop when patient is dead or t = T + 1
    for t in range(1, T+1):
        if Y[t] == 0:
            delta[t] = norm.rvs(0, 0.05)
            U[t] = min(1, max(0, U[t-1] + delta[t]))
            if t % k != 0:
                L[t] = L[t-1]
                A[t] = A[t-1]
            else:
                eps[t] = norm.rvs(100 * (U[t] - 2), 50)
                L[t] = max(0, L[t-1] + 150 * A[t-k] * (1-A[t-k-1]) + eps[t])
                if A[t-1] == 0:
                    A[t] = bernoulli.rvs(logit_1(theta[0] + theta[1] * t + theta[2] * (L[t] - 500)), size=1)
                else:
                    A[t] = 1
                if A[t] == 1 and A[t-k] == 0: 
                    Ts = t
            ########################################
            # This is a check for debugging purposes
            # Comment it before the next push
            if Ts == -1:
                if A[t]:
                    print('There is an error...')
            ########################################
            lam[t] = logit_1(gam[0] + gam[1] * ((1 - A[t]) * t + A[t] * Ts) + gam[2] * A[t] + gam[3] * A[t] *(t - Ts))
            if (1 - np.prod(1 - lam)) >= U[0]:
                Y[t + 1] = 1
            else:
                Y[t+1] = 0
        else:
            break
    
    #we only need the data before death, so whatever value t is before the end of the
    #above loop - change this to numpy array and transpose.
    Y = np.ndarray.tolist(Y[1:(t+1)])
    U = np.ndarray.tolist(U[0:t])
    L = np.ndarray.tolist(L[0:t])
    A = np.ndarray.tolist(A[0:t])
    Ts = [Ts]*t

    df = np.vstack((Y, L, U, A, Ts))
    df = pd.DataFrame(df.T, columns=['Y', 'L', 'U', 'A', 'Ts'])
    df['Y'] = df['Y'].astype(int)
    df['A'] = df['A'].astype(int)
    df['patid'] = patid
    df.index.name = 'visit'
    return df.reset_index()

In [6]:
# use sim function to make a pandas DF for n patients
n = 5000
frames = [sim(T, k, gam, theta, patid=i) for i in range(n)]
df = pd.concat(frames)

In [7]:
# make new variables for the logit regression

##############################################
# The old code uses a mathematical simplification
# that holds only if Ts has a defined value (!= -1)
##############################################
# df["d1"] = df[["visit", "Ts"]].min(axis = 1)
# df["time_Ts"] = df["visit"] - df["Ts"]
# df["d3"] = np.maximum(df["time_Ts"], 0)
##############################################
# The following is always valid
# because when Ts == -1 then A == 0
df["d1"] = (1-df['A'])*df['visit'] + df['A']*df['Ts']
df["d3"] = df['A']*(df['visit']-df['Ts'])
##############################################

#####################################
# This is not needed if you use patsy
#####################################
# including an intercept
#df["intercept"] = 1.0
#####################################

df = df.set_index(['patid', 'visit'])
df = df.sort_index()

In [8]:
#df.loc[(2,slice(None)),:] # gets patient 3

In [9]:
# get the previous value of A, and set first value of A_1 per patient to zero.
df['A_1'] = df.groupby(level="patid")['A'].shift(1)
df['A_1'] = df['A_1'].fillna(0)

In [11]:
f = "Y ~ d1 + A + d3"
y, X = patsy.dmatrices(f, df.reset_index(), return_type = "dataframe")
n_logit = sm.Logit(y, X, missing="raise")
n_result = n_logit.fit(maxiter=300)
n_result.summary()

Optimization terminated successfully.
         Current function value: 0.181577
         Iterations 8


0,1,2,3
Dep. Variable:,Y,No. Observations:,96935.0
Model:,Logit,Df Residuals:,96931.0
Method:,MLE,Df Model:,3.0
Date:,"Wed, 12 Jul 2017",Pseudo R-squ.:,0.08648
Time:,15:58:02,Log-Likelihood:,-17601.0
converged:,True,LL-Null:,-19267.0
,,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-3.6742,0.044,-83.620,0.000,-3.760,-3.588
d1,0.0074,0.003,2.785,0.005,0.002,0.013
A,-0.5273,0.054,-9.820,0.000,-0.633,-0.422
d3,0.1137,0.002,52.561,0.000,0.109,0.118


In [12]:
# logistic regression model to get parameters to calcualte weights
# for both the denominator (d) and the numerator (n)
df["As"] = df.groupby(level="patid")['A'].cumsum()
df2 = df[df["As"] <= 1].copy(deep=True)
df2 = df2.reset_index()
# df2.groupby('A')['A_1'].value_counts()

#numerator
fn = "Y ~ C(visit)"
y, X = patsy.dmatrices(fn, df2, return_type = "dataframe")
# n_reg = ["intercept"]
# n_logit = sm.Logit(df2['A'], df2[n_reg])
n_logit = sm.Logit(y, X, missing="raise")
n_result = n_logit.fit(maxiter=300)
n_result.summary()


# # denominator
# d_reg = ["intercept", "L"]
# d_logit = sm.Logit(df2['A'], df2[d_reg])
# d_result = d_logit.fit(maxiter=100)
# d_result.summary()

# # numerator and denominator probabailities
# df2["pn"] = n_result.predict(df2[n_reg])
# df2["pd"] = d_result.predict(df2[d_reg])

# # if A == 0, change probabilities to 1 - prob
# df2['pn2'] = np.where(df2['A']==0, (1 - df2["pn"]), df2["pn"])
# df2['pd2'] = np.where(df2['A']==0, (1 - df2["pd"]), df2["pd"])

# # construct stabilized weights, don't forget to group by
# df2['cpn'] = df2.groupby(df2.index.get_level_values(0))['pn2'].cumprod()
# df2['cpd'] = df2.groupby(df2.index.get_level_values(0))['pd2'].cumprod()
# df2['sw'] = df2['cpn']/df2['cpd']

# #combine df and df2
# df["sw"] = np.nan
# df["sw2"] = np.nan
# df.loc[df2.index, "sw"] = df2["sw"]
# df["sw"] = df["sw"].fillna(method="pad")

# # try terminal weight
# def func(x):
#     x["sw2"] = x.iloc[-1]["sw"]
#     return x

# df = df.groupby(level="patid").apply(func)
# df.loc[(4,slice(None)),:]

         Current function value: 0.113021
         Iterations: 300




0,1,2,3
Dep. Variable:,Y,No. Observations:,29239.0
Model:,Logit,Df Residuals:,29199.0
Method:,MLE,Df Model:,39.0
Date:,"Wed, 12 Jul 2017",Pseudo R-squ.:,0.01418
Time:,16:03:35,Log-Likelihood:,-3304.6
converged:,False,LL-Null:,-3352.2
,,LLR p-value:,1.385e-06

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-3.8918,0.101,-38.527,0.000,-4.090,-3.694
C(visit)[T.1],0.4547,0.151,3.001,0.003,0.158,0.752
C(visit)[T.2],0.2448,0.162,1.511,0.131,-0.073,0.562
C(visit)[T.3],0.4749,0.153,3.102,0.002,0.175,0.775
C(visit)[T.4],0.4539,0.155,2.921,0.003,0.149,0.759
C(visit)[T.5],-0.5807,0.222,-2.621,0.009,-1.015,-0.146
C(visit)[T.6],0.1322,0.213,0.620,0.535,-0.286,0.550
C(visit)[T.7],0.1199,0.216,0.555,0.579,-0.304,0.544
C(visit)[T.8],0.1791,0.213,0.839,0.401,-0.239,0.597


In [13]:
# The problem of the regression is that at certain cycles you have too few 
df3 = df2.set_index(['patid', 'visit'])
df3 = df3.sort_index()
for i in [33, 36, 37]:
    tmp = df3.loc[(slice(None, i)), 'Y']
    print('Cycle {:d}: {:.3f}% events'.format(i, sum(tmp)/len(tmp)))

Cycle 33: 0.037% events
Cycle 36: 0.032% events
Cycle 37: 0.035% events


In [None]:
# pushes variables from python to rpy2
%Rpush df
%R head(df)

In [None]:
%R mod = glm(Y ~ d1 + A + d3, data =df, family = binomial("logit"))
%R mod_sw = glm(Y ~ d1 + A + d3, weights = sw, data =df, family = binomial("logit"))
%R mod_sw2 = glm(Y ~ d1 + A + d3, weights = sw2, data =df, family = binomial("logit"))
%Rpull mod
%Rpull mod_sw
%Rpull mod_sw2

In [None]:
print(mod)
print(mod_sw)
print(mod_sw2)

In [None]:
df.reset.index
patsy - formulas like in R