In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import datetime
import scipy.stats as stats

#graphing
import matplotlib.pyplot as plt
#stats
import statsmodels.api as sm
from statsmodels.base.model import GenericLikelihoodModel
from sklearn.mixture import GaussianMixture 
#import testing
import sys
sys.path.append("../")
import selection_tests

In [2]:
# Read all_plans_c.dta, derive the enrollment columns used by the cells below,
# and keep only the rows with year > 2011.
data = (
    pd.read_stata("all_plans_c.dta")
    .assign(
        log_enroll=lambda plans: np.log(plans['enr_c']),
        enr_total=lambda plans: plans['enr_c'] + plans['enr_FFS'],
    )
    .loc[lambda plans: plans['year'] > 2011]
)

In [3]:
#https://www.statsmodels.org/dev/examples/notebooks/generated/glm_weights.html

def drop_data(data, y_name, x_name, absorb=None, min_obs=11):
    """Keep complete rows belonging to 'ssa' groups with enough observations.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified. It must contain the columns 'year'
        and 'ssa', which drive the group-size filter.
    y_name, x_name : list of str
        Column names of the dependent and explanatory variables.
    absorb : list of str, optional
        Extra columns (e.g. the fixed-effect identifiers) to keep in the
        result. ``None`` means no extra columns.
    min_obs : int, default 11
        Minimum number of non-null 'year' values an 'ssa' group must have,
        counted after incomplete rows are removed, for its rows to be kept.

    Returns
    -------
    pandas.DataFrame
        The selected columns, restricted to rows without missing values and
        to 'ssa' groups that pass the ``min_obs`` threshold.
    """
    absorb = [] if absorb is None else list(absorb)
    # dict.fromkeys de-duplicates while preserving order, so a column listed
    # twice (say in x_name and absorb) does not produce duplicate labels.
    columns = list(dict.fromkeys(list(y_name) + list(x_name) + absorb))

    subset = data[columns]
    is_complete = subset.notnull().all(axis=1)
    complete = subset[is_complete]

    # Read the panel identifiers from the input frame so the filter also works
    # when 'year'/'ssa' are not among the selected columns.
    year = data.loc[is_complete, 'year']
    ssa = data.loc[is_complete, 'ssa']
    obs_per_group = year.groupby(ssa).transform('count')
    return complete[obs_per_group >= min_obs]
    

def demean(y_name, x_name, data=None, absorb=None, cluster=None):
    """Subtract group means for each absorbed effect, then add the overall mean.

    For every column name in ``absorb`` the mean of the *original* variable
    within that group is subtracted; afterwards the overall column mean is
    added back once.

    Parameters
    ----------
    y_name, x_name : list of str (or a single column name)
        Columns of ``data`` holding the dependent / explanatory variables.
    data : pandas.DataFrame
        Frame containing the variables and the grouping columns.
    absorb : list of str, optional
        Grouping columns whose means are removed. With ``None`` or an empty
        list nothing is absorbed and unchanged copies are returned.
    cluster : unused
        Accepted for signature compatibility only.

    Returns
    -------
    (y_dot, X_dot)
        Transformed copies of ``data[y_name]`` and ``data[x_name]``.

    Raises
    ------
    TypeError
        If ``data`` is not supplied.
    """
    if data is None:
        raise TypeError("demean() requires a DataFrame passed as `data`")

    effects = [] if absorb is None else list(absorb)
    y, X = data[y_name], data[x_name]

    # Nothing to absorb: hand back the data as is rather than shifting it by
    # its own mean.
    if not effects:
        return y.copy(), X.copy()

    y_dot, X_dot = y.copy(), X.copy()
    for effect in effects:
        keys = data[effect]
        # Means are always taken from the untransformed y / X.
        y_dot = y_dot - y.groupby(keys).transform('mean')
        X_dot = X_dot - X.groupby(keys).transform('mean')

    return y_dot + y.mean(), X_dot + X.mean()




In [4]:
class GLS_LL(GenericLikelihoodModel):
    """GenericLikelihoodModel wrapper exposing per-observation log-likelihoods.

    The likelihood is evaluated entirely on the wrapped ``model`` object: its
    ``exog``, ``endog``, ``family``, ``var_weights``, ``_offset_exposure`` and
    ``estimate_scale`` are used. In this notebook ``model`` is an ``sm.GLM``
    instance.
    """

    def __init__(self, *args, model=None, **kwargs):
        """Forward ``*args``/``**kwargs`` to the parent and store ``model``."""
        # Zero-argument super() keeps working when the cell defining this
        # class is re-executed and the name GLS_LL is rebound.
        super().__init__(*args, **kwargs)
        self.model = model

    def loglikeobs(self, params, scale=None):
        """
        Evaluate the log-likelihood for a generalized linear model.

        Parameters
        ----------
        params : array_like
            Coefficient vector, multiplied with the wrapped model's ``exog``.
        scale : float, optional
            Scale passed to the family's ``loglike_obs``. When ``None`` it is
            obtained from ``model.estimate_scale`` at the implied mean.

        Returns
        -------
        The value returned by ``model.family.loglike_obs``.
        """
        # Import from the module that defines float_like instead of relying on
        # it being re-exported through sm.tsa.stattools.
        from statsmodels.tools.validation import float_like

        model = self.model
        scale = float_like(scale, "scale", optional=True)
        lin_pred = np.dot(model.exog, params) + model._offset_exposure
        expval = model.family.link.inverse(lin_pred)
        if scale is None:
            scale = model.estimate_scale(expval)
        llf = model.family.loglike_obs(model.endog, expval, model.var_weights,
                                  scale)
        return llf

    
def print_results(y_name, data, weights=True, random_state=0):
    """Fit the baseline and interaction GLMs and print both summaries as LaTeX.

    Steps:
      1. Demean ``y_name`` and the regressors by 'ssa' and 'year' via ``demean``.
      2. Drop rows with a missing value in ``y_name`` or any regressor.
      3. Fit a two-component Gaussian mixture on ('log_risk_pub_c', y) and use
         the predicted component label ``c`` (0/1) to build 'bmFFS*c'.
      4. Fit a GLM on the regressors (model 1) and on the regressors plus
         'bmFFS*c' (model 2) and print ``summary().as_latex()`` for each.

    Parameters
    ----------
    y_name : list of str
        Dependent-variable column, given as a one-element list.
    data : pandas.DataFrame
        Must contain ``y_name``, the regressors listed below, 'ssa', 'year'
        and, when ``weights`` is truthy, 'enr_c'.
    weights : bool, default True
        If truthy, ``data['enr_c']`` is passed to both GLMs as ``var_weights``.
    random_state : int or None, default 0
        Seed for the Gaussian mixture so that the component labels, and hence
        model 2, are reproducible. Pass ``None`` for an unseeded fit.

    Returns
    -------
    None
    """
    x_name = ['bmFFS','post_bmFFS', 'log_risk_pub_c','FFS_AB',"ins_parent_noSNPs",'log_risk_FFS','unemploy_rt','pc_income']

    # NOTE(review): demean() sees every row of `data`, including rows dropped
    # just below, so the group means are not computed on the estimation
    # sample. Confirm this is intended.
    y_dot, X_dot = demean(y_name, x_name, data=data, absorb=['ssa', 'year'])
    complete = data[y_name + x_name].notnull().all(axis=1)
    y_dot, X_dot = y_dot[complete], X_dot[complete]

    # Classify observations with a single, seeded mixture fit.
    data_stack = X_dot[['log_risk_pub_c']].copy()
    data_stack['y'] = y_dot
    classify = GaussianMixture(n_components=2, random_state=random_state).fit(data_stack)
    c = np.array(classify.predict(data_stack))

    # Model 2 adds the benchmark x component-label interaction.
    X_dot2 = X_dot.copy()
    X_dot2['bmFFS*c'] = X_dot['bmFFS']*c

    # Keep the boolean flag and the weight array under separate names.
    glm_kwargs = {}
    if weights:
        glm_kwargs['var_weights'] = np.array(data['enr_c'][complete])

    m1 = sm.GLM(y_dot, X_dot, **glm_kwargs)
    m2 = sm.GLM(y_dot, X_dot2, **glm_kwargs)

    m1_fit = m1.fit()
    print(m1_fit.summary().as_latex())
    m2_fit = m2.fit()
    print(m2_fit.summary().as_latex())

In [5]:
# FIXME: this setup is hacky and needs to be cleaned up.
def setup_test(yn, xn,
    y_name = ['log_enroll'],
    x_name = ['bmFFS','log_risk_pub_c','FFS_AB',"ins_parent_noSNPs",'log_risk_FFS','unemploy_rt','pc_income'],
    random_state=0):
    """Build the likelihood ingredients of the two competing models.

    Model 1 is an unweighted GLM of the demeaned ``y_name`` on the demeaned
    ``x_name``; model 2 adds 'bmFFS*c', where ``c`` is the component label of a
    two-component Gaussian mixture fitted on ('log_risk_pub_c', y).

    Parameters
    ----------
    yn : unused
        Not read by this function; kept so the callable can be handed to
        ``selection_tests.test_table`` with the (yn, xn) call shape.
    xn : pandas.DataFrame
        Frame holding ``y_name``, ``x_name``, 'ssa' and 'year'.
    y_name, x_name : list of str
        Dependent variable (one-element list) and regressors.
    random_state : int or None, default 0
        Seed for the Gaussian mixture so the labels are reproducible. Pass
        ``None`` for an unseeded fit.

    Returns
    -------
    tuple
        ``(ll1, grad1, hess1, params1, ll2, grad2, hess2, params2)``: the
        per-observation log-likelihoods, per-observation scores, Hessian and
        fitted parameters of model 1, followed by the same for model 2.
    """
    data = xn.copy()

    # NOTE(review): demean() sees every row, including rows dropped just
    # below, so the group means are not computed on the estimation sample.
    # Confirm this is intended.
    y_dot, X_dot = demean(y_name, x_name, data=data, absorb=['ssa', 'year'])
    complete = data[y_name + x_name].notnull().all(axis=1)
    y_dot, X_dot = y_dot[complete], X_dot[complete]

    # Classify observations with a single, seeded mixture fit.
    data_stack = X_dot[['log_risk_pub_c']].copy()
    data_stack['y'] = y_dot
    classify = GaussianMixture(n_components=2, random_state=random_state).fit(data_stack)
    c = np.array(classify.predict(data_stack))

    # Both fits are unweighted; no var_weights are passed to the GLMs.
    # model 1
    m1 = sm.GLM(y_dot, X_dot)
    m1_fit = m1.fit()

    # model 2: adds the benchmark x component-label interaction
    X_dot2 = X_dot.copy()
    X_dot2['bmFFS*c'] = X_dot['bmFFS']*c
    m2 = sm.GLM(y_dot, X_dot2)
    m2_fit = m2.fit()

    model1 = GLS_LL(y_dot, X_dot, model=m1)
    params1 = m1_fit.params
    ll1 = model1.loglikeobs(params1)
    grad1 = model1.score_obs(params1)
    hess1 = model1.hessian(params1)

    # The wrapper must carry model 2's own design matrix so that its exog
    # matches the length of params2.
    model2 = GLS_LL(y_dot, X_dot2, model=m2)
    params2 = m2_fit.params
    ll2 = model2.loglikeobs(params2)
    grad2 = model2.score_obs(params2)
    hess2 = model2.hessian(params2)

    return  ll1, grad1, hess1, params1, ll2, grad2, hess2, params2


# test_table prints the LaTeX table itself and returns None, so wrapping it in
# print() only appended a stray "None" to the output.
selection_tests.test_table(data['log_enroll'], data, setup_test)

\begin{center}
\begin{tabular}{ccccc}
\toprule
\textbf{Version} & \textbf{Result} & \textbf{90 \% CI} & \textbf{95 \% CI} & \textbf{99 \% CI} \\ \midrule
Shi (2015) & H0 & [-4.377, 0.471] & [-5.518, 1.608] & [-20.893, 16.878] \\
Classical & H2 & [-3.731, -0.441] & [-4.045, -0.127] & [-4.662, 0.490] \\
Bootstrap & H0 & [-3.810, -0.277] & [-4.175, -0.027] & [-5.005, 0.467] \\
\bottomrule
\end{tabular}
\end{center}
None


In [6]:
# Bid regressions, enrollment-weighted.
# NOTE(review): `data` is already restricted to year > 2011 where it is loaded,
# so this `>= 2006` filter keeps every row. Confirm which window is intended.
print_results(
    y_name=['bid_pub_c'],
    data=data.loc[data['year'] >= 2006],
    weights=True,
)

\begin{center}
\begin{tabular}{lclc}
\toprule
\textbf{Dep. Variable:}      &   bid\_pub\_c    & \textbf{  No. Observations:  } &    16126    \\
\textbf{Model:}              &       GLM        & \textbf{  Df Residuals:      } &    16119    \\
\textbf{Model Family:}       &     Gaussian     & \textbf{  Df Model:          } &        6    \\
\textbf{Link Function:}      &     identity     & \textbf{  Scale:             } & 7.0887e+05  \\
\textbf{Method:}             &       IRLS       & \textbf{  Log-Likelihood:    } &   -80425.   \\
\textbf{Date:}               & Thu, 15 Dec 2022 & \textbf{  Deviance:          } & 1.1426e+10  \\
\textbf{Time:}               &     18:20:42     & \textbf{  Pearson chi2:      } &  1.14e+10   \\
\textbf{No. Iterations:}     &        3         & \textbf{                     } &             \\
\bottomrule
\end{tabular}
\begin{tabular}{lcccccc}
                             & \textbf{coef} & \textbf{std err} & \textbf{z} & \textbf{P$> |$z$|$} & \textbf{[0.025} & 

# enrollment data

In [7]:
# Enrollment regressions; `weights` is left at its default.
print_results(y_name=['log_enroll'], data=data)

\begin{center}
\begin{tabular}{lclc}
\toprule
\textbf{Dep. Variable:}      &   log\_enroll    & \textbf{  No. Observations:  } &    16126    \\
\textbf{Model:}              &       GLM        & \textbf{  Df Residuals:      } &    16119    \\
\textbf{Model Family:}       &     Gaussian     & \textbf{  Df Model:          } &        6    \\
\textbf{Link Function:}      &     identity     & \textbf{  Scale:             } &    46.261   \\
\textbf{Method:}             &       IRLS       & \textbf{  Log-Likelihood:    } &   -2720.4   \\
\textbf{Date:}               & Thu, 15 Dec 2022 & \textbf{  Deviance:          } & 7.4569e+05  \\
\textbf{Time:}               &     18:20:42     & \textbf{  Pearson chi2:      } &  7.46e+05   \\
\textbf{No. Iterations:}     &        3         & \textbf{                     } &             \\
\bottomrule
\end{tabular}
\begin{tabular}{lcccccc}
                             & \textbf{coef} & \textbf{std err} & \textbf{z} & \textbf{P$> |$z$|$} & \textbf{[0.025} & 

In [8]:
def setup_test2(yn, xn):
    """Call ``setup_test`` with 'log_enroll' as the dependent variable."""
    return setup_test(yn, xn, y_name=['log_enroll'])


# test_table prints the LaTeX table itself and returns None, so wrapping it in
# print() only appended a stray "None" to the output.
selection_tests.test_table(data['log_enroll'], data, setup_test2)

\begin{center}
\begin{tabular}{ccccc}
\toprule
\textbf{Version} & \textbf{Result} & \textbf{90 \% CI} & \textbf{95 \% CI} & \textbf{99 \% CI} \\ \midrule
Shi (2015) & H0 & [-4.232, 0.322] & [-5.657, 1.746] & [-16.374, 12.468] \\
Classical & H2 & [-3.731, -0.441] & [-4.045, -0.127] & [-4.662, 0.490] \\
Bootstrap & H0 & [-3.873, -0.310] & [-4.173, -0.081] & [-4.762, 0.567] \\
\bottomrule
\end{tabular}
\end{center}
None
