In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import datetime
import scipy.stats as stats

#graphing
import matplotlib.pyplot as plt
#stats
import statsmodels.api as sm
from statsmodels.base.model import GenericLikelihoodModel
from sklearn.mixture import GaussianMixture 
#import testing
import sys
sys.path.append("../")
import selection_tests

import itertools

In [2]:
# Read the Stata file and add the derived columns used by the cells below:
#   log_enroll    = log(enr_c)
#   enr_total     = enr_c + enr_FFS
#   penetration   = enr_c / enr_total
#   bmFFS_ns_diff = bm_ns - bmFFS
data = pd.read_stata("all_plans_c_bonus.dta").assign(
    log_enroll=lambda d: np.log(d['enr_c']),
    enr_total=lambda d: d['enr_c'] + d['enr_FFS'],
    penetration=lambda d: d['enr_c'] / d['enr_total'],
    bmFFS_ns_diff=lambda d: d['bm_ns'] - d['bmFFS'],
)
# Optional sample restriction, currently disabled:
#data = data[ (data['year'] >= 2006) & (data['year'] <= 2016) ]

# Sanity check: first year present in the data.
print(data['year'].min())

# Optional export, currently disabled:
#data.to_stata('all_plans_c_stata.dta')

# 'treatment' is the maximum of double_bonus within each (ssa, state, county)
# group, merged back onto every row of that group.
county_keys = ['ssa', 'state', 'county']
treat = (
    data[county_keys + ['double_bonus']]
    .groupby(county_keys, as_index=False)
    .max()
    .rename(columns={'double_bonus': 'treatment'})
)
data = data.merge(treat, on=county_keys, how='left')

2006


In [3]:
# Rows before 2012, split by the group-level treatment flag.
pre_period = data['year'] < 2012
no_bonus_pre = data[(data['treatment'] == 0) & pre_period]
bonus_pre = data[(data['treatment'] == 1) & pre_period]

# Unweighted mean penetration: untreated first, then treated.
for group in (no_bonus_pre, bonus_pre):
    print(group['penetration'].mean())

print('------------------')
print('------------------')

# Mean penetration weighted by total enrollment (enr_total), same order.
for group in (no_bonus_pre, bonus_pre):
    print((group['penetration'] * group['enr_total']).sum() / group['enr_total'].sum())

print('------------------')
print('------------------')

0.09675879776477814
0.2912343442440033
------------------
------------------
0.15043731
0.29852337
------------------
------------------


In [4]:
# Variables reported in the summary-statistics table (also the default row
# order used by join_print).
summary_xs1 = [
    'double_bonus', 'bm_ns', 'bmFFS_ns_diff', 'star_C2', 'ins_parent_noSNPs',
    'FFS_AB', 'unemploy_rt', 'pc_income', 'log_risk_FFS', 'log_risk_pub_c',
    'risk_FFS', 'risk_pub_c', 'rebate_pub_c',
]


def join_print(t1, t2, summary_xs=summary_xs1):
    """Print two aligned columns of statistics as LaTeX table rows.

    Parameters
    ----------
    t1, t2 : values assigned as the two columns of a frame indexed by
        ``summary_xs`` (e.g. Series of means and standard deviations).
    summary_xs : list of row labels, in the order they are printed.

    Each printed row is the bolded label (underscores escaped) followed by
    both values formatted to four decimals and a LaTeX line break.
    Nothing is returned.
    """
    table = pd.DataFrame(index=summary_xs)
    table['1'] = t1
    table['2'] = t2

    # itertuples yields (index label, value 1, value 2) for every row.
    for label, *values in table.itertuples():
        cells = ''.join(' & %.4f ' % value for value in values)
        print('\\textbf{%s}' % label.replace('_', '\\_') + cells + '\\\\')

# Mean and standard deviation of every summary variable, as LaTeX rows.
summary_frame = data[summary_xs1]
join_print(summary_frame.mean(), summary_frame.std())

\textbf{double\_bonus} & 0.0339  & 0.1809 \\
\textbf{bm\_ns} & 818.4683  & 76.3584 \\
\textbf{bmFFS\_ns\_diff} & -11.2980  & 16.9155 \\
\textbf{star\_C2} & 7.6048  & 13.5177 \\
\textbf{ins\_parent\_noSNPs} & 2.9230  & 1.7621 \\
\textbf{FFS\_AB} & 714.8803  & 107.1201 \\
\textbf{unemploy\_rt} & 6.8141  & 2.9020 \\
\textbf{pc\_income} & 36.2201  & 10.2391 \\
\textbf{log\_risk\_FFS} & -4.0951  & 7.4628 \\
\textbf{log\_risk\_pub\_c} & -9.2093  & 12.9338 \\
\textbf{risk\_FFS} & 0.9625  & 0.0711 \\
\textbf{risk\_pub\_c} & 0.9196  & 0.1184 \\
\textbf{rebate\_pub\_c} & 48.7655  & 33.0495 \\


In [5]:
#https://www.statsmodels.org/dev/examples/notebooks/generated/glm_weights.html

def drop_data(data, y_name, x_name, absorb=None, min_obs=11):
    """Keep complete rows belonging to units observed often enough.

    Parameters
    ----------
    data : DataFrame containing the requested columns plus 'ssa' and 'year'.
    y_name, x_name : lists of column names (outcome(s) and regressors).
    absorb : optional list of additional columns to keep (e.g. the fixed
        effect identifiers). ``None`` means no extra columns; previously the
        default raised a TypeError when concatenated with the other lists.
    min_obs : minimum number of complete rows a given 'ssa' must have to be
        retained. Defaults to 11, the value that used to be hard-coded.

    Returns
    -------
    A new DataFrame with columns ``y_name + x_name + absorb``, restricted to
    rows without missing values in those columns and to 'ssa' groups with at
    least ``min_obs`` such rows. The input frame is not modified.
    """
    absorb = [] if absorb is None else list(absorb)
    subset = data[y_name + x_name + absorb]

    # A row is usable only if every selected column is present.
    complete = subset.notnull().all(axis=1)

    # Count rows per unit on the original frame so the filter works even when
    # 'ssa' / 'year' are not among the selected columns.
    retained = data[complete]
    obs_per_unit = retained['year'].groupby(retained['ssa']).transform('count')
    enough_obs = (obs_per_unit >= min_obs).to_numpy()

    return subset[complete][enough_obs].copy()
    

def demean(y_name, x_name, data=None, absorb=None, cluster=None):
    """Sweep group means out of the outcome and regressor columns.

    Parameters
    ----------
    y_name, x_name : column selectors passed to ``data[...]``.
    data : DataFrame holding those columns and the ``absorb`` columns.
    absorb : iterable of column names; for each one the group-wise mean of the
        original column is subtracted. Must be iterable (the ``None`` default
        is not handled).
    cluster : accepted but not used by this function.

    Returns
    -------
    (y_dot, X_dot): the selected columns minus the group means of every
    ``absorb`` variable, plus the overall column mean.
    """

    def _sweep(frame):
        # Group means are always taken from the untouched input columns,
        # not from the running residual.
        swept = frame.copy()
        for effect in absorb:
            swept = swept - frame.groupby(data[effect]).transform('mean')
        # Add the overall mean back once, regardless of the number of effects.
        return swept + frame.mean()

    return _sweep(data[y_name]), _sweep(data[x_name])

In [6]:
# Regressors common to both specifications.
_shared_x = ['bmFFS_ns_diff', 'star_C2', 'log_risk_pub_c',
             'FFS_AB', "ins_parent_noSNPs", 'log_risk_FFS', 'unemploy_rt', 'pc_income']

# Specification 1 leads with double_bonus, specification 2 with bm_ns.
model1_x = ['double_bonus'] + _shared_x
model2_x = ['bm_ns'] + _shared_x

model_xs = [model1_x, model2_x]



def setup_data(y_name, model_xs, data):
    """Build the demeaned outcome and regressor matrices shared by all models.

    Parameters
    ----------
    y_name : list with the outcome column name(s).
    model_xs : list of regressor-name lists, one per model.
    data : DataFrame with the outcome, regressors, 'ssa' and 'year'.

    Returns
    -------
    (y_dot, X_dot, x_name, missing_vals) where ``x_name`` is the union of all
    model regressors in ``data`` column order, ``y_dot`` / ``X_dot`` are the
    demeaned columns restricted to complete rows, and ``missing_vals`` is a
    boolean Series that is True for rows with NO missing value (despite its
    name).
    """
    # Union of every regressor that appears in any model.
    requested = set().union(*model_xs)

    # Keep only names that exist in the data, in the data's column order.
    x_name = [col for col in data.columns if col in requested]
    print(x_name)

    # NOTE(review): the demeaning below runs on all rows, and rows with missing
    # values are only dropped afterwards, so the subtracted means are computed
    # on a different sample than the one used for estimation -- confirm this
    # is intended.
    y_dot, X_dot = demean(y_name, x_name, data=data, absorb=['ssa', 'year'])
    missing_vals = ~data[y_name + x_name].isnull().any(axis=1)

    return y_dot[missing_vals], X_dot[missing_vals], x_name, missing_vals



def return_results(y_name, model_xs, data, weights=True):
    """Fit one GLM per specification and collect coefficients and std. errors.

    Parameters
    ----------
    y_name : list with the outcome column name(s).
    model_xs : list of regressor-name lists, one per model.
    data : DataFrame passed on to ``setup_data``; must also hold 'enr_total'
        when ``weights`` is True.
    weights : if True, ``enr_total`` (on the retained rows) is passed to the
        GLM as ``var_weights``; if False the models are fitted unweighted.
        Previously ``weights=False`` fitted nothing and raised an IndexError
        while building the table.

    Returns
    -------
    DataFrame indexed by the union of regressors with columns
    'params i' / 'se i' for model i (1-based). Regressors absent from a model
    are NaN in that model's columns.
    """
    y_dot, X_dot, x_name, missing_vals = setup_data(y_name, model_xs, data)

    # The weights are the same for every specification, so build them once.
    glm_kwargs = {}
    if weights:
        glm_kwargs['var_weights'] = np.array(data['enr_total'][missing_vals])

    table = pd.DataFrame(index=x_name)
    for number, model_x in enumerate(model_xs, start=1):
        model_fit = sm.GLM(y_dot, X_dot[model_x], **glm_kwargs).fit()
        # Assignment aligns on the regressor names in the table index.
        table['params %i' % number] = model_fit.params
        table['se %i' % number] = model_fit.bse
    return table
    

# Fit every specification with penetration as the outcome (enr_total-weighted
# by default) and show the coefficient table.
table = return_results(y_name=['penetration'], model_xs=model_xs, data=data)
print(table)

['ins_parent_noSNPs', 'star_C2', 'bm_ns', 'FFS_AB', 'unemploy_rt', 'pc_income', 'log_risk_FFS', 'log_risk_pub_c', 'double_bonus', 'bmFFS_ns_diff']
                   params 1      se 1  params 2      se 2
ins_parent_noSNPs  0.001379  0.000220  0.001093  0.000222
star_C2           -0.000048  0.000030 -0.000052  0.000030
bm_ns                   NaN       NaN -0.000067  0.000007
FFS_AB            -0.000080  0.000006 -0.000057  0.000007
unemploy_rt       -0.004876  0.000299 -0.004996  0.000299
pc_income         -0.000567  0.000081 -0.000854  0.000084
log_risk_FFS       0.000847  0.000122  0.000847  0.000122
log_risk_pub_c     0.000293  0.000050  0.000340  0.000050
double_bonus       0.005642  0.001024       NaN       NaN
bmFFS_ns_diff     -0.000287  0.000020 -0.000281  0.000019


In [7]:
def table_to_latex(table, na_rep='nan'):
    """Print a coefficient / standard-error table as a LaTeX tabular.

    Parameters
    ----------
    table : DataFrame whose columns come in (coefficient, standard error)
        pairs, one pair per model; the index supplies the row labels.
    na_rep : text printed for missing cells. The default ``'nan'`` reproduces
        the previous output; pass e.g. ``''`` or ``'--'`` for cleaner tables.

    Row labels are converted with ``str`` before underscores are escaped, so
    non-string index labels no longer raise. Nothing is returned.
    """
    n_models = len(table.columns) // 2

    # Column spec: one label column, then a 'cc' pair per model separated by
    # vertical rules (at least one pair is always emitted).
    print('\\begin{tabular}{l' + 'cc|' * max(n_models - 1, 0) + 'cc}')
    print('\\toprule')
    header = ''.join(
        '& \\textbf{coef %s} & \\textbf{se %s}' % (model, model)
        for model in range(1, n_models + 1)
    )
    print('{}' + header + '\\\\')
    print('\\midrule')

    # itertuples yields (index label, value, value, ...) for every row.
    for label, *values in table.itertuples():
        cells = ''.join(
            ' & %s ' % (na_rep if pd.isnull(value) else '%.4f' % value)
            for value in values
        )
        print('\\textbf{%s}' % str(label).replace('_', '\\_') + cells + '\\\\')
    print('\\bottomrule')
    print('\\end{tabular}')
    
# Emit the regression table above as LaTeX.
table_to_latex(table=table)

\begin{tabular}{lcc|cc}
\toprule
{}& \textbf{coef 1} & \textbf{se 1}& \textbf{coef 2} & \textbf{se 2}\\
\midrule
\textbf{ins\_parent\_noSNPs} & 0.0014  & 0.0002  & 0.0011  & 0.0002 \\
\textbf{star\_C2} & -0.0000  & 0.0000  & -0.0001  & 0.0000 \\
\textbf{bm\_ns} & nan  & nan  & -0.0001  & 0.0000 \\
\textbf{FFS\_AB} & -0.0001  & 0.0000  & -0.0001  & 0.0000 \\
\textbf{unemploy\_rt} & -0.0049  & 0.0003  & -0.0050  & 0.0003 \\
\textbf{pc\_income} & -0.0006  & 0.0001  & -0.0009  & 0.0001 \\
\textbf{log\_risk\_FFS} & 0.0008  & 0.0001  & 0.0008  & 0.0001 \\
\textbf{log\_risk\_pub\_c} & 0.0003  & 0.0001  & 0.0003  & 0.0001 \\
\textbf{double\_bonus} & 0.0056  & 0.0010  & nan  & nan \\
\textbf{bmFFS\_ns\_diff} & -0.0003  & 0.0000  & -0.0003  & 0.0000 \\
\bottomrule
\end{tabular}


In [8]:
%load_ext autoreload
%autoreload 2
import selection_tests


class GLS_LL(GenericLikelihoodModel):
    """GenericLikelihoodModel whose per-observation log-likelihood is
    delegated to a GLM object supplied through the ``model`` keyword.

    The wrapper is used below to obtain ``score_obs`` and ``hessian`` for a
    fitted GLM via the GenericLikelihoodModel interface.
    """

    def __init__(self, *args, model=None, **kwargs):
        # Positional / remaining keyword arguments go to the parent class
        # unchanged; the GLM is stored afterwards.
        super().__init__(*args, **kwargs)
        self.model = model

    def loglikeobs(self, params, scale=None):
        """
        Return the log-likelihood of each observation at ``params``.

        When ``scale`` is None it is estimated from the wrapped GLM at the
        fitted mean implied by ``params``.
        """
        glm = self.model
        # NOTE(review): float_like is reached through sm.tsa.stattools here;
        # presumably it lives in statsmodels.tools.validation -- confirm.
        scale = sm.tsa.stattools.float_like(scale, "scale", optional=True)

        # Linear predictor (including the GLM's offset/exposure term), mapped
        # to the mean through the inverse link.
        linear_predictor = np.dot(glm.exog, params) + glm._offset_exposure
        fitted_mean = glm.family.link.inverse(linear_predictor)

        if scale is None:
            scale = glm.estimate_scale(fitted_mean)

        return glm.family.loglike_obs(glm.endog, fitted_mean,
                                      glm.var_weights, scale)



# TODO: flagged for cleanup by the original author.
def setup_test(y_dot,X_dot,
    model1_cov = [],
    model2_cov = []):
    """Fit two competing GLMs and return the pieces a selection test needs.

    Parameters
    ----------
    y_dot, X_dot : outcome and regressor frames (as produced by setup_data).
    model1_cov, model2_cov : regressor-name lists selecting each model's
        columns from ``X_dot``.

    Returns
    -------
    An 8-tuple ``(ll1, grad1, hess1, params1, ll2, grad2, hess2, params2)``:
    per-observation log-likelihoods, per-observation scores, Hessian and
    fitted parameters for model 1, then the same for model 2.
    """

    def _likelihood_pieces(covariates):
        # Unweighted GLM fit (the var_weights option is not used here).
        glm = sm.GLM(y_dot, X_dot[covariates])
        estimates = glm.fit().params

        # Wrap the GLM so scores and the Hessian come from GLS_LL.
        wrapped = GLS_LL(y_dot, X_dot[covariates], model=glm)
        return (
            wrapped.loglikeobs(estimates),
            wrapped.score_obs(estimates),
            wrapped.hessian(estimates),
            estimates,
        )

    return _likelihood_pieces(model1_cov) + _likelihood_pieces(model2_cov)



def pairwise_tests(y_name, model_xs, data):
    """Run the model-selection test for every pair of specifications.

    Parameters
    ----------
    y_name : list with the outcome column name(s).
    model_xs : list of regressor-name lists, one per model.
    data : DataFrame passed on to ``setup_data``.

    Returns
    -------
    A list with one entry per model pair:
    ``[(label_a, label_b), test_stat, res1, res2, res3]`` where the labels are
    'll1', 'll2', ... in ``model_xs`` order and the remaining values are
    whatever ``selection_tests.test_results`` returns.

    For each pair the summary table from ``selection_tests.test_table`` is
    also shown.
    """
    y_dot, X_dot, x_name, missing_vals = setup_data(y_name, model_xs, data)

    labels = ['ll' + str(i + 1) for i in range(len(model_xs))]

    res = []
    # combinations() yields pairs in the same order for both sequences, so
    # zipping keeps each covariate pair with its label pair.
    for (cov_a, cov_b), label_combo in zip(itertools.combinations(model_xs, 2),
                                           itertools.combinations(labels, 2)):

        def setup_test_i(yn, xn):
            return setup_test(yn, xn, model1_cov=cov_a, model2_cov=cov_b)

        test_stat, res1, res2, res3 = selection_tests.test_results(y_dot, X_dot, setup_test_i)

        # In the recorded run test_table printed the table itself and returned
        # None, so printing its return value added a stray "None" line. Only
        # print when something is actually returned.
        table_text = selection_tests.test_table(y_dot, X_dot, setup_test_i)
        if table_text is not None:
            print(table_text)

        res.append([label_combo, test_stat, res1, res2, res3])
    return res


# Compare the specifications pairwise with log enrollment as the outcome.
res = pairwise_tests(y_name=['log_enroll'], model_xs=model_xs, data=data)
print(res)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
['ins_parent_noSNPs', 'star_C2', 'bm_ns', 'FFS_AB', 'unemploy_rt', 'pc_income', 'log_risk_FFS', 'log_risk_pub_c', 'double_bonus', 'bmFFS_ns_diff']
\begin{center}
\begin{tabular}{ccccc}
\toprule
\textbf{Version} & \textbf{Result} & \textbf{90 \% CI} & \textbf{95 \% CI} & \textbf{99 \% CI} \\ \midrule
Shi (2015) & H0 & [-4.499, 0.274] & [-6.815, 2.589] & [-12.845, 8.620] \\
Classical & H2 & [-3.828, -0.538] & [-4.142, -0.224] & [-4.759, 0.393] \\
Bootstrap & H0 & [-3.832, -0.365] & [-4.234, -0.070] & [-5.068, 0.650] \\
\bottomrule
\end{tabular}
\end{center}
None
[[('ll1', 'll2'), -2.1832269579474612, 95, 85, 95]]
