In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import datetime
import scipy.stats as stats

#graphing
import matplotlib.pyplot as plt
#stats
import statsmodels.api as sm
from statsmodels.base.model import GenericLikelihoodModel
from sklearn.mixture import GaussianMixture 
#import testing
import sys
sys.path.append("../")
import selection_tests

import itertools

In [2]:
#figure out a list of plans that occur every year
data = pd.read_stata("all_plans_c_bonus.dta")
data = data[ (data['year'] >= 2006) & (data['year'] <= 2016)]

ssa_unique = list(data['ssa'][data['year']==2006].unique())
for year in range(2006,2017):

    ssa_year = list(data[data['year']==year]['ssa'].unique())
    ssa_unique_copy = []
    for ssa in ssa_unique:
        if ssa in ssa_year:
            ssa_unique_copy.append(ssa)
    ssa_unique = ssa_unique_copy
    
data = data[data['ssa'].apply(lambda x : x in ssa_unique)]
data.to_stata("all_plans_c_bonus_balanced.data")

In [3]:
data = pd.read_stata("all_plans_c_bonus_balanced.dta")
data['log_enroll'] = np.log(data['enr_c'])
data['enr_total'] = data['enr_c'] + data['enr_FFS']
data['bmFFS_ns_diff'] = data['bmFFS'] - data['bm_ns']
data = data[ (data['year'] >= 2006) & (data['year'] <= 2016)]

#create a variable that is just db 
treat = data[['ssa', 'state', 'county', 'double_bonus']].copy()
treat = treat.groupby(['ssa', 'state', 'county'],as_index=False).max()
treat = treat.rename(columns={'double_bonus':'treatment' })
data = data.merge(treat, on=['ssa', 'state', 'county'],how='left')


#create interaction with year and db
data['treat*trend'] = data['treatment']*(data['year'])*(data['year']<=2012)
data['control*trend'] = (1-data['treatment'])*(data['year'])*(data['year']<=2012)


data['treat2*trend'] = data['bm_ns']*(data['year'])*(data['year']<=2012)


print(data.columns)
print(data[data['double_bonus']==1]['bmFFS'].mean())
print(data['bmFFS'].mean())
print(data[['benchmark_diff',
           'benchmark_diff_n',
           'benchmark_diff_ns',
           'bmFFS']].mean())

print(data['star_C2'].mean())

Index(['index', 'ssa', 'state', 'county', 'year', 'enr_FFS', 'enr_c',
       'hhi_ins', 'hhi_ins_noSNP', 'ins_parent', 'ins_parent_noSNPs',
       'ins_plans', 'HMO_share', 'PPO_share', 'qual_2012', 'qual_2013',
       'qual_2014', 'qual_2015', 'partaenrollment', 'partb_enrollment',
       'prescription_drugs', 'prev_comp_dental', 'eye_exams', 'hearing_exams',
       'deductible', 'partb_premium', 'plan_premium', 'partd_premium', 'OOPC',
       'risk_pub_p', 'bid_pub_p', 'rebate_pub_p', 'risk_pub_c', 'bid_pub_c',
       'rebate_pub_c', 'star_C2', 'star_CD2', 'bmFFS', 'bm_ns', 'risk_FFS',
       'FFS_AB', 'FFS_AB_rs', 'buydown', 'OOPC_noprem', 'extras', 'quartile',
       'bid_pub_p_nominal', 'bid_pub_c_nominal', 'rebate_pub_p_nominal',
       'rebate_pub_c_nominal', 'bmFFS_nominal', 'FFS_AB_nominal',
       'bm_ns_nominal', 'OOPC_nominal', 'plan_premium_nominal',
       'partd_premium_nominal', 'buydown_nominal', 'rebate_std',
       'benchmark_diff', 'benchmark_diff_n', 'benchmark_dif

In [4]:
summary_xs1=['double_bonus', 'bm_ns','bmFFS_ns_diff','star_C2',  'ins_parent_noSNPs',  'FFS_AB', 
                           'unemploy_rt', 'pc_income', 'log_risk_FFS', 'log_risk_pub_c', 'risk_FFS',
                           'risk_pub_c', 'rebate_pub_c']

def join_print(t1,t2,
               summary_xs=summary_xs1 ):
    table =  pd.DataFrame(index=summary_xs)
    table['1'] = t1
    table['2'] = t2
    
    for row in table.itertuples():
        listrow = list(row)
        print('\\textbf{%s}'%listrow[0].replace('_','\\_'),end='')
        for i in range(len(listrow)-1):
            print(' & %.4f '%listrow[i+1],end='')
        print('\\\\')

join_print(data[summary_xs1].mean(),data[summary_xs1].std())

\textbf{double\_bonus} & 0.0368  & 0.1883 \\
\textbf{bm\_ns} & 821.1864  & 78.0948 \\
\textbf{bmFFS\_ns\_diff} & 11.1493  & 16.7855 \\
\textbf{star\_C2} & 7.9856  & 14.1657 \\
\textbf{ins\_parent\_noSNPs} & 2.9417  & 1.7726 \\
\textbf{FFS\_AB} & 715.5549  & 108.4442 \\
\textbf{unemploy\_rt} & 7.0000  & 2.9168 \\
\textbf{pc\_income} & 35.7265  & 9.9710 \\
\textbf{log\_risk\_FFS} & -4.2188  & 7.4582 \\
\textbf{log\_risk\_pub\_c} & -9.8707  & 12.8613 \\
\textbf{risk\_FFS} & 0.9613  & 0.0710 \\
\textbf{risk\_pub\_c} & 0.9135  & 0.1171 \\
\textbf{rebate\_pub\_c} & 49.0642  & 33.2120 \\


In [5]:
#https://www.statsmodels.org/dev/examples/notebooks/generated/glm_weights.html

def drop_data(data,y_name,x_name,absorb=None):
    data = data.copy()
    data = data[y_name + x_name + absorb]
    missing_vals = ~data.isnull().max(axis=1)
    data = data[missing_vals]
    data = data[data['year'].groupby(data['ssa']).transform('count')>=11]
    return data
    

def demean(y_name,x_name,data=None,absorb=None,cluster=None): 

    y,X = data[ y_name], data[ x_name ]
    
    y_dot = y.copy()
    X_dot = X.copy()
    
    ybar = y.mean()
    Xbar = X.mean()

    
    for effect in absorb:
        y_dot = y_dot - y.groupby(data[effect]).transform('mean')
        X_dot = X_dot - X.groupby(data[effect]).transform('mean')
        print(effect,X_dot,X_dot.max(),X_dot.min())
        print('-----')
        print('-----')
    y_dot = y_dot + ybar
    X_dot = X_dot + Xbar
    return y_dot, X_dot


y_dot, X_dot = demean(['log_enroll'],['treatment'], data=data,absorb=['ssa','year'])
print(data[['year','treatment']].groupby(['year']).mean())
print(data[['year','treatment']].groupby(['year']).count())
print(data[['year','treatment']].groupby(['year']).sum())

ssa        treatment
0            0.0
1            0.0
2            0.0
3            0.0
4            0.0
...          ...
30098        0.0
30099        0.0
30100        0.0
30101        0.0
30102        0.0

[30103 rows x 1 columns] treatment    0.0
dtype: float64 treatment    0.0
dtype: float64
-----
-----
year        treatment
0      -0.096862
1      -0.091173
2      -0.089070
3      -0.089483
4      -0.089584
...          ...
30098  -0.090387
30099  -0.090713
30100  -0.094630
30101  -0.094987
30102  -0.095274

[30103 rows x 1 columns] treatment   -0.08907
dtype: float64 treatment   -0.096862
dtype: float64
-----
-----
      treatment
year           
2006   0.096862
2007   0.091173
2008   0.089070
2009   0.089483
2010   0.089584
2011   0.089808
2012   0.090387
2013   0.090713
2014   0.094630
2015   0.094987
2016   0.095274
      treatment
year           
2006       2581
2007       2753
2008       2818
2009       2805
2010       2813
2011       2806
2012       2788
2013       2778
20

In [6]:
model1_x = ['double_bonus','treat*trend','control*trend','treatment','bmFFS_ns_diff','star_C2','log_risk_pub_c',
            'FFS_AB',"ins_parent_noSNPs",'log_risk_FFS','unemploy_rt','pc_income']

model2_x = ['bm_ns','treat2*trend','bmFFS_ns_diff','star_C2','log_risk_pub_c',
            'FFS_AB',"ins_parent_noSNPs",'log_risk_FFS','unemploy_rt','pc_income']
    

model_xs = [model1_x,model2_x]



def setup_data(y_name,model_xs,data):
     #get the super set of all the model names
    all_xs = set()
    for model_x in model_xs:
        all_xs = all_xs.union(set(model_x))
    all_xs = list(all_xs)
    
    #subtract out the columns that are not in the data
    x_name = []
    for col in data.columns:
        if col in all_xs:
            x_name.append(col)
    print(x_name)
    
    #clean the data
    y_dot, X_dot = demean(y_name,x_name, data=data,absorb=['ssa','year'])
    missing_vals = ~data[y_name + x_name].isnull().max(axis=1)
    y_dot, X_dot = y_dot[missing_vals],X_dot[missing_vals]
    
    return y_dot,X_dot,x_name,missing_vals



def return_results(y_name,model_xs,data,weights=True):
    y_dot,X_dot,x_name,missing_vals = setup_data(y_name,model_xs,data)
    print(X_dot['treatment'])
    print('-----')
    params = []
    se = []
    for model_x in model_xs:
        if weights:
            var_weights = np.array( data['enr_c'][missing_vals] )
            X_dot_m = X_dot[model_x].copy()
            model = sm.GLM(y_dot,X_dot_m,var_weights=var_weights)
            model_fit = model.fit()
            params.append(model_fit.params)
            se.append(model_fit.bse)
    table  = pd.DataFrame(index=x_name)
    col_names = []
    for i in range(len(model_xs)):
        table['params %i'%(i+1)]  = params[i]
        table['se %i'%(i+1)]  = se[i]
    return table
    

table = return_results(['log_enroll'],model_xs,data)

['ins_parent_noSNPs', 'star_C2', 'bm_ns', 'FFS_AB', 'unemploy_rt', 'pc_income', 'log_risk_FFS', 'log_risk_pub_c', 'double_bonus', 'bmFFS_ns_diff', 'treatment', 'treat*trend', 'control*trend', 'treat2*trend']
ssa        ins_parent_noSNPs  star_C2      bm_ns      FFS_AB  unemploy_rt  \
0              -1.000000      NaN  41.706177   61.085083    -2.909091   
1               0.000000      NaN  48.287659   41.841003    -2.909091   
2               0.000000      NaN  47.086365   55.909119    -1.109091   
3               1.000000      NaN  77.222900   16.604004     3.490909   
4               0.000000      NaN  47.896973   -7.392639     2.690909   
...                  ...      ...        ...         ...          ...   
30098           0.727273     -0.3 -29.373230  -15.769043     0.027273   
30099          -0.272727     -0.3 -23.456055   19.687622    -0.472727   
30100          -0.272727      0.2  -9.863464  -19.683899    -0.772727   
30101          -0.272727      0.2 -26.613464   -2.093872  

In [7]:
def table_to_latex(table):
    num_col = len(table.columns)
    print('\\begin{tabular}{l',end='')
    for i in range(int(len(table.columns)/2-1)):
        print('cc|',end='')
    print('cc}')
    print('\\toprule')
    print('{}',end='')
    model = 1
    while model <= len(table.columns)/2:
        print('& \\textbf{coef %s} & \\textbf{se %s}'%(model,model),end='' )
        model = model +1 
    print('\\\\')
    print('\\midrule')

    
    for row in table.itertuples():
        listrow = list(row)
        print('\\textbf{%s}'%listrow[0].replace('_','\\_'),end='')
        for i in range(len(listrow)-1):
            print(' & %.4f '%listrow[i+1],end='')
        print('\\\\')
    print('\\bottomrule')
    print('\\end{tabular}')
    
table_to_latex(table)

\begin{tabular}{lcc|cc}
\toprule
{}& \textbf{coef 1} & \textbf{se 1}& \textbf{coef 2} & \textbf{se 2}\\
\midrule
\textbf{ins\_parent\_noSNPs} & -0.0173  & 0.0016  & -0.0144  & 0.0016 \\
\textbf{star\_C2} & -0.0002  & 0.0003  & -0.0008  & 0.0003 \\
\textbf{bm\_ns} & nan  & nan  & 0.0003  & 0.0001 \\
\textbf{FFS\_AB} & 0.0004  & 0.0000  & 0.0002  & 0.0000 \\
\textbf{unemploy\_rt} & -0.0151  & 0.0024  & -0.0123  & 0.0024 \\
\textbf{pc\_income} & -0.0080  & 0.0007  & -0.0081  & 0.0007 \\
\textbf{log\_risk\_FFS} & 0.0013  & 0.0010  & 0.0026  & 0.0010 \\
\textbf{log\_risk\_pub\_c} & 0.0002  & 0.0005  & 0.0002  & 0.0005 \\
\textbf{double\_bonus} & -0.0063  & 0.0096  & nan  & nan \\
\textbf{bmFFS\_ns\_diff} & -0.0004  & 0.0002  & -0.0008  & 0.0001 \\
\textbf{treatment} & 2.8430  & 0.8328  & nan  & nan \\
\textbf{treat*trend} & 0.0010  & 0.0001  & nan  & nan \\
\textbf{control*trend} & 0.0009  & 0.0001  & nan  & nan \\
\textbf{treat2*trend} & nan  & nan  & -0.0000  & 0.0000 \\
\bottomrule
\end{

In [8]:
%load_ext autoreload
%autoreload 2
import selection_tests


class GLS_LL(GenericLikelihoodModel):

    def __init__(self, *args, model=None, **kwargs):
        super(GLS_LL, self).__init__(*args, **kwargs)
        self.model = model
        
    def loglikeobs(self, params, scale=None):
        """
        Evaluate the log-likelihood for a generalized linear model.
        """
        model = self.model
        scale = sm.tsa.stattools.float_like(scale, "scale", optional=True)
        lin_pred = np.dot(model.exog, params) + model._offset_exposure
        expval = model.family.link.inverse(lin_pred)
        if scale is None:
            scale = model.estimate_scale(expval)
        llf = model.family.loglike_obs(model.endog, expval, model.var_weights,
                                  scale)
        return llf



#this is janky as f.... need to fix it...
def setup_test(y_dot,X_dot,
    model1_cov = [],
    model2_cov = []):
    
    #model 1
    #weights = np.array( data['enr_c_mean'][missing_vals] )
    m1 = sm.GLM(y_dot,X_dot[model1_cov])#,var_weights=weights)
    m1_fit = m1.fit()

    #model2
    m2 = sm.GLM(y_dot,X_dot[model2_cov])#,var_weights=weights)
    m2_fit = m2.fit()

    model1 = GLS_LL(y_dot, X_dot[model1_cov], model=m1)
    ll1 = model1.loglikeobs(m1_fit.params)
    grad1 = model1.score_obs(m1_fit.params)
    hess1 = model1.hessian(m1_fit.params)
    params1 = m1_fit.params

    model2 = GLS_LL(y_dot, X_dot[model2_cov], model=m2)

    ll2 = model2.loglikeobs(m2_fit.params)
    grad2 = model2.score_obs(m2_fit.params)
    hess2 = model2.hessian(m2_fit.params)
    params2 = m2_fit.params
    
    return  ll1, grad1, hess1, params1, ll2, grad2, hess2, params2



def pairwise_tests(y_name,model_xs,data):
    y_dot,X_dot,x_name,missing_vals = setup_data(y_name,model_xs,data)
    
    #setup the latent variable
    c =  create_latent(y_dot, X_dot, missing_vals)
    
    #setup interactions
    X_dot['bm_ns*c'] = X_dot['bm_ns']*c
    X_dot['bm_ns*log_risk_pub_c'] = X_dot['bm_ns']*X_dot['log_risk_pub_c']
    
    #TODO fix this so that it does all the comparison
    combos = list(itertools.combinations(model_xs,2))
    labels = [ 'll'+ str(i+1) for i in range(len(model_xs))]
    label_combos = list(itertools.combinations(labels,2))
    res = []
    for i in range(len(combos)):
        combo = combos[i]
        label_combo = label_combos[i]
        model1_x = combo[0]
        model2_x = combo[1]
        setup_test_i = lambda yn,xn : setup_test(yn,xn,model1_cov = model1_x, model2_cov= model2_x)
        test_stat,res1,res2,res3 = selection_tests.test_results(y_dot,X_dot,setup_test_i)
        
        print(selection_tests.test_table(y_dot,X_dot,setup_test_i))
        
        res.append( [label_combo,test_stat,res1,res2,res3])
    #print_pairwise_tests(res)
    return res


res = pairwise_tests(['log_enroll'],model_xs,data)
print(res)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
['ins_parent_noSNPs', 'star_C2', 'bm_ns', 'FFS_AB', 'unemploy_rt', 'pc_income', 'log_risk_FFS', 'log_risk_pub_c', 'double_bonus', 'bmFFS_ns_diff', 'treatment', 'treat*trend', 'control*trend', 'treat2*trend']
ssa        ins_parent_noSNPs  star_C2      bm_ns      FFS_AB  unemploy_rt  \
0              -1.000000      NaN  41.706177   61.085083    -2.909091   
1               0.000000      NaN  48.287659   41.841003    -2.909091   
2               0.000000      NaN  47.086365   55.909119    -1.109091   
3               1.000000      NaN  77.222900   16.604004     3.490909   
4               0.000000      NaN  47.896973   -7.392639     2.690909   
...                  ...      ...        ...         ...          ...   
30098           0.727273     -0.3 -29.373230  -15.769043     0.027273   
30099          -0.272727     -0.3 -23.456055   19.687622    -0.472727   
30100          -0.272727      0.2  -9.86346

NameError: name 'create_latent' is not defined