In [1]:
%load_ext autoreload
%autoreload 2

from statsmodels.base.model import GenericLikelihoodModel

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import scipy.stats as stats
import sys
import re

from scipy.interpolate import make_interp_spline, BSpline
from scipy.stats import multivariate_normal

sys.path.append("../")
import vuong_tests11 as vuong_tests_fast

In [2]:
class OLS_loglike(GenericLikelihoodModel):
    """Per-observation Gaussian log-likelihood for a linear model.

    The error standard deviation is not a free parameter: it is recomputed
    from the residuals implied by whatever coefficient vector is passed in.
    """

    def __init__(self, *args, ols=False, **kwargs):
        """Forward everything except ``ols`` to ``GenericLikelihoodModel``.

        ``ols`` is only stored on the instance; nothing in this class reads it.
        """
        super().__init__(*args, **kwargs)
        self.ols = ols

    def loglikeobs(self, params):
        """Return the normal log-density of each residual at ``params``.

        The residuals are ``endog - exog @ params``.  The scale is the square
        root of the mean squared residual (divisor is the number of
        observations, with no degrees-of-freedom correction).
        """
        residuals = self.endog - np.matmul(self.exog, params)
        n_obs = residuals.shape[0]
        sigma_hat = np.sqrt(np.sum(residuals**2) / n_obs)
        return stats.norm.logpdf(residuals, loc=0, scale=sigma_hat)


def _ols_loglike_pieces(yn, regressors):
    """Fit OLS of ``yn`` on a constant plus ``regressors``; evaluate OLS_loglike at the fit.

    Returns ``(loglikeobs, score_obs, hessian, params, ols_model)`` where
    ``ols_model`` is the (unfitted) ``sm.OLS`` model object.
    """
    design = sm.add_constant(regressors)
    ols_model = sm.OLS(yn, design)
    beta_hat = ols_model.fit(disp=False).params

    # Same data wrapped in the likelihood class to get per-observation
    # log-likelihoods and their derivatives at the OLS estimate.
    likelihood = OLS_loglike(yn, design)
    per_obs_ll = likelihood.loglikeobs(beta_hat)
    per_obs_score = likelihood.score_obs(beta_hat)
    hessian = likelihood.hessian(beta_hat)
    return per_obs_ll, per_obs_score, hessian, beta_hat, ols_model


def setup_shi(yn,xn,return_model=False,num_params=4):
    """Build the likelihood ingredients for the two competing regressions.

    Model 1 regresses ``yn`` on a constant and column 0 of ``xn``.
    Model 2 regresses ``yn`` on a constant and columns ``1..num_params`` of
    ``xn`` (a slice, so fewer columns are used if ``xn`` is narrower).

    Returns
    -------
    tuple
        ``(ll1, grad1, hess1, params1, ll2, grad2, hess2, params2)``, or, when
        ``return_model`` is true,
        ``(ll1, grad1, hess1, params1, model1, ll2, grad2, hess2, params2, model2)``.
    """
    first = _ols_loglike_pieces(yn, xn[:, 0])
    second = _ols_loglike_pieces(yn, xn[:, 1:num_params + 1])

    if not return_model:
        # Drop the trailing model object from each five-tuple.
        return first[:4] + second[:4]
    return first + second


def gen_data(nobs=1000, a=0.25, num_params=4):
    """Simulate one sample from the baseline design.

    ``x`` has ``1 + num_params`` independent standard-normal columns.  The
    outcome loads on column 0 with coefficient ``a`` and on each remaining
    column with coefficient ``a / sqrt(num_params)``, plus an intercept of 1
    and standard-normal noise.  Draws come from NumPy's global random state.

    Returns
    -------
    (y, x, nobs)
    """
    n_columns = 1 + num_params
    x = np.random.normal(scale=1., size=(nobs, n_columns))
    noise = np.random.normal(loc=0.0, scale=1.0, size=nobs)

    single_part = a * x[:, 0]
    # Column 0 aside, every column of x belongs to the second regressor set.
    group_part = a / np.sqrt(num_params) * x[:, 1:].sum(axis=1)

    y = 1 + single_part + group_part + noise
    return y, x, nobs

# Smoke test: one simulated sample through setup_shi.
yn,xn,nobs = gen_data()
# Take the regressor count from the data instead of hard-coding it: the first
# column of xn belongs to model 1 and the remaining columns to model 2.  (The
# previous hard-coded 15 exceeded the 4 columns gen_data() produces and only
# worked because the slice inside setup_shi stops at the last column.)
ll1,grad1,hess1,params1,ll2,grad2,hess2,params2 = setup_shi(
    yn, xn, return_model=False, num_params=xn.shape[1] - 1
)
print(grad1.shape,hess1.shape)

(1000, 2) (2, 2)


In [3]:
# Settings shared by the table cells below; each is handed on to
# vuong_tests_fast.monte_carlo via run_null_size_table.
num_sims = 1_000            # first positional argument of monte_carlo
trials = 1_000              # passed as monte_carlo(trials=...)
adapt_c = True              # passed as monte_carlo(adapt_c=...)
data_tuned_epsilon = False  # passed to monte_carlo; also decides which results get_size_vector reports
epsilon = 0.5               # passed as monte_carlo(epsilon=...)

In [4]:
def get_size_vector(mc_out, data_tuned_epsilon=None):
    """Turn Monte Carlo output into one rejection rate per method.

    The returned list follows the column order of the printed table:
      Normal, Two-Step, SW Test, Naive Bootstrap, Pairwise Bootstrap, Shi (2015)

    Parameters
    ----------
    mc_out : sequence
        Output of ``vuong_tests_fast.monte_carlo``.  Only the first eight
        entries are used; element 0 of each is treated as the "no selection"
        rate.
    data_tuned_epsilon : bool, optional
        When true, the SW Test and Pairwise Bootstrap columns are taken from
        the sixth and seventh entries of ``mc_out`` instead of the third and
        fifth.  If omitted (``None``), the notebook-level variable of the same
        name is used, which is what this function always did before the
        keyword existed.

    Returns
    -------
    list
        Six values, each ``1 - (no selection rate)``.
    """
    if data_tuned_epsilon is None:
        # Backward-compatible fallback to the notebook-level flag.
        data_tuned_epsilon = globals()["data_tuned_epsilon"]

    # Unpack
    reg, twostep, sw, boot1, boot2, sw_test_opt, boot3, shi = mc_out[:8]

    # Pick the variant reported for the two columns that depend on the flag.
    sw_reported = sw_test_opt if data_tuned_epsilon else sw
    pairwise_reported = boot3 if data_tuned_epsilon else boot2

    # Take 1 - (no selection rate)
    reported = (reg, twostep, sw_reported, boot1, pairwise_reported, shi)
    return [1 - result[0] for result in reported]

def run_null_size_table(sample_sizes, num_sims, trials, epsilon, data_tuned_epsilon, adapt_c,alpha=.05,
                        gen_data=gen_data, num_params=4):
    """Run the Monte Carlo at each sample size and print a LaTeX table of rejection rates.

    Parameters
    ----------
    sample_sizes : iterable of int
        One table row is produced per entry.
    num_sims, trials, epsilon, data_tuned_epsilon, adapt_c, alpha
        Passed straight through to ``vuong_tests_fast.monte_carlo``.
    gen_data : callable
        Data generator called as ``gen_data(nobs=..., num_params=...)``.
    num_params : int
        Number of regressors in model 2.  Used both to generate the data and
        to tell ``setup_shi`` how many columns to use, so the two stay in step.

    Returns
    -------
    list of list
        One row per sample size: ``[nobs, "0.xxx", ...]`` with the six rates
        formatted to three decimals.
    """
    # Loop-invariant, so built once.  num_params is forwarded explicitly;
    # otherwise setup_shi would fall back to its own default regardless of
    # how many regressors gen_data produced.
    setup_shi_ex = lambda yn,xn: setup_shi(yn,xn,num_params=num_params)

    table = []
    for nobs in sample_sizes:
        gen_data_ex = lambda : gen_data(nobs=nobs, num_params=num_params)
        mc_out = vuong_tests_fast.monte_carlo(
            num_sims,
            gen_data_ex,
            setup_shi_ex,
            trials=trials,
            epsilon=epsilon,
            data_tuned_epsilon = data_tuned_epsilon,
            adapt_c = adapt_c,
            print_stuff=False, alpha=alpha
        )
        # NOTE: get_size_vector is called without the flag, so it consults the
        # notebook-level data_tuned_epsilon rather than this function's
        # argument.  Keep the two equal when calling.
        size_vec = get_size_vector(mc_out)
        table.append([nobs] + [f"{x:.3f}" for x in size_vec])

    # Print as LaTeX table
    print(r'\begin{tabular}{|c|c|c|c|c|c|c|}')
    print(r'\hline')
    print(r'Sample &  Normal & Two-Step & SW Test & Naive Bootstrap & Pairwise Bootstrap & Shi (2015) \\ \hline \hline')
    for row in table:
        print(' & '.join(str(y) for y in row)+r' \\')
    print(r'\hline')
    print(r'\end{tabular}')
    return table

# Power examples: rejection rates when the data come from `gen_data2`

In [5]:
def gen_data2(nobs=1000, a1=None, a2=0.0, num_params=4):
    """Simulate one sample in which, by default, only column 0 of ``x`` drives ``y``.

    Parameters
    ----------
    nobs : int
        Number of observations.
    a1 : float, optional
        Coefficient on ``x[:, 0]``.  When omitted it is set to
        ``sqrt(1.09**(250/nobs) - 1)``, which shrinks as ``nobs`` grows and
        equals 0.3 at ``nobs=250``.  Previously any value passed here was
        overwritten by that formula; an explicit value is now respected.
    a2 : float
        Total loading on the remaining columns; each gets
        ``a2 / sqrt(num_params)``.  Defaults to 0 (previously forced to 0).
    num_params : int
        Number of columns after the first.

    Returns
    -------
    (y, x, nobs)
        Draws come from NumPy's global random state.
    """
    if a1 is None:
        a1 = np.sqrt(1.09**(250/nobs)-1)
    x = np.random.normal(scale=1., size=(nobs,1+num_params))
    e = np.random.normal(loc=0.0, scale=1.0, size=nobs)
    y = 1 + a1*x[:,0] + a2/np.sqrt(num_params)*x[:,1:num_params+1].sum(axis=1) + e
    return y,x,nobs

In [6]:
# gen_data2 design, alpha = 0.05.  The Monte Carlo settings come from the
# configuration cell; sample_sizes is defined here and reused by later cells.
sample_sizes = [100, 200, 500]
table = run_null_size_table(
    sample_sizes=sample_sizes,
    num_sims=num_sims,
    trials=trials,
    epsilon=epsilon,
    data_tuned_epsilon=data_tuned_epsilon,
    adapt_c=adapt_c,
    alpha=.05,
    gen_data=gen_data2,
)

print(table)

\begin{tabular}{|c|c|c|c|c|c|c|}
\hline
Sample &  Normal & Two-Step & SW Test & Naive Bootstrap & Pairwise Bootstrap & Shi (2015) \\ \hline \hline
100 & 0.480 & 0.468 & 0.373 & 0.311 & 0.258 & 0.502 \\
200 & 0.459 & 0.456 & 0.282 & 0.307 & 0.216 & 0.502 \\
500 & 0.407 & 0.407 & 0.189 & 0.281 & 0.117 & 0.475 \\
\hline
\end{tabular}
[[100, '0.480', '0.468', '0.373', '0.311', '0.258', '0.502'], [200, '0.459', '0.456', '0.282', '0.307', '0.216', '0.502'], [500, '0.407', '0.407', '0.189', '0.281', '0.117', '0.475']]


In [7]:
# gen_data2 design, alpha = 0.025.  Reuses sample_sizes and the Monte Carlo
# settings defined in earlier cells.
table = run_null_size_table(
    sample_sizes=sample_sizes,
    num_sims=num_sims,
    trials=trials,
    epsilon=epsilon,
    data_tuned_epsilon=data_tuned_epsilon,
    adapt_c=adapt_c,
    alpha=.025,
    gen_data=gen_data2,
)

print(table)

\begin{tabular}{|c|c|c|c|c|c|c|}
\hline
Sample &  Normal & Two-Step & SW Test & Naive Bootstrap & Pairwise Bootstrap & Shi (2015) \\ \hline \hline
100 & 0.341 & 0.323 & 0.290 & 0.297 & 0.223 & 0.272 \\
200 & 0.281 & 0.279 & 0.198 & 0.243 & 0.132 & 0.220 \\
500 & 0.271 & 0.271 & 0.119 & 0.217 & 0.084 & 0.241 \\
\hline
\end{tabular}
[[100, '0.341', '0.323', '0.290', '0.297', '0.223', '0.272'], [200, '0.281', '0.279', '0.198', '0.243', '0.132', '0.220'], [500, '0.271', '0.271', '0.119', '0.217', '0.084', '0.241']]


In [8]:
# gen_data2 design, alpha = 0.01.  Reuses sample_sizes and the Monte Carlo
# settings defined in earlier cells.
table = run_null_size_table(
    sample_sizes=sample_sizes,
    num_sims=num_sims,
    trials=trials,
    epsilon=epsilon,
    data_tuned_epsilon=data_tuned_epsilon,
    adapt_c=adapt_c,
    alpha=.01,
    gen_data=gen_data2,
)

print(table)

\begin{tabular}{|c|c|c|c|c|c|c|}
\hline
Sample &  Normal & Two-Step & SW Test & Naive Bootstrap & Pairwise Bootstrap & Shi (2015) \\ \hline \hline
100 & 0.206 & 0.182 & 0.165 & 0.199 & 0.134 & 0.087 \\
200 & 0.153 & 0.151 & 0.104 & 0.186 & 0.094 & 0.064 \\
500 & 0.140 & 0.140 & 0.058 & 0.171 & 0.042 & 0.066 \\
\hline
\end{tabular}
[[100, '0.206', '0.182', '0.165', '0.199', '0.134', '0.087'], [200, '0.153', '0.151', '0.104', '0.186', '0.094', '0.064'], [500, '0.140', '0.140', '0.058', '0.171', '0.042', '0.066']]


# Original example from the paper (data from the default `gen_data`)

In [9]:
# Default gen_data design, alpha = 0.05.  sample_sizes is restated so this
# group of cells does not depend on the gen_data2 cells having run.
sample_sizes = [100, 200, 500]
table = run_null_size_table(
    sample_sizes=sample_sizes,
    num_sims=num_sims,
    trials=trials,
    epsilon=epsilon,
    data_tuned_epsilon=data_tuned_epsilon,
    adapt_c=adapt_c,
    alpha=.05,
)

print(table)

\begin{tabular}{|c|c|c|c|c|c|c|}
\hline
Sample &  Normal & Two-Step & SW Test & Naive Bootstrap & Pairwise Bootstrap & Shi (2015) \\ \hline \hline
100 & 0.061 & 0.054 & 0.056 & 0.111 & 0.032 & 0.016 \\
200 & 0.049 & 0.049 & 0.053 & 0.130 & 0.035 & 0.019 \\
500 & 0.040 & 0.040 & 0.057 & 0.148 & 0.034 & 0.019 \\
\hline
\end{tabular}
[[100, '0.061', '0.054', '0.056', '0.111', '0.032', '0.016'], [200, '0.049', '0.049', '0.053', '0.130', '0.035', '0.019'], [500, '0.040', '0.040', '0.057', '0.148', '0.034', '0.019']]


In [10]:
# Default gen_data design, alpha = 0.025.  Reuses sample_sizes and the Monte
# Carlo settings defined in earlier cells.
table = run_null_size_table(
    sample_sizes=sample_sizes,
    num_sims=num_sims,
    trials=trials,
    epsilon=epsilon,
    data_tuned_epsilon=data_tuned_epsilon,
    adapt_c=adapt_c,
    alpha=.025,
)

print(table)

\begin{tabular}{|c|c|c|c|c|c|c|}
\hline
Sample &  Normal & Two-Step & SW Test & Naive Bootstrap & Pairwise Bootstrap & Shi (2015) \\ \hline \hline
100 & 0.016 & 0.014 & 0.025 & 0.099 & 0.021 & 0.000 \\
200 & 0.020 & 0.020 & 0.018 & 0.090 & 0.011 & 0.004 \\
500 & 0.021 & 0.021 & 0.024 & 0.089 & 0.017 & 0.008 \\
\hline
\end{tabular}
[[100, '0.016', '0.014', '0.025', '0.099', '0.021', '0.000'], [200, '0.020', '0.020', '0.018', '0.090', '0.011', '0.004'], [500, '0.021', '0.021', '0.024', '0.089', '0.017', '0.008']]


In [None]:
# Default gen_data design, alpha = 0.01.  Reuses sample_sizes and the Monte
# Carlo settings defined in earlier cells.
table = run_null_size_table(
    sample_sizes=sample_sizes,
    num_sims=num_sims,
    trials=trials,
    epsilon=epsilon,
    data_tuned_epsilon=data_tuned_epsilon,
    adapt_c=adapt_c,
    alpha=.01,
)

print(table)