In [96]:
import pandas as pd
import math
import numpy as np
import statsmodels.api as sm
from statsmodels.base.model import GenericLikelihoodModel
import matplotlib.pyplot as plt
from scipy import stats
from scipy import cluster

In [97]:
n = 1000
beta01, beta11 = 5,-3
beta02, beta12 = 2, 4

#set up regression mixture
x1 = np.random.uniform(0, 10, size=400)
x2 = np.random.uniform(0, 10, size=600)

y1 = beta01 + beta11*x1 + np.random.normal(scale=5.0, size=400)
y2 = beta02 + beta12*x2 + np.random.normal(scale=4.0,size=600)

X = np.concatenate([x1, x2])
Y = np.concatenate([y1, y2])


#set up 2 component mixture
a1 = np.random.normal(2, 5, size=600)
a2 = np.random.normal(5, 3, size=400)
a = np.concatenate([a1,a2])

In [98]:
def e_step(y,x,params): 
    y, x = np.array(y), np.array(x)
    nobs, k = x.shape
    weights = []
    for param in params:

        sigma = param[-1]
        beta = np.tile(param[1:-1],nobs).reshape(nobs, k)
        mean = (beta*x).sum(axis=1)
        weights.append( stats.norm.pdf(y, loc=mean, scale=sigma)*param[0] )
        
    #update loop variables
    weights = np.array(weights).transpose()
    #denom = np.repeat( 1+np.exp(weights).sum(axis=1), len(params) ).reshape(nobs,len(params))
    denom = np.repeat(weights.sum(axis=1), len(params) ).reshape(nobs,len(params))
    weights = weights/denom
    return weights, np.log(denom[:,0])
        
    
def m_step(y,x,weights):
    y, x, weights = np.array(y), np.array(x), np.array(weights)
    nobs, k = x.shape
    params, se, err = [], [], 0

    for w in weights.transpose():
        
        lamb = w.mean()
        lamb_se = w.std()

        #beta
        w_mat = np.diag(w)
        xx_mat = np.linalg.inv( x.transpose().dot( w_mat).dot(x) )
        beta = xx_mat.dot(x.transpose().dot(w_mat)).dot(y)
        
        #sigma
        mu = np.tile(beta, nobs).reshape(nobs, k)*x
        weighted_err = w*(y - mu.sum(axis=1))**2
        sigma =  (weighted_err.sum()/w.sum())**.5

        #add component
        comp_param =np.concatenate(([lamb],beta,[sigma]))
        params.append(comp_param)

        #beta_se
        beta_se = (np.diagonal(xx_mat*sigma**2))**.5
        comp_se = np.concatenate(([lamb_se],beta_se))
        se.append(comp_se)

        #SSR
        err = err+weighted_err
    return np.array(params), np.array(se), 1-err.mean()/y.var()


def gen_weights(y,ncomp):
    c,labels = cluster.vq.kmeans2(y,ncomp)
    return np.array(pd.get_dummies(labels))


def estimate(y,x,ncomp):
    e = gen_weights(y,ncomp)
    m = None
    for i in range(20):
        m,se,r2 = m_step(y,x,e)
        e,ll = e_step(y,x,m)
    return m, se, r2, y, x, ncomp, e, ll


m, se, r2, y, x, ncomp, classes, ll = estimate(Y, sm.add_constant(X), 1)

In [99]:
def write_table(fname, estimates, labels=('y',None)):
    
    #unpack relevant information
    params, se, r2, y, x, ncomp, ll, classes = estimates
    nobs, k = x.shape
    ylabel, xlabel = labels
    
    #calc aic
    aic = 2*(params.shape[0]*params.shape[1]-2) - 2*ll.sum()
    aic = np.round(aic,1)
    
    if xlabel == None:
        xlabel =[]
        for i in range(k):
            xlabel.append('x%s'%i)
            
    assert (k == len(xlabel)) 
    
    f = open(fname, "w+")
    
    f.write(('\\small \n'+
            '\\begin{tabular}{lclc} \n'+
            '\\hline \n'+
            '\\textbf{Dep. Variable:} & %s & \\textbf{  R-squared: } &  %s \\\\ \n'%(ylabel, np.round(r2,3))  ))
    
    f.write(('\\textbf{No. Observations:} & %s & \\textbf{ AIC:} & %s \\\\ \n'%(nobs,aic)+
                                                                                    
            '\end{tabular} \n'))
    
    
    f.write('\n\\begin{tabular}{lcccc} \n')
    for comp in range(ncomp):
        f.write('\\hline \n')
        f.write('\\textbf{Component %s} & \\textbf{Estimate} & \\textbf{Std. Error} &'%(1+comp)+ 
                '\\textbf{t} & \\textbf{P $>$ $|$ t $|$} \\\\ \n')
        f.write('\\hline \\\\ \n')
        
        #isolate params
        comp_params = params[comp]
        comp_se = se[comp]
        comp_t = comp_params[:-1]/comp_se
        comp_p = 1 - stats.t.cdf(np.abs(comp_t),df=(nobs-k)) + stats.t.cdf(-np.abs(comp_t),df=(nobs-k))
        
        #round everything
        comp_params = np.round(comp_params,5)
        comp_se = np.round(comp_se,5)
        comp_t = np.round(comp_t,5)
        comp_p = np.round(comp_p,5)
        
        lamb, lamb_se = comp_params[0], comp_se[0]
        #lamb_t, lamb_p = comp_params[0],  comp_t[0], comp_p[0]
        beta, beta_se, beta_t, beta_p = comp_params[1:-1], comp_se[1:], comp_t[1:], comp_p[1:]
        sigma = comp_params[-1]
        
        if ncomp > 1:
            f.write('\\textbf{Weight %s} & %.3f  & %.3f & & \\\\ \\\\ \n'%(comp+1, lamb, lamb_se) )
        
        for i in range(k):
            f.write('\\textbf{%s} & %.3f & (%.3f) & %.3f & %.3f \\\\ \\\\ \n'%(xlabel[i],beta[i],beta_se[i],
                                                                             beta_t[i],beta_p[i]) )
        
        #f.write('\\textbf{Variance} & %s &  & & \\\\ \\\\ \n'%(sigma) )
    f.write('\\hline \\\\ \n')    
    f.write('\end{tabular} \n')
    f.close()
    
    #print output
    f = open(fname, "r")
    print(f.read())
    f.close()

In [100]:
reg1 = pd.read_csv('data/clean_milk1.csv')
print(reg1.columns)

#variables names
lmilk = ['LSCORE']
auct_key = ['YEAR','MONTH','DAY','SYSTEM','FMOZONE']
lcts = ['LFMO','LGAS','LPOPUL','LQSCORE']#,'LMEALS']
dummies = ['COOLER','ESC', 'NUM']
fekeys = ['3','6','7','9']

maxlag = 4
limitedlag = [l+str(i) for l in ['LSCORE_min','LSCORE_max'] for i in range(1,1+maxlag)]

maxlagy = 2
limitedlagy = [l+str(i) for l in ['LSCORE_miny','LSCORE_maxy'] for i in range(1,1+maxlagy)]


bid_key = auct_key + ['VENDOR'] + ['COUNTY']
covariates = lcts + dummies + fekeys
hist = ['INC'] + limitedlag
all_hist = hist + limitedlagy 

print(covariates)
print(hist)
print(all_hist)

Index(['Unnamed: 0', 'YEAR', 'MONTH', 'DAY', 'SYSTEM', 'FMOZONE', 'VENDOR',
       'COUNTY', 'LSCORE', 'LFMO', 'LGAS', 'LPOPUL', 'LQSCORE', 'COOLER',
       'ESC', 'NUM', '3', '6', '7', '9', 'INC', 'LSCORE_min1', 'LSCORE_min2',
       'LSCORE_min3', 'LSCORE_min4', 'LSCORE_max1', 'LSCORE_max2',
       'LSCORE_max3', 'LSCORE_max4', 'LSCORE_miny1', 'LSCORE_miny2',
       'LSCORE_maxy1', 'LSCORE_maxy2', 'WIN'],
      dtype='object')
['LFMO', 'LGAS', 'LPOPUL', 'LQSCORE', 'COOLER', 'ESC', 'NUM', '3', '6', '7', '9']
['INC', 'LSCORE_min1', 'LSCORE_min2', 'LSCORE_min3', 'LSCORE_min4', 'LSCORE_max1', 'LSCORE_max2', 'LSCORE_max3', 'LSCORE_max4']
['INC', 'LSCORE_min1', 'LSCORE_min2', 'LSCORE_min3', 'LSCORE_min4', 'LSCORE_max1', 'LSCORE_max2', 'LSCORE_max3', 'LSCORE_max4', 'LSCORE_miny1', 'LSCORE_miny2', 'LSCORE_maxy1', 'LSCORE_maxy2']


In [101]:
nice_ww = 'Bids (log-log)'
nice_cov = ['(Intercept)', 'Raw milk', 'Gas',
            'Population', 'Quantity', #'Meals',
            'Cooler', 'Escalated','No. Bidders', #+ fekeys
            'Waco','St. Angelo', 'Austin', 'San Antonio']

nice_lags = [l+str(i) for l in ['Min. at auction t-', 'Max. at auction t-'] for i in range(1,1+maxlag)]
nice_lags = ['Incumbency'] + nice_lags
nice_lagsy = [l+str(i) for l in ['Min. in year t-', 'Max. in year t-'] for i in range(1,1+maxlagy)]

all_lags = nice_lags + nice_lagsy

In [102]:
#baseline ols

est1 = estimate(reg1['LSCORE'],sm.add_constant(reg1[covariates]),1)
write_table('results/ols_results.tex', est1, labels=(nice_ww, nice_cov))

\small 
\begin{tabular}{lclc} 
\hline 
\textbf{Dep. Variable:} & Bids (log-log) & \textbf{  R-squared: } &  0.166 \\ 
\textbf{No. Observations:} & 3153 & \textbf{ AIC:} & -6282.0 \\ 
\end{tabular} 

\begin{tabular}{lcccc} 
\hline 
\textbf{Component 1} & \textbf{Estimate} & \textbf{Std. Error} &\textbf{t} & \textbf{P $>$ $|$ t $|$} \\ 
\hline \\ 
\textbf{(Intercept)} & -2.268 & (0.085) & -26.665 & 0.000 \\ \\ 
\textbf{Raw milk} & 0.223 & (0.030) & 7.319 & 0.000 \\ \\ 
\textbf{Gas} & 0.009 & (0.005) & 1.652 & 0.099 \\ \\ 
\textbf{Population} & 0.016 & (0.004) & 3.861 & 0.000 \\ \\ 
\textbf{Quantity} & -0.021 & (0.004) & -4.947 & 0.000 \\ \\ 
\textbf{Cooler} & 0.018 & (0.004) & 4.993 & 0.000 \\ \\ 
\textbf{Escalated} & -0.029 & (0.003) & -8.912 & 0.000 \\ \\ 
\textbf{No. Bidders} & 0.008 & (0.002) & 4.718 & 0.000 \\ \\ 
\textbf{Waco} & -0.070 & (0.005) & -13.618 & 0.000 \\ \\ 
\textbf{St. Angelo} & -0.041 & (0.015) & -2.691 & 0.007 \\ \\ 
\textbf{Austin} & -0.107 & (0.017) & -6.429 & 0.00



In [103]:
#historical covariates

est2 = estimate(reg1['LSCORE'],sm.add_constant(reg1[covariates + hist]),1)
write_table('results/hist_results.tex', est2, labels=(nice_ww, nice_cov + nice_lags))

\small 
\begin{tabular}{lclc} 
\hline 
\textbf{Dep. Variable:} & Bids (log-log) & \textbf{  R-squared: } &  0.205 \\ 
\textbf{No. Observations:} & 3153 & \textbf{ AIC:} & -6264.0 \\ 
\end{tabular} 

\begin{tabular}{lcccc} 
\hline 
\textbf{Component 1} & \textbf{Estimate} & \textbf{Std. Error} &\textbf{t} & \textbf{P $>$ $|$ t $|$} \\ 
\hline \\ 
\textbf{(Intercept)} & -1.509 & (0.117) & -12.871 & 0.000 \\ \\ 
\textbf{Raw milk} & 0.152 & (0.031) & 4.956 & 0.000 \\ \\ 
\textbf{Gas} & 0.012 & (0.005) & 2.171 & 0.030 \\ \\ 
\textbf{Population} & 0.009 & (0.004) & 2.150 & 0.032 \\ \\ 
\textbf{Quantity} & -0.016 & (0.004) & -3.748 & 0.000 \\ \\ 
\textbf{Cooler} & 0.017 & (0.003) & 4.870 & 0.000 \\ \\ 
\textbf{Escalated} & -0.028 & (0.003) & -8.648 & 0.000 \\ \\ 
\textbf{No. Bidders} & 0.007 & (0.002) & 4.462 & 0.000 \\ \\ 
\textbf{Waco} & -0.072 & (0.005) & -14.385 & 0.000 \\ \\ 
\textbf{St. Angelo} & -0.047 & (0.015) & -3.120 & 0.002 \\ \\ 
\textbf{Austin} & -0.111 & (0.016) & -6.778 & 0.00



In [104]:
#robust history with prev years

est3 = estimate(reg1['LSCORE'],sm.add_constant(reg1[covariates + all_hist]),1)
write_table('results/hist_results_robust.tex', est3, labels=(nice_ww, nice_cov + all_lags))

\small 
\begin{tabular}{lclc} 
\hline 
\textbf{Dep. Variable:} & Bids (log-log) & \textbf{  R-squared: } &  0.415 \\ 
\textbf{No. Observations:} & 3153 & \textbf{ AIC:} & -6256.0 \\ 
\end{tabular} 

\begin{tabular}{lcccc} 
\hline 
\textbf{Component 1} & \textbf{Estimate} & \textbf{Std. Error} &\textbf{t} & \textbf{P $>$ $|$ t $|$} \\ 
\hline \\ 
\textbf{(Intercept)} & -0.959 & (0.104) & -9.221 & 0.000 \\ \\ 
\textbf{Raw milk} & 0.326 & (0.027) & 12.066 & 0.000 \\ \\ 
\textbf{Gas} & -0.016 & (0.005) & -3.463 & 0.001 \\ \\ 
\textbf{Population} & 0.009 & (0.003) & 2.475 & 0.013 \\ \\ 
\textbf{Quantity} & -0.014 & (0.004) & -3.911 & 0.000 \\ \\ 
\textbf{Cooler} & 0.002 & (0.003) & 0.504 & 0.615 \\ \\ 
\textbf{Escalated} & -0.019 & (0.003) & -6.944 & 0.000 \\ \\ 
\textbf{No. Bidders} & 0.007 & (0.001) & 5.091 & 0.000 \\ \\ 
\textbf{Waco} & -0.030 & (0.005) & -6.525 & 0.000 \\ \\ 
\textbf{St. Angelo} & -0.031 & (0.013) & -2.402 & 0.016 \\ \\ 
\textbf{Austin} & -0.014 & (0.014) & -0.948 & 0.3



In [94]:
#swtiching regression

est4 = estimate(reg1['LSCORE'],sm.add_constant(reg1[covariates]),2)
write_table('results/prelim_results.tex', est4, labels=(nice_ww, nice_cov))

\small 
\begin{tabular}{lclc} 
\hline 
\textbf{Dep. Variable:} & Bids (log-log) & \textbf{  R-squared: } &  0.455 \\ 
\textbf{No. Observations:} & 3153 & \textbf{ AIC:} & -6254.0 \\ 
\end{tabular} 

\begin{tabular}{lcccc} 
\hline 
\textbf{Mixture 1} & \textbf{Estimate} & \textbf{Std. Error} &\textbf{t} & \textbf{P $>$ $|$ t $|$} \\ 
\hline \\ 
\textbf{Weight 1} & 0.648  & 0.263 & & \\ \\ 
\textbf{(Intercept)} & -2.141 & (0.098) & -21.834 & 0.000 \\ \\ 
\textbf{Raw milk} & 0.177 & (0.035) & 5.028 & 0.000 \\ \\ 
\textbf{Gas} & -0.010 & (0.006) & -1.692 & 0.091 \\ \\ 
\textbf{Population} & 0.024 & (0.005) & 4.892 & 0.000 \\ \\ 
\textbf{Quantity} & -0.022 & (0.005) & -4.288 & 0.000 \\ \\ 
\textbf{Cooler} & 0.017 & (0.004) & 4.086 & 0.000 \\ \\ 
\textbf{Escalated} & -0.027 & (0.004) & -6.994 & 0.000 \\ \\ 
\textbf{No. Bidders} & 0.004 & (0.002) & 2.010 & 0.044 \\ \\ 
\textbf{Waco} & -0.018 & (0.006) & -3.170 & 0.002 \\ \\ 
\textbf{St. Angelo} & -0.028 & (0.018) & -1.580 & 0.114 \\ \\ 
\text

In [95]:
punish = est4[-2].mean(axis=0).argmin()

classes = reg1.copy()[['SYSTEM','FMOZONE','YEAR','MONTH','DAY']]

classes['classes'] = 1.*(est4[-2][:,punish]>.5)
classes['prob'] =  1.*(est4[-2][:,punish])
classes['SCORE'] = np.exp(reg1['LSCORE'])
classes.to_csv('data/classes.csv')

est5 = estimate(classes['classes'],sm.add_constant(reg1[all_hist[1:]]),1)
write_table('results/phase_res.tex', est5, labels=('Punishment', ['(Intercept)'] + all_lags[1:]))

\small 
\begin{tabular}{lclc} 
\hline 
\textbf{Dep. Variable:} & Punishment & \textbf{  R-squared: } &  0.106 \\ 
\textbf{No. Observations:} & 3153 & \textbf{ AIC:} & -6280.0 \\ 
\end{tabular} 

\begin{tabular}{lcccc} 
\hline 
\textbf{Mixture 1} & \textbf{Estimate} & \textbf{Std. Error} &\textbf{t} & \textbf{P $>$ $|$ t $|$} \\ 
\hline \\ 
\textbf{(Intercept)} & -3.490 & (0.313) & -11.142 & 0.000 \\ \\ 
\textbf{Min. at auction t-1} & -0.016 & (0.089) & -0.181 & 0.856 \\ \\ 
\textbf{Min. at auction t-2} & -0.310 & (0.092) & -3.364 & 0.001 \\ \\ 
\textbf{Min. at auction t-3} & -0.017 & (0.091) & -0.186 & 0.853 \\ \\ 
\textbf{Min. at auction t-4} & -0.141 & (0.092) & -1.539 & 0.124 \\ \\ 
\textbf{Max. at auction t-1} & -0.137 & (0.094) & -1.466 & 0.143 \\ \\ 
\textbf{Max. at auction t-2} & 0.132 & (0.095) & 1.382 & 0.167 \\ \\ 
\textbf{Max. at auction t-3} & 0.017 & (0.093) & 0.186 & 0.853 \\ \\ 
\textbf{Max. at auction t-4} & 0.038 & (0.090) & 0.424 & 0.672 \\ \\ 
\textbf{Min. in year t-



In [83]:
test1 = estimate(Y, sm.add_constant(X), 2)
test2 = estimate(Y, sm.add_constant(X), 1)

def nonnested_test(model1,model2):
    """test for non nested models quang vuong"""
    
    params1, se1, r21, y1, x1, ncomp1, classes1, ll1 = model1
    params2, se2, r22, y2, x2, ncomp2, classes2, ll2 = model2
    nobs, k = x1.shape
    
    k1 = params1.shape[1]*ncomp1 - 1 
    k2 = params2.shape[1]*ncomp2 - 1
    
    var1 = (ll1 -ll2).std()
    test1 = (ll1.sum() - ll2.sum() - k1 + k2)*nobs**(-.5)
    test1 = test1/var1
    p1 = 1 - stats.t.cdf(np.abs(test1),df=(nobs-k1-k2)) + stats.t.cdf(-np.abs(test1),df=(nobs-k1-k2))
    
    var2 =  ((ll1 - ll2)**2).mean()**.5
    test2 = (ll1.sum() - ll2.sum() - k1 + k2 )*nobs**(-.5)
    test2 = test2/var2
    p2 = 1 - stats.t.cdf(np.abs(test2),df=(nobs-k1-k2)) + stats.t.cdf(-np.abs(test2),df=(nobs-k1-k2))
    
    return test1, test2, p1, p2

print(nonnested_test(est4,est2))

(7.375233452761843, 7.309465825837097, 2.0940828357264675e-13, 3.395121162877036e-13)


In [84]:
def write_nonnested(model1,model2,fname):
    test1, test2, p1, p2 = nonnested_test(model1,model2)
    test1, test2, p1, p2 = np.round(test1,5), np.round(test2,5), np.round(p1,5), np.round(p2,5) 
    f = open(fname, "w+")
    f.write('\\begin{tabular}{lcc}')
    f.write('\n\\hline \n & \\textbf{t} & \\textbf{P $>$ $|$ t $|$} \\\\')
    f.write('\n\\hline')
    f.write('\n\\textbf{Test 1} & %s & %s \\\\'%(test1,p1))
    f.write('\n\\textbf{Test 2} & %s & %s \\\\'%(test2,p2))
    f.write('\\hline \\\\ \n')   
    f.write('\n\\end{tabular}\n')
    f.close()

write_nonnested(est2,est1,'results/test_stat.tex')