In [1]:
import pandas as pd
import math
import numpy as np
import statsmodels.api as sm
from statsmodels.base.model import GenericLikelihoodModel
import matplotlib.pyplot as plt
from scipy import stats
from scipy import cluster

In [2]:
n = 1000
beta01, beta11 = 5,-3
beta02, beta12 = 2, 4

#set up regression mixture
x1 = np.random.uniform(0, 10, size=400)
x2 = np.random.uniform(0, 10, size=600)

y1 = beta01 + beta11*x1 + np.random.normal(scale=5.0, size=400)
y2 = beta02 + beta12*x2 + np.random.normal(scale=4.0,size=600)

X = np.concatenate([x1, x2])
Y = np.concatenate([y1, y2])


#set up 2 component mixture
a1 = np.random.normal(2, 5, size=600)
a2 = np.random.normal(5, 3, size=400)
a = np.concatenate([a1,a2])

In [3]:
def e_step(y,x,params): 
    y, x = np.array(y), np.array(x)
    nobs, k = x.shape
    weights = []
    for param in params:

        sigma = param[-1]
        beta = np.tile(param[1:-1],nobs).reshape(nobs, k)
        mean = (beta*x).sum(axis=1)
        weights.append( stats.norm.pdf(y, loc=mean, scale=sigma)*param[0] )
        
    #update loop variables
    weights = np.array(weights).transpose()
    #denom = np.repeat( 1+np.exp(weights).sum(axis=1), len(params) ).reshape(nobs,len(params))
    denom = np.repeat(weights.sum(axis=1), len(params) ).reshape(nobs,len(params))
    weights = weights/denom
    return weights, np.log(denom[:,0])
        
    
def m_step(y,x,weights):
    y, x, weights = np.array(y), np.array(x), np.array(weights)
    nobs, k = x.shape
    params, se, err = [], [], 0

    for w in weights.transpose():
        
        lamb = w.mean()
        lamb_se = w.std()

        #beta
        w_mat = np.diag(w)
        xx_mat = np.linalg.inv( x.transpose().dot( w_mat).dot(x) )
        beta = xx_mat.dot(x.transpose().dot(w_mat)).dot(y)
        
        #sigma
        mu = np.tile(beta, nobs).reshape(nobs, k)*x
        weighted_err = w*(y - mu.sum(axis=1))**2
        sigma =  (weighted_err.sum()/w.sum())**.5

        #add component
        comp_param =np.concatenate(([lamb],beta,[sigma]))
        params.append(comp_param)

        #beta_se
        beta_se = (np.diagonal(xx_mat*sigma**2))**.5
        comp_se = np.concatenate(([lamb_se],beta_se))
        se.append(comp_se)

        #SSR
        err = err+weighted_err
    return np.array(params), np.array(se), 1-err.mean()/y.var()


def gen_weights(y,ncomp):
    c,labels = cluster.vq.kmeans2(y,ncomp)
    return np.array(pd.get_dummies(labels))


def estimate(y,x,ncomp):
    e = gen_weights(y,ncomp)
    m = None
    for i in range(20):
        m,se,r2 = m_step(y,x,e)
        e,ll = e_step(y,x,m)
    return m, se, r2, y, x, ncomp, e, ll


m, se, r2, y, x, ncomp, classes, ll = estimate(Y, sm.add_constant(X), 1)

In [4]:
def write_table(fname, estimates, labels=('y',None)):
    
    #unpack relevant information
    params, se, r2, y, x, ncomp, ll, classes = estimates
    nobs, k = x.shape
    ylabel, xlabel = labels
    
    #calc aic
    aic = 2*(params.shape[0]*params.shape[1]-2) - 2*ll.sum()
    aic = np.round(aic,1)
    
    if xlabel == None:
        xlabel =[]
        for i in range(k):
            xlabel.append('x%s'%i)
            
    assert (k == len(xlabel)) 
    
    f = open(fname, "w+")
    
    f.write(('\\small \n'+
            '\\begin{tabular}{lclc} \n'+
            '\\hline \n'+
            '\\textbf{Dep. Variable:} & %s & \\textbf{  R-squared: } &  %s \\\\ \n'%(ylabel, np.round(r2,3))  ))
    
    f.write(('\\textbf{No. Observations:} & %s & \\textbf{ AIC:} & %s \\\\ \n'%(nobs,aic)+
                                                                                    
            '\end{tabular} \n'))
    
    
    f.write('\n\\begin{tabular}{lcccc} \n')
    for comp in range(ncomp):
        f.write('\\hline \n')
        f.write('\\textbf{Phase %s} & \\textbf{Estimate} & \\textbf{Std. Error} &'%(1+comp)+ 
                '\\textbf{t} & \\textbf{P $>$ $|$ t $|$} \\\\ \n')
        f.write('\\hline \\\\ \n')
        
        #isolate params
        comp_params = params[comp]
        comp_se = se[comp]
        comp_t = comp_params[:-1]/comp_se
        comp_p = 1 - stats.t.cdf(np.abs(comp_t),df=(nobs-k)) + stats.t.cdf(-np.abs(comp_t),df=(nobs-k))
        
        #round everything
        comp_params = np.round(comp_params,5)
        comp_se = np.round(comp_se,5)
        comp_t = np.round(comp_t,5)
        comp_p = np.round(comp_p,5)
        
        lamb, lamb_se = comp_params[0], comp_se[0]
        #lamb_t, lamb_p = comp_params[0],  comp_t[0], comp_p[0]
        beta, beta_se, beta_t, beta_p = comp_params[1:-1], comp_se[1:], comp_t[1:], comp_p[1:]
        sigma = comp_params[-1]
        
        if ncomp > 1:
            f.write('\\textbf{Pr(phase %s)} & %s  & %s & & \\\\ \\\\ \n'%(comp+1, lamb, lamb_se) )
        
        for i in range(k):
            f.write('\\textbf{%s} & %s & (%s) & %s & %s \\\\ \\\\ \n'%(xlabel[i],beta[i],beta_se[i],
                                                                             beta_t[i],beta_p[i]) )
        
        #f.write('\\textbf{Variance} & %s &  & & \\\\ \\\\ \n'%(sigma) )
    f.write('\\hline \\\\ \n')    
    f.write('\end{tabular} \n')
    f.close()
    
    #print output
    f = open(fname, "r")
    print(f.read())
    f.close()

In [5]:
reg1 = pd.read_csv('data/clean_milk1.csv')
print(reg1.columns)

#variables names
lmilk = ['LSCORE']
auct_key = ['YEAR','MONTH','DAY','SYSTEM','FMOZONE']
lcts = ['LFMO','LGAS','LPOPUL','LQSCORE']#,'LMEALS']
dummies = ['COOLER','ESC', 'ONEBID','NUM']

fekeys = list(reg1.columns[17:-12])


lags = 5
lagkeys = [l+str(i) for l in ['LSCORE_min','LSCORE_max'] for i in range(1,1+lags)]

bid_key = auct_key + ['VENDOR'] + ['COUNTY']
covariates = lcts + dummies + fekeys
hist = ['INC'] + lagkeys

print(covariates)
print(hist)

Index(['Unnamed: 0', 'YEAR', 'MONTH', 'DAY', 'SYSTEM', 'FMOZONE', 'VENDOR',
       'COUNTY', 'LSCORE', 'LFMO', 'LGAS', 'LPOPUL', 'LQSCORE', 'COOLER',
       'ESC', 'ONEBID', 'NUM', '3', '6', '7', '9', 'INC', 'LSCORE_min1',
       'LSCORE_min2', 'LSCORE_min3', 'LSCORE_min4', 'LSCORE_min5',
       'LSCORE_max1', 'LSCORE_max2', 'LSCORE_max3', 'LSCORE_max4',
       'LSCORE_max5', 'WIN'],
      dtype='object')
['LFMO', 'LGAS', 'LPOPUL', 'LQSCORE', 'COOLER', 'ESC', 'ONEBID', 'NUM', '3', '6', '7', '9']
['INC', 'LSCORE_min1', 'LSCORE_min2', 'LSCORE_min3', 'LSCORE_min4', 'LSCORE_min5', 'LSCORE_max1', 'LSCORE_max2', 'LSCORE_max3', 'LSCORE_max4', 'LSCORE_max5']


In [6]:
nice_ww = 'Bids (log-log)'
nice_cov = ['(Intercept)', 'Raw milk', 'Gas',
            'Population', 'Quantity', #'Meals',
            'Cooler', 'Escalated', 'Monopoly' ,'No. Bidders', #+ fekeys
            'Waco','St. Angelo', 'Austin', 'San Antonio']
nice_lags = [l+str(i) for l in ['Min. lag \#','Max. lag \#'] for i in range(1,1+lags)]
nice_lags = ['Incumbency'] + nice_lags

In [7]:
est0 = estimate(reg1['LSCORE'],sm.add_constant(reg1[covariates]),1)
write_table('results/ols_results.tex', est0, labels=(nice_ww, nice_cov))

  return ptp(axis=axis, out=out, **kwargs)


\small 
\begin{tabular}{lclc} 
\hline 
\textbf{Dep. Variable:} & Bids (log-log) & \textbf{  R-squared: } &  0.166 \\ 
\textbf{No. Observations:} & 4040 & \textbf{ AIC:} & -8054.0 \\ 
\end{tabular} 

\begin{tabular}{lcccc} 
\hline 
\textbf{Phase 1} & \textbf{Estimate} & \textbf{Std. Error} &\textbf{t} & \textbf{P $>$ $|$ t $|$} \\ 
\hline \\ 
\textbf{(Intercept)} & -2.28614 & (0.07704) & -29.67319 & 0.0 \\ \\ 
\textbf{Raw milk} & 0.21012 & (0.02823) & 7.44213 & 0.0 \\ \\ 
\textbf{Gas} & 0.02365 & (0.00425) & 5.56968 & 0.0 \\ \\ 
\textbf{Population} & 0.0151 & (0.00326) & 4.63236 & 0.0 \\ \\ 
\textbf{Quantity} & -0.01997 & (0.00342) & -5.83102 & 0.0 \\ \\ 
\textbf{Cooler} & 0.01851 & (0.0031) & 5.96343 & 0.0 \\ \\ 
\textbf{Escalated} & -0.0273 & (0.0029) & -9.41336 & 0.0 \\ \\ 
\textbf{Monopoly} & 0.01815 & (0.00583) & 3.11206 & 0.00187 \\ \\ 
\textbf{No. Bidders} & 0.00775 & (0.00157) & 4.92298 & 0.0 \\ \\ 
\textbf{Waco} & -0.06971 & (0.00436) & -15.98917 & 0.0 \\ \\ 
\textbf{St. Angelo



In [8]:
est1 = estimate(reg1['LSCORE'],sm.add_constant(reg1[covariates + hist]),1)
write_table('results/hist_results.tex', est1, labels=(nice_ww, nice_cov + nice_lags))

\small 
\begin{tabular}{lclc} 
\hline 
\textbf{Dep. Variable:} & Bids (log-log) & \textbf{  R-squared: } &  0.208 \\ 
\textbf{No. Observations:} & 4040 & \textbf{ AIC:} & -8032.0 \\ 
\end{tabular} 

\begin{tabular}{lcccc} 
\hline 
\textbf{Phase 1} & \textbf{Estimate} & \textbf{Std. Error} &\textbf{t} & \textbf{P $>$ $|$ t $|$} \\ 
\hline \\ 
\textbf{(Intercept)} & -1.34236 & (0.10976) & -12.23018 & 0.0 \\ \\ 
\textbf{Raw milk} & 0.12469 & (0.02839) & 4.39156 & 1e-05 \\ \\ 
\textbf{Gas} & 0.02207 & (0.00421) & 5.24695 & 0.0 \\ \\ 
\textbf{Population} & 0.00907 & (0.00322) & 2.81635 & 0.00488 \\ \\ 
\textbf{Quantity} & -0.01602 & (0.00337) & -4.74925 & 0.0 \\ \\ 
\textbf{Cooler} & 0.01741 & (0.00303) & 5.74009 & 0.0 \\ \\ 
\textbf{Escalated} & -0.0257 & (0.00283) & -9.06904 & 0.0 \\ \\ 
\textbf{Monopoly} & 0.00806 & (0.00628) & 1.28404 & 0.1992 \\ \\ 
\textbf{No. Bidders} & 0.00655 & (0.00154) & 4.24428 & 2e-05 \\ \\ 
\textbf{Waco} & -0.07273 & (0.00426) & -17.07025 & 0.0 \\ \\ 
\textbf{



In [9]:
lags2 = 10
lagkeys2 = [l+str(i) for l in ['LSCORE_min','LSCORE_max'] for i in range(1,1+lags2)]
nice_lags2 = [l+str(i) for l in ['Min. lag \#','Max. lag \#'] for i in range(1,1+lags2)]
nice_lags2 = ['Incumbency'] + nice_lags2
hist2 = ['INC'] + lagkeys2

reg2 = pd.read_csv('data/clean_milk2.csv')
est4 = estimate(reg2['LSCORE'],sm.add_constant(reg2[covariates + hist2]),1)
write_table('results/hist_results2.tex', est4, labels=(nice_ww, nice_cov + nice_lags2))

\small 
\begin{tabular}{lclc} 
\hline 
\textbf{Dep. Variable:} & Bids (log-log) & \textbf{  R-squared: } &  0.222 \\ 
\textbf{No. Observations:} & 4027 & \textbf{ AIC:} & -7986.0 \\ 
\end{tabular} 

\begin{tabular}{lcccc} 
\hline 
\textbf{Phase 1} & \textbf{Estimate} & \textbf{Std. Error} &\textbf{t} & \textbf{P $>$ $|$ t $|$} \\ 
\hline \\ 
\textbf{(Intercept)} & -0.89207 & (0.12601) & -7.0796 & 0.0 \\ \\ 
\textbf{Raw milk} & 0.09123 & (0.02859) & 3.19087 & 0.00143 \\ \\ 
\textbf{Gas} & 0.01823 & (0.00423) & 4.31004 & 2e-05 \\ \\ 
\textbf{Population} & 0.00694 & (0.00321) & 2.16011 & 0.03082 \\ \\ 
\textbf{Quantity} & -0.01471 & (0.00336) & -4.38298 & 1e-05 \\ \\ 
\textbf{Cooler} & 0.01705 & (0.00302) & 5.64209 & 0.0 \\ \\ 
\textbf{Escalated} & -0.02499 & (0.00282) & -8.86033 & 0.0 \\ \\ 
\textbf{Monopoly} & 0.00771 & (0.00626) & 1.23144 & 0.21823 \\ \\ 
\textbf{No. Bidders} & 0.00611 & (0.00154) & 3.95282 & 8e-05 \\ \\ 
\textbf{Waco} & -0.07426 & (0.00424) & -17.50899 & 0.0 \\ \\ 
\t



In [10]:
est2 = estimate(reg1['LSCORE'],sm.add_constant(reg1[covariates]),2)
write_table('results/prelim_results.tex', est2, labels=(nice_ww, nice_cov))

\small 
\begin{tabular}{lclc} 
\hline 
\textbf{Dep. Variable:} & Bids (log-log) & \textbf{  R-squared: } &  0.436 \\ 
\textbf{No. Observations:} & 4040 & \textbf{ AIC:} & -8024.0 \\ 
\end{tabular} 

\begin{tabular}{lcccc} 
\hline 
\textbf{Phase 1} & \textbf{Estimate} & \textbf{Std. Error} &\textbf{t} & \textbf{P $>$ $|$ t $|$} \\ 
\hline \\ 
\textbf{Pr(phase 1)} & 0.30682  & 0.24155 & & \\ \\ 
\textbf{(Intercept)} & -2.77158 & (0.07732) & -35.84615 & 0.0 \\ \\ 
\textbf{Raw milk} & 0.40316 & (0.02803) & 14.38359 & 0.0 \\ \\ 
\textbf{Gas} & 0.016 & (0.00452) & 3.53681 & 0.00041 \\ \\ 
\textbf{Population} & -0.00458 & (0.00303) & -1.50882 & 0.13142 \\ \\ 
\textbf{Quantity} & -0.00951 & (0.00319) & -2.98142 & 0.00289 \\ \\ 
\textbf{Cooler} & 0.01683 & (0.00318) & 5.29592 & 0.0 \\ \\ 
\textbf{Escalated} & -0.01158 & (0.00289) & -4.01229 & 6e-05 \\ \\ 
\textbf{Monopoly} & 0.01148 & (0.00599) & 1.91799 & 0.05518 \\ \\ 
\textbf{No. Bidders} & 0.01316 & (0.00159) & 8.2834 & 0.0 \\ \\ 
\textbf{W

In [11]:
punish = est2[-2].mean(axis=0).argmin()

classes = reg1.copy()[['SYSTEM','FMOZONE','YEAR','MONTH','DAY']]

classes['classes'] = 1.*(est2[-2][:,punish]>.5)
classes['prob'] =  1.*(est2[-2][:,punish])
classes['SCORE'] = np.exp(reg1['LSCORE'])
classes.to_csv('data/classes.csv')

est3 = estimate(classes['classes'],sm.add_constant(reg1[hist[1:]]),1)
write_table('results/phase_res.tex', est3, labels=('Punishment', ['(Intercept)'] + nice_lags[1:]))

\small 
\begin{tabular}{lclc} 
\hline 
\textbf{Dep. Variable:} & Punishment & \textbf{  R-squared: } &  0.015 \\ 
\textbf{No. Observations:} & 4040 & \textbf{ AIC:} & -8058.0 \\ 
\end{tabular} 

\begin{tabular}{lcccc} 
\hline 
\textbf{Phase 1} & \textbf{Estimate} & \textbf{Std. Error} &\textbf{t} & \textbf{P $>$ $|$ t $|$} \\ 
\hline \\ 
\textbf{(Intercept)} & -0.83762 & (0.20403) & -4.10529 & 4e-05 \\ \\ 
\textbf{Min. lag \#1} & 0.00267 & (0.06505) & 0.04104 & 0.96726 \\ \\ 
\textbf{Min. lag \#2} & -0.26284 & (0.06569) & -4.0013 & 6e-05 \\ \\ 
\textbf{Min. lag \#3} & -0.18554 & (0.0662) & -2.80259 & 0.00509 \\ \\ 
\textbf{Min. lag \#4} & -0.0121 & (0.06616) & -0.18283 & 0.85494 \\ \\ 
\textbf{Min. lag \#5} & -0.26809 & (0.06459) & -4.1507 & 3e-05 \\ \\ 
\textbf{Max. lag \#1} & -0.14741 & (0.06799) & -2.16823 & 0.0302 \\ \\ 
\textbf{Max. lag \#2} & 0.16855 & (0.06706) & 2.51351 & 0.01199 \\ \\ 
\textbf{Max. lag \#3} & -0.06614 & (0.06732) & -0.98259 & 0.32587 \\ \\ 
\textbf{Max. lag \#



In [14]:
test1 = estimate(Y, sm.add_constant(X), 2)
test2 = estimate(Y, sm.add_constant(X), 1)

def nonnested_test(model1,model2):
    """test for non nested models quang vuong"""
    
    params1, se1, r21, y1, x1, ncomp1, classes1, ll1 = model1
    params2, se2, r22, y2, x2, ncomp2, classes2, ll2 = model2
    nobs, k = x1.shape
    
    k1 = params1.shape[1]*ncomp1 - 1 
    k2 = params2.shape[1]*ncomp2 - 1
    
    var1 = (ll1 -ll2).std()
    test1 = (ll1.sum() - ll2.sum() - k1 + k2)*nobs**(-.5)
    test1 = test1/var1
    p1 = 1 - stats.t.cdf(np.abs(test1),df=(nobs-k1-k2)) + stats.t.cdf(-np.abs(test1),df=(nobs-k1-k2))
    
    var2 =  ((ll1 - ll2)**2).mean()**.5
    test2 = (ll1.sum() - ll2.sum() - k1 + k2 )*nobs**(-.5)
    test2 = test2/var2
    p2 = 1 - stats.t.cdf(np.abs(test2),df=(nobs-k1-k2)) + stats.t.cdf(-np.abs(test2),df=(nobs-k1-k2))
    
    return test1, test2, p1, p2

print(nonnested_test(est2,est1))

(6.478109281955905, 6.443351593498305, 1.0418530351974065e-10, 1.3077738496458997e-10)


In [15]:
def write_nonnested(model1,model2,fname):
    test1, test2, p1, p2 = nonnested_test(model1,model2)
    test1, test2, p1, p2 = np.round(test1,5), np.round(test2,5), np.round(p1,5), np.round(p2,5) 
    f = open(fname, "w+")
    f.write('\\begin{tabular}{lcc}')
    f.write('\n\\hline \n & \\textbf{t} & \\textbf{P $>$ $|$ t $|$} \\\\')
    f.write('\n\\hline')
    f.write('\n\\textbf{Test 1} & %s & %s \\\\'%(test1,p1))
    f.write('\n\\textbf{Test 2} & %s & %s \\\\'%(test2,p2))
    f.write('\\hline \\\\ \n')   
    f.write('\n\\end{tabular}\n')
    f.close()

write_nonnested(est2,est1,'results/test_stat.tex')