In [1]:
import pandas as pd
import math
import numpy as np
import statsmodels.api as sm
from statsmodels.base.model import GenericLikelihoodModel
import matplotlib.pyplot as plt
from scipy import stats
from scipy import cluster

In [2]:
n = 1000
beta01, beta11 = 5,-3
beta02, beta12 = 2, 4

#set up regression mixture
x1 = np.random.uniform(0, 10, size=400)
x2 = np.random.uniform(0, 10, size=600)

y1 = beta01 + beta11*x1 + np.random.normal(scale=5.0, size=400)
y2 = beta02 + beta12*x2 + np.random.normal(scale=4.0,size=600)

X = np.concatenate([x1, x2])
Y = np.concatenate([y1, y2])


#set up 2 component mixture
a1 = np.random.normal(2, 5, size=600)
a2 = np.random.normal(5, 3, size=400)
a = np.concatenate([a1,a2])

In [3]:
def e_step(y,x,params): 
    y, x = np.array(y), np.array(x)
    nobs, k = x.shape
    weights = []
    for param in params:

        sigma = param[-1]
        beta = np.tile(param[1:-1],nobs).reshape(nobs, k)
        mean = (beta*x).sum(axis=1)
        weights.append( stats.norm.pdf(y, loc=mean, scale=sigma)*param[0] )
        
    #update loop variables
    weights = np.array(weights).transpose()
    denom = np.repeat(weights.sum(axis=1), len(params) ).reshape(nobs,len(params))
    weights = weights/denom
    return weights, np.log(denom[:,0])
        
    
def m_step(y,x,weights):
    y, x, weights = np.array(y), np.array(x), np.array(weights)
    nobs, k = x.shape
    params, se, err = [], [], 0

    for w in weights.transpose():
        
        lamb = w.mean()
        lamb_se = w.std()

        #beta
        w_mat = np.diag(w)
        xx_mat = np.linalg.inv( x.transpose().dot( w_mat).dot(x) )
        beta = xx_mat.dot(x.transpose().dot(w_mat)).dot(y)
        
        #sigma
        mu = np.tile(beta, nobs).reshape(nobs, k)*x
        weighted_err = w*(y - mu.sum(axis=1))**2
        sigma =  (weighted_err.sum()/w.sum())**.5

        #add component
        comp_param =np.concatenate(([lamb],beta,[sigma]))
        params.append(comp_param)

        #beta_se
        beta_se = (np.diagonal(xx_mat*sigma**2))**.5
        comp_se = np.concatenate(([lamb_se],beta_se))
        se.append(comp_se)

        #SSR
        err = err+weighted_err
    return np.array(params), np.array(se), 1-err.mean()/y.var()


def gen_weights(y,ncomp):
    c,labels = cluster.vq.kmeans2(y,ncomp)
    return np.array(pd.get_dummies(labels))


def estimate(y,x,ncomp):
    e = gen_weights(y,ncomp)
    m = None
    for i in range(20):
        m,se,r2 = m_step(y,x,e)
        e,ll = e_step(y,x,m)
    return m, se, r2, y, x, ncomp, e, ll


m, se, r2, y, x, ncomp, classes, ll = estimate(Y, sm.add_constant(X), 1)

In [4]:
def write_table(fname, estimates, labels=('y',None)):
    
    #unpack relevant information
    params, se, r2, y, x, ncomp, ll, classes = estimates
    nobs, k = x.shape
    ylabel, xlabel = labels
    
    #calc aic
    aic = 2*(params.shape[0]*params.shape[1]-2) - 2*ll.sum()
    aic = np.round(aic,1)
    
    if xlabel == None:
        xlabel =[]
        for i in range(k):
            xlabel.append('x%s'%i)   
    assert (k == len(xlabel)) 
    
    
    f = open(fname, "w+")
    
    f.write((
            '\\begin{tabular}{lclc} \n'+
            '\\hline \n'+
            '\\textbf{Dep. Variable:} & %s & \\textbf{  R-squared: } &  %s \\\\ \n'%(ylabel, np.round(r2,3))  ))
    
    f.write(('\\textbf{No. Observations:} & %s & \\textbf{ AIC:} & %s \\\\ \n'%(nobs,aic)+
                                                                                    
            '\end{tabular} \n'))
    
    
    f.write('\n\\begin{tabular}{lcccc} \n')
    for comp in range(ncomp):
        f.write('\\hline \n')
        f.write('\\textbf{Component %s} & \\textbf{Estimate} & \\textbf{Std. Error} &'%(1+comp)+ 
                '\\textbf{t} & \\textbf{P $>$ $|$ t $|$} \\\\ \n')
        f.write('\\hline \n')
        
        #isolate params
        comp_params = params[comp]
        comp_se = se[comp]
        comp_t = comp_params[:-1]/comp_se
        comp_p = 1 - stats.t.cdf(np.abs(comp_t),df=(nobs-k)) + stats.t.cdf(-np.abs(comp_t),df=(nobs-k))
        
        #round everything
        comp_params = np.round(comp_params,5)
        comp_se = np.round(comp_se,5)
        comp_t = np.round(comp_t,5)
        comp_p = np.round(comp_p,5)
        
        lamb, lamb_se = comp_params[0], comp_se[0]
        #lamb_t, lamb_p = comp_params[0],  comp_t[0], comp_p[0]
        beta, beta_se, beta_t, beta_p = comp_params[1:-1], comp_se[1:], comp_t[1:], comp_p[1:]
        sigma = comp_params[-1]
        
        if ncomp > 1:
            f.write('\\textbf{Weight %s} & %.3f  & %.3f & & \\\\  \n'%(comp+1, lamb, lamb_se) )
        
        for i in range(k):
            f.write('\\textbf{%s} & %.3f & (%.3f) & %.3f & %.3f \\\\ \n'%(xlabel[i],beta[i],beta_se[i],
                                                                             beta_t[i],beta_p[i]) )
        
        #f.write('\\textbf{Variance} & %s &  & & \\\\ \\\\ \n'%(sigma) )
    f.write('\\hline \\\\ \n')    
    f.write('\end{tabular} \n')
    f.close()
    
    #print output
    f = open(fname, "r")
    print(f.read())
    f.close()

In [5]:
reg1 = pd.read_csv('data/clean_milk1.csv')
print(reg1.columns)

#variables names
lmilk = ['LSCORE']
auct_key = ['YEAR','MONTH','DAY','SYSTEM','FMOZONE']
lcts = ['LFMO','LGAS','LPOPUL','LQSCORE']#,'LMEALS']
dummies = ['COOLER','ESC', 'NUM']
fekeys = ['3','6','7','9']

maxlag = 4
limitedlag = [l+str(i) for l in ['LSCORE_min','LSCORE_max'] for i in range(1,1+maxlag)]

maxlagy = 2
limitedlagy = [l+str(i) for l in ['LSCORE_miny','LSCORE_maxy'] for i in range(1,1+maxlagy)]


bid_key = auct_key + ['VENDOR'] + ['COUNTY']
covariates = lcts + dummies + fekeys
hist = ['INC','PAST_AUCT','min*past'] + limitedlag
all_hist = hist + limitedlagy 

print(covariates)
print(hist)
print(all_hist)

Index([u'Unnamed: 0', u'YEAR', u'MONTH', u'DAY', u'SYSTEM', u'FMOZONE',
       u'VENDOR', u'COUNTY', u'LSCORE', u'LFMO', u'LGAS', u'LPOPUL',
       u'LQSCORE', u'COOLER', u'ESC', u'NUM', u'3', u'6', u'7', u'9',
       u'PAST_AUCT', u'min*past', u'LSCORE_min1', u'LSCORE_min2',
       u'LSCORE_min3', u'LSCORE_min4', u'LSCORE_max1', u'LSCORE_max2',
       u'LSCORE_max3', u'LSCORE_max4', u'LSCORE_miny1', u'LSCORE_miny2',
       u'LSCORE_maxy1', u'LSCORE_maxy2', u'INC', u'WIN'],
      dtype='object')
['LFMO', 'LGAS', 'LPOPUL', 'LQSCORE', 'COOLER', 'ESC', 'NUM', '3', '6', '7', '9']
['INC', 'PAST_AUCT', 'min*past', 'LSCORE_min1', 'LSCORE_min2', 'LSCORE_min3', 'LSCORE_min4', 'LSCORE_max1', 'LSCORE_max2', 'LSCORE_max3', 'LSCORE_max4']
['INC', 'PAST_AUCT', 'min*past', 'LSCORE_min1', 'LSCORE_min2', 'LSCORE_min3', 'LSCORE_min4', 'LSCORE_max1', 'LSCORE_max2', 'LSCORE_max3', 'LSCORE_max4', 'LSCORE_miny1', 'LSCORE_miny2', 'LSCORE_maxy1', 'LSCORE_maxy2']


In [6]:
nice_ww = 'Bids (log-log)'
nice_cov = ['(Intercept)', 'Raw milk', 'Gas',
            'Population', 'Quantity', #'Meals',
            'Cooler', 'Escalated','No. Bidders', #+ fekeys
            'Waco','St. Angelo', 'Austin', 'San Antonio']

nice_lags = [l+str(i) for l in ['Min at auction t-', 'Max at auction t-'] for i in range(1,1+maxlag)]
nice_lags = ['Incumbency','Past auction', 'Min * Past Auction'] + nice_lags
nice_lagsy = [l+str(i) for l in ['Min in year t-', 'Max in year t-'] for i in range(1,1+maxlagy)]

all_lags = nice_lags + nice_lagsy

In [7]:
#baseline ols

est1 = estimate(reg1['LSCORE'],sm.add_constant(reg1[covariates]),1)
write_table('results/ols_results.tex', est1, labels=(nice_ww, nice_cov))

\begin{tabular}{lclc} 
\hline 
\textbf{Dep. Variable:} & Bids (log-log) & \textbf{  R-squared: } &  0.166 \\ 
\textbf{No. Observations:} & 3153 & \textbf{ AIC:} & -6282.0 \\ 
\end{tabular} 

\begin{tabular}{lcccc} 
\hline 
\textbf{Component 1} & \textbf{Estimate} & \textbf{Std. Error} &\textbf{t} & \textbf{P $>$ $|$ t $|$} \\ 
\hline 
\textbf{(Intercept)} & -2.268 & (0.085) & -26.665 & 0.000 \\ 
\textbf{Raw milk} & 0.223 & (0.030) & 7.319 & 0.000 \\ 
\textbf{Gas} & 0.009 & (0.005) & 1.652 & 0.099 \\ 
\textbf{Population} & 0.016 & (0.004) & 3.861 & 0.000 \\ 
\textbf{Quantity} & -0.021 & (0.004) & -4.947 & 0.000 \\ 
\textbf{Cooler} & 0.018 & (0.004) & 4.993 & 0.000 \\ 
\textbf{Escalated} & -0.029 & (0.003) & -8.912 & 0.000 \\ 
\textbf{No. Bidders} & 0.008 & (0.002) & 4.718 & 0.000 \\ 
\textbf{Waco} & -0.070 & (0.005) & -13.618 & 0.000 \\ 
\textbf{St. Angelo} & -0.041 & (0.015) & -2.691 & 0.007 \\ 
\textbf{Austin} & -0.107 & (0.017) & -6.429 & 0.000 \\ 
\textbf{San Antonio} & -0.052 & (0.



In [8]:
est2 = estimate(reg1['LSCORE'],sm.add_constant(reg1[covariates + ['INC']]),1)
write_table('results/inc_results.tex', est2, labels=(nice_ww, nice_cov + ['Incumbency']))

\begin{tabular}{lclc} 
\hline 
\textbf{Dep. Variable:} & Bids (log-log) & \textbf{  R-squared: } &  0.183 \\ 
\textbf{No. Observations:} & 3153 & \textbf{ AIC:} & -6280.0 \\ 
\end{tabular} 

\begin{tabular}{lcccc} 
\hline 
\textbf{Component 1} & \textbf{Estimate} & \textbf{Std. Error} &\textbf{t} & \textbf{P $>$ $|$ t $|$} \\ 
\hline 
\textbf{(Intercept)} & -2.238 & (0.084) & -26.570 & 0.000 \\ 
\textbf{Raw milk} & 0.217 & (0.030) & 7.196 & 0.000 \\ 
\textbf{Gas} & 0.009 & (0.005) & 1.752 & 0.080 \\ 
\textbf{Population} & 0.015 & (0.004) & 3.785 & 0.000 \\ 
\textbf{Quantity} & -0.021 & (0.004) & -4.882 & 0.000 \\ 
\textbf{Cooler} & 0.017 & (0.004) & 4.939 & 0.000 \\ 
\textbf{Escalated} & -0.030 & (0.003) & -9.304 & 0.000 \\ 
\textbf{No. Bidders} & 0.005 & (0.002) & 3.178 & 0.002 \\ 
\textbf{Waco} & -0.071 & (0.005) & -13.976 & 0.000 \\ 
\textbf{St. Angelo} & -0.041 & (0.015) & -2.694 & 0.007 \\ 
\textbf{Austin} & -0.106 & (0.016) & -6.471 & 0.000 \\ 
\textbf{San Antonio} & -0.054 & (0.



In [9]:
inc_cov =  pd.DataFrame()

for inc_type in ['Inc', '(1 - Inc)']:
    if inc_type == 'Inc':
        inc_cov['Inc'] = reg1['INC']
    else:
        inc_cov['(1 - Inc)'] = (1 - reg1['INC'])
    for i in range(len(covariates)):
        inc_cov[ inc_type + ' * ' + nice_cov[i+1] ] = reg1[covariates[i]]* inc_cov[inc_type]

    
print(inc_cov.columns)

est25 = estimate(reg1['LSCORE'],inc_cov,1)
write_table('results/inc2_results.tex', est25, labels=(nice_ww, list(inc_cov.columns)))

Index([u'Inc', u'Inc * Raw milk', u'Inc * Gas', u'Inc * Population',
       u'Inc * Quantity', u'Inc * Cooler', u'Inc * Escalated',
       u'Inc * No. Bidders', u'Inc * Waco', u'Inc * St. Angelo',
       u'Inc * Austin', u'Inc * San Antonio', u'(1 - Inc)',
       u'(1 - Inc) * Raw milk', u'(1 - Inc) * Gas', u'(1 - Inc) * Population',
       u'(1 - Inc) * Quantity', u'(1 - Inc) * Cooler',
       u'(1 - Inc) * Escalated', u'(1 - Inc) * No. Bidders',
       u'(1 - Inc) * Waco', u'(1 - Inc) * St. Angelo', u'(1 - Inc) * Austin',
       u'(1 - Inc) * San Antonio'],
      dtype='object')
\begin{tabular}{lclc} 
\hline 
\textbf{Dep. Variable:} & Bids (log-log) & \textbf{  R-squared: } &  0.213 \\ 
\textbf{No. Observations:} & 3153 & \textbf{ AIC:} & -6258.0 \\ 
\end{tabular} 

\begin{tabular}{lcccc} 
\hline 
\textbf{Component 1} & \textbf{Estimate} & \textbf{Std. Error} &\textbf{t} & \textbf{P $>$ $|$ t $|$} \\ 
\hline 
\textbf{Inc} & -2.294 & (0.164) & -14.009 & 0.000 \\ 
\textbf{Inc * Raw mil



In [10]:
#swtiching regression
est3 = estimate(reg1['LSCORE'],sm.add_constant(reg1[covariates]),2)
write_table('results/prelim_results.tex', est3, labels=(nice_ww, nice_cov))

\begin{tabular}{lclc} 
\hline 
\textbf{Dep. Variable:} & Bids (log-log) & \textbf{  R-squared: } &  0.455 \\ 
\textbf{No. Observations:} & 3153 & \textbf{ AIC:} & -6254.0 \\ 
\end{tabular} 

\begin{tabular}{lcccc} 
\hline 
\textbf{Component 1} & \textbf{Estimate} & \textbf{Std. Error} &\textbf{t} & \textbf{P $>$ $|$ t $|$} \\ 
\hline 
\textbf{Weight 1} & 0.648  & 0.263 & & \\  
\textbf{(Intercept)} & -2.141 & (0.098) & -21.837 & 0.000 \\ 
\textbf{Raw milk} & 0.177 & (0.035) & 5.028 & 0.000 \\ 
\textbf{Gas} & -0.010 & (0.006) & -1.692 & 0.091 \\ 
\textbf{Population} & 0.024 & (0.005) & 4.892 & 0.000 \\ 
\textbf{Quantity} & -0.022 & (0.005) & -4.289 & 0.000 \\ 
\textbf{Cooler} & 0.017 & (0.004) & 4.087 & 0.000 \\ 
\textbf{Escalated} & -0.027 & (0.004) & -6.995 & 0.000 \\ 
\textbf{No. Bidders} & 0.004 & (0.002) & 2.011 & 0.044 \\ 
\textbf{Waco} & -0.018 & (0.006) & -3.173 & 0.002 \\ 
\textbf{St. Angelo} & -0.028 & (0.018) & -1.580 & 0.114 \\ 
\textbf{Austin} & -0.006 & (0.020) & -0.287 & 

In [11]:
test1 = estimate(Y, sm.add_constant(X), 2)
test2 = estimate(Y, sm.add_constant(X), 1)

def nonnested_test(model1,model2):
    """test for non nested models quang vuong"""
    
    params1, se1, r21, y1, x1, ncomp1, classes1, ll1 = model1
    params2, se2, r22, y2, x2, ncomp2, classes2, ll2 = model2
    nobs, k = x1.shape
    
    k1 = params1.shape[1]*ncomp1 - 1 
    k2 = params2.shape[1]*ncomp2 - 1
    
    var1 = (ll1 -ll2).std()
    test1 = (ll1.sum() - ll2.sum() - k1 + k2)*nobs**(-.5)
    test1 = test1/var1
    p1 = 1 - stats.t.cdf(np.abs(test1),df=(nobs-k1-k2)) + stats.t.cdf(-np.abs(test1),df=(nobs-k1-k2))
    
    var2 =  ((ll1 - ll2)**2).mean()**.5
    test2 = (ll1.sum() - ll2.sum() - k1 + k2 )*nobs**(-.5)
    test2 = test2/var2
    p2 = 1 - stats.t.cdf(np.abs(test2),df=(nobs-k1-k2)) + stats.t.cdf(-np.abs(test2),df=(nobs-k1-k2))
    
    return test1, test2, p1, p2

print(nonnested_test(est3,est25))

(7.0815860694528965, 7.0248270874972345, 1.755542191521635e-12, 2.6233976093459843e-12)


In [15]:
def write_nonnested(model1,model2,fname):
    test1, test2, p1, p2 = nonnested_test(model1,model2)
    test1, test2, p1, p2 = np.round(test1,5), np.round(test2,5), np.round(p1,5), np.round(p2,5) 
    f = open(fname, "w+")
    f.write('\\begin{tabular}{lcc}')
    f.write('\n\\hline \n & \\textbf{t} & \\textbf{P $>$ $|$ t $|$} \\\\')
    f.write('\n\\hline')
    f.write('\n\\textbf{Test 1} & %s & %s \\\\'%(test1,p1))
    f.write('\n\\textbf{Test 2} & %s & %s \\\\'%(test2,p2))
    f.write('\\hline \\\\ \n')   
    f.write('\n\\end{tabular}\n')
    f.close()

write_nonnested(est25,est3,'results/test_stat.tex')

In [13]:
#historical covariates

est4 = estimate(reg1['LSCORE'],sm.add_constant(reg1[covariates + hist]),1)

print(hist)
write_table('results/hist_results.tex', est4, labels=(nice_ww, nice_cov + nice_lags))

['INC', 'PAST_AUCT', 'min*past', 'LSCORE_min1', 'LSCORE_min2', 'LSCORE_min3', 'LSCORE_min4', 'LSCORE_max1', 'LSCORE_max2', 'LSCORE_max3', 'LSCORE_max4']
\begin{tabular}{lclc} 
\hline 
\textbf{Dep. Variable:} & Bids (log-log) & \textbf{  R-squared: } &  0.224 \\ 
\textbf{No. Observations:} & 3153 & \textbf{ AIC:} & -6260.0 \\ 
\end{tabular} 

\begin{tabular}{lcccc} 
\hline 
\textbf{Component 1} & \textbf{Estimate} & \textbf{Std. Error} &\textbf{t} & \textbf{P $>$ $|$ t $|$} \\ 
\hline 
\textbf{(Intercept)} & -1.471 & (0.118) & -12.424 & 0.000 \\ 
\textbf{Raw milk} & 0.137 & (0.030) & 4.525 & 0.000 \\ 
\textbf{Gas} & 0.011 & (0.005) & 2.142 & 0.032 \\ 
\textbf{Population} & 0.009 & (0.004) & 2.197 & 0.028 \\ 
\textbf{Quantity} & -0.017 & (0.004) & -4.032 & 0.000 \\ 
\textbf{Cooler} & 0.016 & (0.003) & 4.578 & 0.000 \\ 
\textbf{Escalated} & -0.029 & (0.003) & -8.948 & 0.000 \\ 
\textbf{No. Bidders} & 0.003 & (0.002) & 2.068 & 0.039 \\ 
\textbf{Waco} & -0.073 & (0.005) & -14.730 & 0.000 \\



In [14]:
#robust history with prev years

est5 = estimate(reg1['LSCORE'],sm.add_constant(reg1[covariates + all_hist]),1)
write_table('results/hist_results_robust.tex', est5, labels=(nice_ww, nice_cov + all_lags))

\begin{tabular}{lclc} 
\hline 
\textbf{Dep. Variable:} & Bids (log-log) & \textbf{  R-squared: } &  0.447 \\ 
\textbf{No. Observations:} & 3153 & \textbf{ AIC:} & -6252.0 \\ 
\end{tabular} 

\begin{tabular}{lcccc} 
\hline 
\textbf{Component 1} & \textbf{Estimate} & \textbf{Std. Error} &\textbf{t} & \textbf{P $>$ $|$ t $|$} \\ 
\hline 
\textbf{(Intercept)} & -0.890 & (0.103) & -8.618 & 0.000 \\ 
\textbf{Raw milk} & 0.313 & (0.026) & 11.903 & 0.000 \\ 
\textbf{Gas} & -0.017 & (0.005) & -3.667 & 0.000 \\ 
\textbf{Population} & 0.008 & (0.003) & 2.257 & 0.024 \\ 
\textbf{Quantity} & -0.013 & (0.004) & -3.768 & 0.000 \\ 
\textbf{Cooler} & 0.000 & (0.003) & 0.035 & 0.972 \\ 
\textbf{Escalated} & -0.020 & (0.003) & -7.450 & 0.000 \\ 
\textbf{No. Bidders} & 0.004 & (0.001) & 2.712 & 0.007 \\ 
\textbf{Waco} & -0.031 & (0.004) & -6.960 & 0.000 \\ 
\textbf{St. Angelo} & -0.030 & (0.013) & -2.360 & 0.018 \\ 
\textbf{Austin} & -0.011 & (0.014) & -0.766 & 0.444 \\ 
\textbf{San Antonio} & -0.012 & (0

