In [1]:
import pandas as pd
import math
import numpy as np
import statsmodels.api as sm
from statsmodels.base.model import GenericLikelihoodModel
import matplotlib.pyplot as plt
from scipy import stats
from scipy import cluster

In [2]:
n = 1000
beta01, beta11 = 5,-3
beta02, beta12 = 2, 4

#set up regression mixture
x1 = np.random.uniform(0, 10, size=400)
x2 = np.random.uniform(0, 10, size=600)

y1 = beta01 + beta11*x1 + np.random.normal(scale=5.0, size=400)
y2 = beta02 + beta12*x2 + np.random.normal(scale=4.0,size=600)

X = np.concatenate([x1, x2])
Y = np.concatenate([y1, y2])


#set up 2 component mixture
a1 = np.random.normal(2, 5, size=600)
a2 = np.random.normal(5, 3, size=400)
a = np.concatenate([a1,a2])

In [3]:
def e_step(y,x,params): 
    y, x = np.array(y), np.array(x)
    nobs, k = x.shape
    weights = []
    for param in params:

        sigma = param[-1]
        beta = np.tile(param[1:-1],nobs).reshape(nobs, k)
        mean = (beta*x).sum(axis=1)
        weights.append( stats.norm.pdf(y, loc=mean, scale=sigma)*param[0] )
        
    #update loop variables
    weights = np.array(weights).transpose()
    denom = np.repeat(weights.sum(axis=1), len(params) ).reshape(nobs,len(params))
    weights = weights/denom
    return weights, np.log(denom[:,0])
        
    
def m_step(y,x,weights):
    y, x, weights = np.array(y), np.array(x), np.array(weights)
    nobs, k = x.shape
    params, se, err = [], [], 0

    for w in weights.transpose():
        
        lamb = w.mean()
        lamb_se = w.std()

        #beta
        w_mat = np.diag(w)
        xx_mat = np.linalg.inv( x.transpose().dot( w_mat).dot(x) )
        beta = xx_mat.dot(x.transpose().dot(w_mat)).dot(y)
        
        #sigma
        mu = np.tile(beta, nobs).reshape(nobs, k)*x
        weighted_err = w*(y - mu.sum(axis=1))**2
        sigma =  (weighted_err.sum()/w.sum())**.5

        #add component
        comp_param =np.concatenate(([lamb],beta,[sigma]))
        params.append(comp_param)

        #beta_se
        beta_se = (np.diagonal(xx_mat*sigma**2))**.5
        comp_se = np.concatenate(([lamb_se],beta_se))
        se.append(comp_se)

        #SSR
        err = err+weighted_err
    return np.array(params), np.array(se), 1-err.mean()/y.var()


def gen_weights(y,ncomp):
    c,labels = cluster.vq.kmeans2(y,ncomp)
    return np.array(pd.get_dummies(labels))


def estimate(y,x,ncomp):
    e = gen_weights(y,ncomp)
    m = None
    for i in range(20):
        m,se,r2 = m_step(y,x,e)
        e,ll = e_step(y,x,m)
    return m, se, r2, y, x, ncomp, e, ll


m, se, r2, y, x, ncomp, classes, ll = estimate(Y, sm.add_constant(X), 1)

In [4]:
def write_table(fname, estimates, labels=('y',None)):
    
    #unpack relevant information
    params, se, r2, y, x, ncomp, ll, classes = estimates
    nobs, k = x.shape
    ylabel, xlabel = labels
    
    #calc aic
    aic = 2*(params.shape[0]*params.shape[1]-2) - 2*ll.sum()
    aic = np.round(aic,1)
    
    if xlabel == None:
        xlabel =[]
        for i in range(k):
            xlabel.append('x%s'%i)   
    assert (k == len(xlabel)) 
    
    
    f = open(fname, "w+")
    
    f.write((
            '\\begin{tabular}{lclc} \n'+
            '\\hline \n'+
            '\\textbf{Dep. Variable:} & %s & \\textbf{  R-squared: } &  %s \\\\ \n'%(ylabel, np.round(r2,4))  ))
    
    f.write(('\\textbf{No. Observations:} & %s & \\textbf{ AIC:} & %s \\\\ \n'%(nobs,aic)+
                                                                                    
            '\end{tabular} \n'))
    
    
    f.write('\n\\begin{tabular}{lcccc} \n')
    for comp in range(ncomp):
        f.write('\\hline \n')
        
        if ncomp ==1 :
            f.write(' & \\textbf{Estimate} & \\textbf{Std. Error} &'+ 
                '\\textbf{t} & \\textbf{P $>$ $|$ t $|$} \\\\ \n')
        else:
            f.write('\\textbf{Component %s} & \\textbf{Estimate} & \\textbf{Std. Error} &'%(1+comp)+ 
                '\\textbf{t} & \\textbf{P $>$ $|$ t $|$} \\\\ \n')
            
        f.write('\\hline \n')
        
        #isolate params
        comp_params = params[comp]
        comp_se = se[comp]
        comp_t = comp_params[:-1]/comp_se
        comp_p = 1 - stats.t.cdf(np.abs(comp_t),df=(nobs-k)) + stats.t.cdf(-np.abs(comp_t),df=(nobs-k))
        
        #round everything
        comp_params = np.round(comp_params,5)
        comp_se = np.round(comp_se,5)
        comp_t = np.round(comp_t,5)
        comp_p = np.round(comp_p,5)
        
        lamb, lamb_se = comp_params[0], comp_se[0]
        #lamb_t, lamb_p = comp_params[0],  comp_t[0], comp_p[0]
        beta, beta_se, beta_t, beta_p = comp_params[1:-1], comp_se[1:], comp_t[1:], comp_p[1:]
        sigma = comp_params[-1]
        
        if ncomp > 1:
            f.write('\\textbf{Weight %s} & %.4f  & %.4f & & \\\\  \n'%(comp+1, lamb, lamb_se) )
        
        for i in range(k):
            f.write('\\textbf{%s} & %.4f & (%.4f) & %.4f & %.4f \\\\ \n'%(xlabel[i],beta[i],beta_se[i],
                                                                             beta_t[i],beta_p[i]) )
        
        #f.write('\\textbf{Variance} & %s &  & & \\\\ \\\\ \n'%(sigma) )
    f.write('\\hline \\\\ \n')    
    f.write('\end{tabular} \n')
    f.close()
    
    #print output
    f = open(fname, "r")
    print(f.read())
    f.close()

In [5]:
#variables names
lmilk = ['LSCORE']
auct_key = ['YEAR','MONTH','DAY','SYSTEM','FMOZONE']
lcts = ['LFMO','LGAS','LPOPUL','LQSCORE']#,'LMEALS']
dummies = ['COOLER','ESC', 'NUM']
fekeys = ['3','6','7','9']


bid_key = auct_key + ['VENDOR'] + ['COUNTY']
covariates = lcts + dummies + fekeys

reg0 = pd.read_csv('data/clean_milk0.csv')
reg1 = reg0.copy()[bid_key + lmilk + covariates + ['INC']]
reg1 = reg1.dropna()

In [6]:
nice_ww = 'Bids (log-log)'
nice_cov = ['(Intercept)', 'Raw milk', 'Gas',
            'Population', 'Quantity', #'Meals',
            'Cooler', 'Escalated','No. Bidders', #+ fekeys
            'Waco','St. Angelo', 'Austin', 'San Antonio']

In [7]:
#baseline ols

est1 = estimate(reg1['LSCORE'],sm.add_constant(reg1[covariates]),1)
write_table('results/ols_results.tex', est1, labels=(nice_ww, nice_cov))

  return ptp(axis=axis, out=out, **kwargs)


\begin{tabular}{lclc} 
\hline 
\textbf{Dep. Variable:} & Bids (log-log) & \textbf{  R-squared: } &  0.1633 \\ 
\textbf{No. Observations:} & 4056 & \textbf{ AIC:} & -8088.0 \\ 
\end{tabular} 

\begin{tabular}{lcccc} 
\hline 
 & \textbf{Estimate} & \textbf{Std. Error} &\textbf{t} & \textbf{P $>$ $|$ t $|$} \\ 
\hline 
\textbf{(Intercept)} & -2.2626 & (0.0768) & -29.4662 & 0.0000 \\ 
\textbf{Raw milk} & 0.2045 & (0.0282) & 7.2485 & 0.0000 \\ 
\textbf{Gas} & 0.0237 & (0.0043) & 5.5780 & 0.0000 \\ 
\textbf{Population} & 0.0153 & (0.0032) & 4.7198 & 0.0000 \\ 
\textbf{Quantity} & -0.0200 & (0.0034) & -5.8783 & 0.0000 \\ 
\textbf{Cooler} & 0.0180 & (0.0031) & 5.7958 & 0.0000 \\ 
\textbf{Escalated} & -0.0270 & (0.0029) & -9.3216 & 0.0000 \\ 
\textbf{No. Bidders} & 0.0055 & (0.0014) & 3.8540 & 0.0001 \\ 
\textbf{Waco} & -0.0706 & (0.0044) & -16.1964 & 0.0000 \\ 
\textbf{St. Angelo} & -0.0427 & (0.0129) & -3.3236 & 0.0009 \\ 
\textbf{Austin} & -0.0897 & (0.0149) & -6.0355 & 0.0000 \\ 
\textbf{Sa



In [8]:
est2 = estimate(reg1['LSCORE'],sm.add_constant(reg1[covariates + ['INC']]),1)
write_table('results/inc_results.tex', est2, labels=(nice_ww, nice_cov + ['Incumbency']))

\begin{tabular}{lclc} 
\hline 
\textbf{Dep. Variable:} & Bids (log-log) & \textbf{  R-squared: } &  0.1802 \\ 
\textbf{No. Observations:} & 4056 & \textbf{ AIC:} & -8086.0 \\ 
\end{tabular} 

\begin{tabular}{lcccc} 
\hline 
 & \textbf{Estimate} & \textbf{Std. Error} &\textbf{t} & \textbf{P $>$ $|$ t $|$} \\ 
\hline 
\textbf{(Intercept)} & -2.2448 & (0.0760) & -29.5241 & 0.0000 \\ 
\textbf{Raw milk} & 0.1998 & (0.0279) & 7.1528 & 0.0000 \\ 
\textbf{Gas} & 0.0251 & (0.0042) & 5.9672 & 0.0000 \\ 
\textbf{Population} & 0.0145 & (0.0032) & 4.5046 & 0.0000 \\ 
\textbf{Quantity} & -0.0188 & (0.0034) & -5.5759 & 0.0000 \\ 
\textbf{Cooler} & 0.0177 & (0.0031) & 5.7561 & 0.0000 \\ 
\textbf{Escalated} & -0.0281 & (0.0029) & -9.7850 & 0.0000 \\ 
\textbf{No. Bidders} & 0.0029 & (0.0014) & 1.9762 & 0.0482 \\ 
\textbf{Waco} & -0.0718 & (0.0043) & -16.6312 & 0.0000 \\ 
\textbf{St. Angelo} & -0.0407 & (0.0127) & -3.1965 & 0.0014 \\ 
\textbf{Austin} & -0.0904 & (0.0147) & -6.1419 & 0.0000 \\ 
\textbf{Sa



In [9]:
inc_cov =  pd.DataFrame()

for inc_type in ['Inc', '(1 - Inc)']:
    if inc_type == 'Inc':
        inc_cov['Inc'] = reg1['INC']
    else:
        inc_cov['(1 - Inc)'] = (1 - reg1['INC'])
    for i in range(len(covariates)):
        inc_cov[ inc_type + ' * ' + nice_cov[i+1] ] = reg1[covariates[i]]* inc_cov[inc_type]

    
print(inc_cov.columns)

est25 = estimate(reg1['LSCORE'],inc_cov,1)
write_table('results/inc2_results.tex', est25, labels=(nice_ww, list(inc_cov.columns)))

Index(['Inc', 'Inc * Raw milk', 'Inc * Gas', 'Inc * Population',
       'Inc * Quantity', 'Inc * Cooler', 'Inc * Escalated',
       'Inc * No. Bidders', 'Inc * Waco', 'Inc * St. Angelo', 'Inc * Austin',
       'Inc * San Antonio', '(1 - Inc)', '(1 - Inc) * Raw milk',
       '(1 - Inc) * Gas', '(1 - Inc) * Population', '(1 - Inc) * Quantity',
       '(1 - Inc) * Cooler', '(1 - Inc) * Escalated',
       '(1 - Inc) * No. Bidders', '(1 - Inc) * Waco', '(1 - Inc) * St. Angelo',
       '(1 - Inc) * Austin', '(1 - Inc) * San Antonio'],
      dtype='object')
\begin{tabular}{lclc} 
\hline 
\textbf{Dep. Variable:} & Bids (log-log) & \textbf{  R-squared: } &  0.2049 \\ 
\textbf{No. Observations:} & 4056 & \textbf{ AIC:} & -8064.0 \\ 
\end{tabular} 

\begin{tabular}{lcccc} 
\hline 
 & \textbf{Estimate} & \textbf{Std. Error} &\textbf{t} & \textbf{P $>$ $|$ t $|$} \\ 
\hline 
\textbf{Inc} & -2.4033 & (0.1498) & -16.0476 & 0.0000 \\ 
\textbf{Inc * Raw milk} & 0.2685 & (0.0553) & 4.8536 & 0.0000 \\ 
\



In [10]:
#swtiching regression
est3 = estimate(reg1['LSCORE'],sm.add_constant(reg1[covariates]),2)
write_table('results/prelim_results.tex', est3, labels=(nice_ww, nice_cov))

\begin{tabular}{lclc} 
\hline 
\textbf{Dep. Variable:} & Bids (log-log) & \textbf{  R-squared: } &  0.4344 \\ 
\textbf{No. Observations:} & 4056 & \textbf{ AIC:} & -8060.0 \\ 
\end{tabular} 

\begin{tabular}{lcccc} 
\hline 
\textbf{Component 1} & \textbf{Estimate} & \textbf{Std. Error} &\textbf{t} & \textbf{P $>$ $|$ t $|$} \\ 
\hline 
\textbf{Weight 1} & 0.6913  & 0.2427 & & \\  
\textbf{(Intercept)} & -2.1298 & (0.0850) & -25.0452 & 0.0000 \\ 
\textbf{Raw milk} & 0.1675 & (0.0314) & 5.3349 & 0.0000 \\ 
\textbf{Gas} & 0.0025 & (0.0046) & 0.5446 & 0.5861 \\ 
\textbf{Population} & 0.0254 & (0.0037) & 6.8061 & 0.0000 \\ 
\textbf{Quantity} & -0.0234 & (0.0039) & -5.9864 & 0.0000 \\ 
\textbf{Cooler} & 0.0150 & (0.0034) & 4.4124 & 0.0000 \\ 
\textbf{Escalated} & -0.0221 & (0.0032) & -6.8356 & 0.0000 \\ 
\textbf{No. Bidders} & 0.0007 & (0.0016) & 0.4604 & 0.6453 \\ 
\textbf{Waco} & -0.0282 & (0.0047) & -5.9694 & 0.0000 \\ 
\textbf{St. Angelo} & -0.0344 & (0.0139) & -2.4767 & 0.0133 \\ 
\text

In [11]:
test1 = estimate(Y, sm.add_constant(X), 2)
test2 = estimate(Y, sm.add_constant(X), 1)

def nonnested_test(model1,model2):
    """test for non nested models quang vuong"""
    
    params1, se1, r21, y1, x1, ncomp1, classes1, ll1 = model1
    params2, se2, r22, y2, x2, ncomp2, classes2, ll2 = model2
    nobs, k = x1.shape
    
    k1 = params1.shape[1]*ncomp1 - 1 
    k2 = params2.shape[1]*ncomp2 - 1
    
    var1 = (ll1 -ll2).std()
    test1 = (ll1.sum() - ll2.sum() - k1 + k2)*nobs**(-.5)
    test1 = test1/var1
    p1 = 1 - stats.t.cdf(np.abs(test1),df=(nobs-k1-k2)) + stats.t.cdf(-np.abs(test1),df=(nobs-k1-k2))
    
    var2 =  ((ll1 - ll2)**2).mean()**.5
    test2 = (ll1.sum() - ll2.sum() - k1 + k2 )*nobs**(-.5)
    test2 = test2/var2
    p2 = 1 - stats.t.cdf(np.abs(test2),df=(nobs-k1-k2)) + stats.t.cdf(-np.abs(test2),df=(nobs-k1-k2))
    
    return test1, test2, p1, p2

print(nonnested_test(est3,est25))

(6.797416115371161, 6.758274115107681, 1.2226890404786675e-11, 1.598117541364521e-11)


In [12]:
def write_nonnested(model1,model2,fname):
    test1, test2, p1, p2 = nonnested_test(model1,model2)
    test1, test2, p1, p2 = np.round(test1,4), np.round(test2,4), np.round(p1,4), np.round(p2,4) 
    f = open(fname, "w+")
    f.write('\\begin{tabular}{lcc}')
    f.write('\n\\hline \n & \\textbf{t} & \\textbf{P $>$ $|$ t $|$} \\\\')
    f.write('\n\\hline')
    f.write('\n\\textbf{Test 1} & %s & %s \\\\'%(test1,p1))
    f.write('\n\\textbf{Test 2} & %s & %s \\\\'%(test2,p2))
    f.write('\\hline \\\\ \n')   
    f.write('\n\\end{tabular}\n')
    f.close()

write_nonnested(est25,est3,'results/test_stat.tex')