In [1]:
import pandas as pd
import math
import numpy as np
import statsmodels.api as sm
from statsmodels.base.model import GenericLikelihoodModel
import matplotlib.pyplot as plt
from scipy import stats

In [2]:
n = 1000
beta01, beta11 = 5,-3
beta02, beta12 = 2, 4

#set up regression mixture
x1 = np.random.uniform(0, 10, size=400)
x2 = np.random.uniform(0, 10, size=600)

y1 = beta01 + beta11*x1 + np.random.normal(scale=5.0, size=400)
y2 = beta02 + beta12*x2 + np.random.normal(scale=4.0,size=600)

X = np.concatenate([x1, x2])
Y = np.concatenate([y1, y2])


#set up 2 component mixture
a1 = np.random.normal(2, 5, size=600)
a2 = np.random.normal(5, 3, size=400)
a = np.concatenate([a1,a2])

In [3]:
def e_step(y,x,params): 
    nobs, k = x.shape
    weights = []
    for param in params:
        sigma = param[-1]
        beta = np.tile(param[1:-1],nobs).reshape(nobs, k)
        mean = (beta*x).sum(axis=1)
        weights.append( stats.norm.pdf(y, loc=mean, scale=sigma) )

    #update loop variables
    weights = np.array(weights).transpose()
    denom = np.repeat(weights.sum(axis= 1),len(params)).reshape(nobs,len(params))
    weights = (weights/denom)
    return weights
        
    
def m_step(y,x,weights):
    nobs, k = x.shape
    params, se, err = [], [], 0
    
    #do lambda estimates first becaues of order
    lambs = weights.mean(axis=0)
    lamb_ses = (np.sort(weights)).std(axis=0)
    comp_order = stats.rankdata(weights.mean(axis=0),method='ordinal')
    lamb_ses = lamb_ses[comp_order-1]
    
    
    for i in comp_order:
        try:
            #component estimates
            w = weights[:,i-1]
            lamb = lambs[i-1]
            lamb_se = lamb_ses[i-1]

            #beta
            w_mat = np.diag(w)
            xx_mat = np.linalg.inv( x.transpose().dot( w_mat).dot(x) )
            beta = xx_mat.dot(x.transpose().dot(w_mat)).dot(y)

            #sigma
            mu = np.tile(beta, nobs).reshape(nobs, k)*x
            weighted_err = w*(y - mu.sum(axis=1))
            sigma =  weighted_err.std()

            #add component
            comp_param =np.concatenate(([lamb],beta,[sigma]))
            params.append(comp_param)

            #beta_se
            beta_se = np.diagonal(xx_mat*sigma**2)
            comp_se = np.concatenate(([lamb_se],beta_se))
            se.append(comp_se)

            #SSR
            err = err+weighted_err    
            
        except:        
            params.append(np.concatenate( (np.zeros(k+1),[1]) ))
            se.append( np.zeros(k+1) )
    
    return np.array(params), np.array(se), 1-err.var()/y.var()


def gen_weights(nobs,ncomp):
    weights = np.random.uniform(size=(nobs,ncomp))
    denom = np.repeat(weights.sum(axis= 1),ncomp).reshape(nobs,ncomp)
    return (weights/denom)


def estimate(y,x,ncomp):
    e = gen_weights(len(x),ncomp)
    m = None
    for i in range(15):
        m,se,r2 = m_step(y,x,e)
        e = e_step(y,x,m)
    return m, se, r2, y, x, ncomp

ests  = estimate(Y,sm.add_constant(X),2)
m, se, r2, y, x, ncomp = estimate(Y,sm.add_constant(X),2)

print m
print '----'
print se

[[ 0.40964911  5.75562117 -3.08604976  2.97988644]
 [ 0.59035089  1.55075102  4.0188137   2.71894873]]
----
[[0.12250049 0.08319029 0.00257814]
 [0.12250049 0.05596236 0.00152697]]


In [14]:
data = pd.read_csv('data/milk.csv')
reg1 = data[['WW','FMO','INC','ESC','ESTQTY','DEL','COOLER','NUMSCHL']].dropna()
reg1['QSTOP'] = reg1['ESTQTY']/(reg1['DEL']*reg1['NUMSCHL'])
reg1 = sm.add_constant(reg1)
reg1[['WW','FMO','QSTOP','DEL']] = np.log(reg1[['WW','FMO','QSTOP','DEL']])

est = estimate(np.array(reg1['WW']),np.array(reg1[['const','FMO','QSTOP','DEL']]),2)
print est[0]
print '-----'
print est[1]
print

[[ 2.92314996e-01 -5.65847508e+00  1.42371520e+00  2.34242118e-02
  -8.47684998e-02  1.29560714e-02]
 [ 7.07685004e-01 -2.02277657e+00  1.65106915e-01 -2.23316010e-02
   3.38890713e-03  8.77220414e-02]]
-----
[[1.63157016e-01 7.28920778e-04 7.89128400e-05 1.43088063e-06
  2.90470678e-06]
 [1.63157016e-01 1.04593319e-02 1.07892260e-03 2.46943010e-05
  3.49345995e-05]]



In [15]:
def write_table(fname, estimates, labels=('y',None)):
    
    #unpack relevant information
    params, se, r2, y, x, ncomp = estimates
    nobs, k = x.shape
    ylabel, xlabel = labels
    
    if xlabel == None:
        xlabel =[]
        for i in range(k):
            xlabel.append('x%s'%i)
            
    assert (k == len(xlabel)) 
    
    f = open(fname, "w+")
    
    f.write(('\\begin{center}  \n'+
            '\\begin{tabular}{lclc} \n'+
            '\\toprule \n'+
            '\\textbf{Dep. Variable:} & %s & \\textbf{  R-squared: } &  %s \\\\ \n'%(ylabel, np.round(r2,3))  ))
    
    f.write(('\\textbf{No. Observations:} & %s & & \\\\ \n'%nobs+
            '\end{tabular} \n'))
    
    
    f.write('\n\\begin{tabular}{lcccc} \n')
    for comp in range(ncomp):
        f.write('\\toprule \n')
        f.write('\\textbf{Regime %s} & \\textbf{est} & \\textbf{std err} &'%(1+comp)+ 
                '\\textbf{t} & \\textbf{P $>$ $|$ t $|$} \\\\ \n')
        f.write('\\bottomrule \\\\ \n')
        #isolate parasm
        comp_params = params[comp]
        comp_se = se[comp]
        comp_t = np.abs(comp_params[:-1]/comp_se)
        comp_p = 1 - stats.t.cdf(comp_t,df=nobs)
        
        #round everything
        comp_params = np.round(comp_params,5)
        comp_se = np.round(comp_se,5)
        comp_t = np.round(comp_t,5)
        comp_p = np.round(comp_p,5)
        
        lamb, lamb_se, lamb_t, lamb_p = comp_params[0], comp_se[0], comp_t[0], comp_p[0]
        beta, beta_se, beta_t, beta_p = comp_params[1:-1], comp_se[1:], comp_t[1:], comp_p[1:]
        sigma = comp_params[-1]
        
        
        f.write('\\textbf{lambda} & %s & (%s) & %s & %s \\\\ \\\\ \n'%(lamb,lamb_se,lamb_t,lamb_p) )
        
        for i in range(k):
            f.write('\\textbf{%s} & %s & (%s) & %s & %s \\\\ \\\\ \n'%(xlabel[i],beta[i],beta_se[i],
                                                                             beta_t[i],beta_p[i]) )
        
        f.write('\\textbf{sigma} & %s &  & & \\\\ \\\\ \n'%(sigma) )
    
    f.write('\end{tabular} \n'+
            '\end{center}\n')

write_table('out1.tex', ests)

In [17]:
write_table('results/prelim_results.tex', est, labels=('WW',['const','FMO','QSTOP','DEL']))

In [20]:
est1 = estimate(np.array(reg1['WW']),np.array(reg1[['const','FMO','QSTOP','DEL']]),1)
print est1[0]
print '-----'
print est1[1]
print
write_table('results/ols_results.tex', est, labels=('WW',['const','FMO','QSTOP','DEL']))

[[ 1.00000000e+00 -2.76235511e+00  4.13934513e-01 -1.32301552e-02
  -1.74214796e-04  9.79755345e-02]]
-----
[[0.00000000e+00 9.78463856e-03 1.01459100e-03 2.23149535e-05
  3.37867128e-05]]

