In [3]:
import pandas as pd
import math
import numpy as np
import statsmodels.api as sm
from statsmodels.base.model import GenericLikelihoodModel
import matplotlib.pyplot as plt
from scipy import stats

In [4]:
n = 1000
beta01, beta11 = 5,-3
beta02, beta12 = 2, 4

#set up regression mixture
x1 = np.random.uniform(0, 10, size=400)
x2 = np.random.uniform(0, 10, size=600)

y1 = beta01 + beta11*x1 + np.random.normal(scale=2.0, size=400)
y2 = beta02 + beta12*x2 + np.random.normal(scale=4.0,size=600)

X = np.concatenate([x1, x2])
Y = np.concatenate([y1, y2])


#set up 2 component mixture
a1 = np.random.normal(2, 5, size=600)
a2 = np.random.normal(5, 3, size=400)
a = np.concatenate([a1,a2])

In [63]:
def e_step(y,x,params): 
    nobs, k = x.shape
    weights = []
    
    for beta in params:
        sigma = beta[-1]
        beta = np.tile(beta[1:-1],nobs).reshape(nobs, k)
        mean = (beta*x).sum(axis=1)
        weights.append( stats.norm.pdf(y, loc=mean, scale=sigma) )

    #update loop variables
    weights = np.array(weights).transpose()
    denom = np.repeat(weights.sum(axis= 1),len(params)).reshape(nobs,len(params))
    weights = (weights/denom)
    return weights
        
    
def m_step(y,x,weights):
    nobs, k = x.shape
    params = []
    for w in weights.transpose():
        lamb = w.mean()
        
        #weighted OLS estimator
        w_mat = np.diag(w)
        beta = np.linalg.inv(x.transpose().dot( w_mat).dot(x)).dot(x.transpose().dot( w_mat )).dot(y)

        
        mu = np.tile(beta, nobs).reshape(nobs, k)*x
        sigma = w*(y - mu.sum(axis=1))
        sigma = sigma.std()
        
        comp_param =np.concatenate(([lamb],beta,[sigma]))
        params.append(comp_param)
        
    return np.array(params)


def gen_weights(nobs,ncomp):
    np.random.seed(0)
    weights = np.random.uniform(size=(nobs,ncomp))
    denom = np.repeat(weights.sum(axis= 1),ncomp).reshape(nobs,ncomp)
    return (weights/denom)


def estimate(y,x,ncomp):
    e = gen_weights(len(x),ncomp)
    m = None
    for i in range(10):
        m = m_step(y,x,e)
        e = e_step(y,x,m)
    return m


def bootstrap(y,x,ncomp):
    nobs, k = x.shape
    bootstr= []
    for i in range(50):
        #draw subsample
        subn = 300
        sample = np.random.choice(np.arange(0,nobs), subn , replace=True)
        ys = y[sample]
        xs = x[sample]
            
  
        #compute bands
        boot_model = estimate(ys,xs,ncomp)
        bootstr.append(boot_model)
        
    bootstr = np.array(bootstr)
    return bootstr.std(axis=0)


def print_result(y,x,ncomp):
    p = estimate(y,x,ncomp)
    se = bootstrap(y,x,ncomp)
    for i in range(ncomp):
        
        print 'lambda: %s (%s) \nbeta: %s (%s) \nsigma: %s (%s)'%(np.round(p[i][0],3), np.round(se[i][0],3),
                                                                np.round(p[i][1:-1],3), np.round(se[i][1:-1],3),
                                                                np.round(p[i][-1],3), np.round(se[i][-1],3))
        print '========================'
    

print_result(Y,sm.add_constant(X),2)

lambda: 0.409 (0.075) 
beta: [ 5.522 -3.121] ([0.381 0.581]) 
sigma: 1.274 (2.391)
lambda: 0.591 (0.075) 
beta: [2.169 3.964] ([0.501 0.022]) 
sigma: 2.964 (0.408)


In [8]:
data = pd.read_csv('data/milk.csv')
print data.columns

reg1 = data[['WW','FMO']].dropna()
reg1 = reg1[reg1['WW']<.4]

reg1[['WW','FMO']] = np.log(reg1[['WW','FMO']])
params = estimate(np.array(reg1['WW']),np.array(sm.add_constant(reg1['FMO'])),2)
print params

Index([u'VENDOR', u'WW', u'WC', u'LFW', u'LFC', u'WIN', u'SYSTEM', u'YEAR',
       u'MONTH', u'DAY', u'FMOZONE', u'ESC', u'COOLER', u'QLFC', u'QLFW',
       u'QWW', u'QWC', u'ESTQTY', u'DEL', u'MILES', u'NUMSCHL', u'NUMWIN',
       u'POPUL', u'ADJPOP', u'NUM', u'GAS', u'FMO'],
      dtype='object')
[[ 9.44093582e-01 -2.67325310e+00  3.31222719e-01  9.20328698e-02]
 [ 5.59064176e-02 -1.77195684e+00  6.64801547e-13  2.36606456e-13]]
