In [1]:
import pandas as pd
import math
import numpy as np
import statsmodels.api as sm
from statsmodels.base.model import GenericLikelihoodModel
import matplotlib.pyplot as plt
from scipy import stats

In [2]:
#generate some fake data

n = 1000
beta01, beta11 = 5,-3
beta02, beta12 = 2, 4

#set up regression mixture
x1 = np.random.uniform(0, 10, size=400)
x2 = np.random.uniform(0, 2, size=600)

y1 = beta01 + beta11*x1 + np.random.normal(size=400)
y2 = beta02 + beta12*x2 + np.random.normal(size=600)

x = np.concatenate([x1, x2])
y = np.concatenate([y1, y2])


#set up 2 component mixture
a1 = np.random.normal(0, 1, size=500)
a2 = np.random.normal(5, 3, size=500)
a = np.concatenate([a1,a2])

In [21]:
class Clusters(GenericLikelihoodModel):
    
    def __init__(self, endog, exog, ncomp=2):
        super(Clusters, self).__init__(endog, exog)
        
        nobs, k = self.exog.shape
        self.ncomp = ncomp
        self.nparams = k*ncomp + 2
        self.weights = np.ones(ncomp)/(1.*ncomp)

        param_names = []
        for comp in range(ncomp):
            for name in self.data.xnames:
                param_names.append(name+str(comp))
            param_names.append('sigma'+str(comp))
        self.data.xnames = param_names
        
    
    def nloglikeobs(self, params, v=False):
        """do maximum likelihood estimation"""
        nobs, k = self.exog.shape
        comp_likes = []
        for comp in range(self.ncomp):
            comp_params = params[comp*(k+1): (comp+1)*(k + 1)]
            beta = comp_params[:-1]
            sigma = comp_params[-1]
            
            beta = np.tile(beta,nobs).reshape(nobs, k)
            resid = self.endog - (beta*self.exog).sum(axis=1)
            like = stats.norm.pdf(resid, loc=0, scale=sigma)
            comp_likes.append(like)
            
            
        weights = np.tile(self.weights, nobs).reshape(nobs, self.ncomp)
        comp_likes = weights*np.array(comp_likes).transpose()
        
        return -np.log(comp_likes.sum(axis=1)).sum()
    
    
    def fit(self, start_params=None, maxiter=1000, maxfun=5000, **kwds):
        """print that we did it"""
        tol = 1e-7
        nobs, k = self.exog.shape
        
        #loop variables
        maxiter = 50
        diff = 1
        model = None
        
        while diff > tol and maxiter >=0 :

            start = np.array([1.]*len(self.data.xnames))
            model = super(Clusters, self).fit(disp=False, start_params= start)
            weights = []
            
            #recompute weights
            for comp in range(self.ncomp):
                comp_params = model.params[comp*(k+1): (comp+1)*(k + 1)]
                beta = comp_params[:-1]
                sigma = comp_params[-1]
                
                beta = np.tile(beta,nobs).reshape(nobs, k)
                resid = self.endog - (beta*self.exog).sum(axis=1)
                weights.append( stats.norm.pdf(resid, loc=0, scale=sigma) )
            
            #update loop variables
            weights = np.array(weights)
            denom = np.tile(weights.sum(axis =0),(self.ncomp,1))
            weights = weights/denom
            weights = weights.mean(axis=1)

            #this is hacky
            diff = np.sort(weights)- np.sort(self.weights)
            diff = np.abs(diff).mean()
            
            maxiter = maxiter-1
            self.weights = weights
        
        if maxiter > 0:
            print '[EM convergence achieved] \n'
            print'========================================================'
        
        print 'weights: %s'%self.weights
        print'========================================================\n'
        
        model = super(Clusters, self).fit(start_params= start)
        return model
        
#test case #1 - easy 2 component mixture
model = Clusters(a, np.ones( (a.shape[0],1)))
result = model.fit()

print result.summary()

weights: [0.49368325 0.50631675]

Optimization terminated successfully.
         Current function value: 2.432576
         Iterations: 200
         Function evaluations: 340
                               Clusters Results                               
Dep. Variable:                      y   Log-Likelihood:                -2432.6
Model:                       Clusters   AIC:                             4867.
Method:            Maximum Likelihood   BIC:                             4872.
Date:                Mon, 22 Apr 2019                                         
Time:                        12:39:33                                         
No. Observations:                1000                                         
Df Residuals:                     999                                         
Df Model:                           0                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------

In [5]:
#load data
data = pd.read_csv('data/milk_join.csv')
data.set_index('ROW')

print data.columns

Index([u'ROW', u'SYSTEM', u'COUNTY', u'MRKTCODE', u'VENDOR', u'MONTH', u'DAY',
       u'YEAR', u'LFC', u'LFW', u'WW', u'WC', u'QLFC', u'QLFW', u'QWW', u'QWC',
       u'ESTQTY', u'QUANTITY', u'FMOZONE', u'DEL', u'ESC', u'COOLER', u'MILES',
       u'WIN', u'NUMSCHL', u'NUMWIN', u'I', u'GAS', u'FMO', u'N', u'BACKLOG'],
      dtype='object')


In [13]:
reg1 = data[['WW','FMO']].dropna()
reg1 = reg1[reg1['WW']<.4]
reg1 = np.log(reg1)

model = Clusters( reg1['WW'] , np.ones((reg1.shape[0],1)) )
model.weights = np.array([.3,.7])
result = model.fit()
print result.summary()

ValueError: cannot reshape array of size 0 into shape (3925,1)

In [5]:
#ESC, COOLER ,MILES, GAS, DEL, ESTQTY
reg1 = data[['WW', 'FMO','ESC', 'NUMSCHL','MILES','COOLER', 'GAS','DEL','ESTQTY']].dropna()

#make a little better
reg1['NUMSCHL'] = np.maximum(reg1['NUMSCHL'],1)
reg1['NOSTOP'] = reg1['NUMSCHL']*reg1['DEL']
reg1['MILES'] =reg1['MILES']+1
reg1 = reg1[reg1['WW']<=.3]

#log scale
reg1[['WW', 'FMO', 'MILES', 'GAS','NOSTOP', 'ESTQTY']] = np.log(
                                            reg1[['WW', 'FMO', 'MILES', 'GAS', 'NOSTOP', 'ESTQTY']])

print reg1.mean()

WW         -1.768216
FMO         2.672613
ESC         0.737441
NUMSCHL    12.544541
MILES       3.456749
COOLER      0.736772
GAS         2.864354
DEL         4.262827
ESTQTY     12.857605
NOSTOP      3.346037
dtype: float64


In [8]:
model = sm.OLS(reg1['WW'],sm.add_constant(reg1[['ESC', 'COOLER', 'FMO','NOSTOP','ESTQTY']]))
result = model.fit()
print result.summary()

                            OLS Regression Results                            
Dep. Variable:                     WW   R-squared:                       0.051
Model:                            OLS   Adj. R-squared:                  0.048
Method:                 Least Squares   F-statistic:                     16.07
Date:                Sun, 21 Apr 2019   Prob (F-statistic):           1.85e-15
Time:                        18:26:34   Log-Likelihood:                 1957.9
No. Observations:                1493   AIC:                            -3904.
Df Residuals:                    1487   BIC:                            -3872.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -2.4310      0.097    -25.003      0.0

In [111]:
#working on clusters

model = Clusters(reg1['WW'],sm.add_constant(reg1[['ESC', 'FMO']]))
result = model.fit()
print result.summary()



[EM convergence achieved] 

weights: [0.4997528 0.5002472]

Optimization terminated successfully.
         Current function value: 0.921971
         Iterations: 390
         Function evaluations: 619
                               Clusters Results                               
Dep. Variable:                     WW   Log-Likelihood:                -1376.5
Model:                       Clusters   AIC:                             2759.
Method:            Maximum Likelihood   BIC:                             2775.
Date:                Sun, 21 Apr 2019                                         
Time:                        18:25:38                                         
No. Observations:                1493                                         
Df Residuals:                    1490                                         
Df Model:                           2                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
----------

