In [13]:
import pandas as pd
import math
import numpy as np
import statsmodels.api as sm
from statsmodels.base.model import GenericLikelihoodModel
import matplotlib.pyplot as plt
from scipy import stats

from statsmodels.tsa.regime_switching.markov_autoregression import MarkovAutoregression
from statsmodels.tsa.regime_switching.markov_regression import MarkovRegression

In [3]:
#generate some fake data
# Every draw goes through one privately seeded generator, so a fresh
# "Restart & Run All" reproduces the same sample and the global numpy RNG
# state is left untouched.
rng = np.random.RandomState(0)

n = 1000
# component sizes are derived from n instead of being repeated as literals
n_small = 400
n_large = n - n_small

# (intercept, slope) of the two regression lines
beta01, beta11 = 5,-3
beta02, beta12 = 2, 4

#set up regression mixture
# the first n_small observations follow line 1, the remaining n_large follow line 2
x1 = rng.uniform(0, 10, size=n_small)
x2 = rng.uniform(0, 10, size=n_large)

# the two components also differ in noise level (sd 2 vs sd 4)
y1 = beta01 + beta11*x1 + rng.normal(scale=2.0, size=n_small)
y2 = beta02 + beta12*x2 + rng.normal(scale=4.0, size=n_large)

x = np.concatenate([x1, x2])
y = np.concatenate([y1, y2])


#set up 2 component mixture
# plain (no regressor) mixture: n_large draws from N(0, 1) and n_small from N(5, 3)
a1 = rng.normal(0, 1, size=n_large)
a2 = rng.normal(5, 3, size=n_small)
a = np.concatenate([a1,a2])

In [15]:
# Two-regime Markov switching regression on the simulated mixture.
# The intercept enters through add_constant(x), so the model's own trend term
# is switched off with trend='nc'; coefficients and variance are both allowed
# to differ between regimes.
model = MarkovRegression(endog=y, exog=sm.add_constant(x), k_regimes=2, order=0,
                             trend='nc',switching_exog=True, switching_variance=True)
result = model.fit()
# single-argument call form prints the same text on Python 2 and Python 3
print(result.summary())

                        Markov Switching Model Results                        
Dep. Variable:                      y   No. Observations:                 1000
Model:               MarkovRegression   Log Likelihood               -2555.971
Date:                Wed, 24 Apr 2019   AIC                           5127.943
Time:                        08:49:53   BIC                           5167.205
Sample:                             0   HQIC                          5142.865
                               - 1000                                         
Covariance Type:               approx                                         
                             Regime 0 parameters                              
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          5.0242      0.197     25.444      0.000       4.637       5.411
x1            -3.0143      0.034    -89.041      0.0

In [None]:
class Clusters(GenericLikelihoodModel):
    """Mixture of ``ncomp`` Gaussian linear regressions fitted by an EM-style loop.

    The parameter vector is laid out component by component: the ``k``
    regression coefficients of component 0, then its ``sigma``, then the same
    for component 1, and so on -- ``(k + 1) * ncomp`` entries in total.

    ``self.weights`` holds one row per observation and one column per
    component.  ``nloglikeobs`` uses these as fixed weights on the
    per-component Gaussian log-densities, and ``fit`` alternates between
    maximising that weighted likelihood and recomputing the weights.
    """

    def __init__(self, endog, exog, ncomp=2, switch_var=True):
        """Set up the model and draw random starting weights.

        Parameters
        ----------
        endog : array_like
            Response, one value per observation.
        exog : array_like
            Regressors, shape ``(nobs, k)``.
        ncomp : int
            Number of mixture components.
        switch_var : bool
            Accepted but not used by this class.
        """
        super(Clusters, self).__init__(endog, exog)

        nobs, k = self.exog.shape
        self.ncomp = ncomp
        # k coefficients plus one sigma for every component (the previous
        # k*ncomp + 2 was only correct when ncomp == 2)
        self.nparams = (k + 1) * ncomp

        #random start
        # A private RandomState(0) yields the same numbers as seeding the
        # global generator with 0, without resetting the caller's RNG state.
        rng = np.random.RandomState(0)
        weights = rng.uniform(size=(nobs, ncomp))
        # normalise each row so that the weights of one observation sum to 1
        self.weights = weights / weights.sum(axis=1, keepdims=True)

        #adjust param names
        # e.g. const0, x10, sigma0, const1, x11, sigma1
        param_names = []
        for comp in range(ncomp):
            for name in self.data.xnames:
                param_names.append(name+str(comp))
            param_names.append('sigma'+str(comp))
        self.data.xnames = param_names

    def _component_terms(self, params):
        """Split ``params`` by component; return a list of ``(residuals, sigma)``."""
        nobs, k = self.exog.shape
        # plain ndarray: positional slicing must not depend on a labelled index
        params = np.asarray(params)
        terms = []
        for comp in range(self.ncomp):
            comp_params = params[comp*(k + 1): (comp + 1)*(k + 1)]
            beta = comp_params[:-1]
            sigma = comp_params[-1]
            # row-wise x_i . beta
            means = (self.exog*beta).sum(axis=1)
            terms.append((self.endog - means, sigma))
        return terms

    def nloglikeobs(self, params, v=False):
        """Negative weighted log-likelihood at ``params``.

        For every component the Gaussian log-density of the residuals is
        evaluated, multiplied by ``self.weights`` and summed.  Despite the
        per-observation name, the value returned is a single scalar (summed
        over observations and components).  ``v`` is accepted but not used.

        NOTE(review): ``sigma`` is taken straight from ``params`` and is not
        constrained to be positive here.
        """
        comp_likes = [stats.norm.logpdf(resid, loc=0, scale=sigma)
                      for resid, sigma in self._component_terms(params)]

        # (nobs, ncomp), matching the layout of self.weights
        comp_likes = np.array(comp_likes).transpose()
        comp_likes = self.weights*comp_likes

        return -comp_likes.sum().sum()

    def fit(self, start_params=None, maxiter=1000, maxfun=5000, **kwds):
        """Run the EM-style loop and return the last inner fit result.

        Each round maximises the weighted likelihood with Nelder-Mead
        (200 iterations, restarted from the same starting values), then
        recomputes the weights from the normalised per-component Gaussian
        densities.  The loop stops when the mean absolute change in the
        row-sorted weights is at most ``1e-8`` or after 26 rounds.

        Parameters
        ----------
        start_params : array_like, optional
            Starting values for the inner optimiser.  When omitted, an evenly
            spaced grid from 1 to 5 is used.
        maxiter, maxfun, **kwds
            Accepted for signature compatibility; the round and iteration
            limits described above are fixed and these values are not used.

        Returns
        -------
        The results object of the last inner ``GenericLikelihoodModel.fit``.
        """
        tol = 1e-8
        nobs, k = self.exog.shape

        if start_params is None:
            start = np.linspace(1., 5., len(self.data.xnames))
        else:
            start = np.asarray(start_params, dtype=float)

        # Callers may assign a single row of weights (shape (ncomp,)); expand
        # it to one row per observation so the row-wise sort below is defined.
        self.weights = np.array(np.broadcast_to(
            np.asarray(self.weights, dtype=float), (nobs, self.ncomp)))

        #loop variables
        rounds_left = 25
        diff = 1
        model = None

        while diff > tol and rounds_left >= 0:

            model = super(Clusters, self).fit(disp=False, start_params=start,
                                             method='nm', maxiter=200, full_output=True,  retall=True)

            #recompute weights
            weights = [stats.norm.pdf(resid, loc=0, scale=sigma)
                       for resid, sigma in self._component_terms(model.params)]

            #update loop variables
            weights = np.array(weights).transpose()
            weights = weights/weights.sum(axis=1, keepdims=True)

            #this is hacky
            # rows are sorted first so that a mere relabelling of the
            # components does not count as a change
            diff = np.sort(weights, axis=1) - np.sort(self.weights, axis=1)
            diff = np.abs(diff).mean()

            rounds_left = rounds_left - 1
            self.weights = weights

        # Report convergence from the criterion itself, not from the round
        # counter: a NaN diff also ends the loop and must not be announced as
        # converged, while convergence in the final rounds must be.
        if diff <= tol:
            print('[EM convergence achieved] \n')
            print('========================================================')

        print('weights: %s' % (self.weights.mean(axis=0),))
        print('========================================================\n')

        return model
 
        
#test case #1 - easy 2 component mixture
# fit the mixture on the simulated (x, y) sample with an added intercept column
model = Clusters(y, sm.add_constant(x))

result = model.fit()
# single-argument call form prints the same text on Python 2 and Python 3
print(result.summary())
# optimiser diagnostics recorded by the last inner fit
print(result.mle_retvals)

In [134]:
# Inspect the optimiser diagnostics attached to the last fit.
print(dir(result.mle_retvals))
# dict.viewvalues() exists only on Python 2; on Python 3 .values() already
# returns the equivalent view, so fall back to it when viewvalues is missing.
print(getattr(result.mle_retvals, 'viewvalues', result.mle_retvals.values)())

['__class__', '__cmp__', '__contains__', '__delattr__', '__delitem__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__iter__', '__le__', '__len__', '__lt__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setitem__', '__sizeof__', '__str__', '__subclasshook__', 'clear', 'copy', 'fromkeys', 'get', 'has_key', 'items', 'iteritems', 'iterkeys', 'itervalues', 'keys', 'pop', 'popitem', 'setdefault', 'update', 'values', 'viewitems', 'viewkeys', 'viewvalues']
dict_values([False, 2, 318, 2.6198274605307135, 200])


In [12]:
#load data
# path is relative to the notebook's working directory
data = pd.read_csv('data/milk.csv')
# single-argument call form prints the same text on Python 2 and Python 3
print(data.columns)

Index([u'VENDOR', u'WW', u'WC', u'LFW', u'LFC', u'SYSTEM', u'YEAR', u'MONTH',
       u'DAY', u'FMOZONE', u'ESC', u'COOLER', u'QLFC', u'QLFW', u'QWW', u'QWC',
       u'ESTQTY', u'DEL', u'MILES', u'NUMSCHL', u'NUMWIN', u'POPUL', u'ADJPOP',
       u'NUM', u'GAS', u'FMO'],
      dtype='object')


In [None]:
# Intercept-only two-component mixture on log(WW), restricted to WW < 0.4.
reg1 = data[['WW','FMO']].dropna()
reg1 = reg1[reg1['WW']<.4]
reg1 = np.log(reg1)

model = Clusters( reg1['WW'] , np.ones((reg1.shape[0],1)) )
# Starting weights need one row per observation: Clusters.fit sorts them
# along axis 1, which fails for a bare 1-D [.3, .7] vector.
model.weights = np.tile([.3, .7], (reg1.shape[0], 1))
result = model.fit()
print(result.summary())

In [5]:
#ESC, COOLER ,MILES, GAS, DEL, ESTQTY
reg1 = data[['WW', 'FMO','ESC', 'NUMSCHL','MILES','COOLER', 'GAS','DEL','ESTQTY']].dropna()

#make a little better
# NUMSCHL is floored at 1 and MILES shifted by 1 -- presumably so that the
# logs of NOSTOP and MILES taken below stay finite (TODO confirm)
reg1['NUMSCHL'] = np.maximum(reg1['NUMSCHL'],1)
reg1['NOSTOP'] = reg1['NUMSCHL']*reg1['DEL']
reg1['MILES'] =reg1['MILES']+1
# .copy() makes the filtered frame independent, so the column assignment
# below is not a chained assignment on a slice (SettingWithCopyWarning)
reg1 = reg1[reg1['WW']<=.3].copy()

#log scale
# one list, so the selected and the assigned columns cannot drift apart
log_cols = ['WW', 'FMO', 'MILES', 'GAS', 'NOSTOP', 'ESTQTY']
reg1[log_cols] = np.log(reg1[log_cols])

print(reg1.mean())

WW         -1.768216
FMO         2.672613
ESC         0.737441
NUMSCHL    12.544541
MILES       3.456749
COOLER      0.736772
GAS         2.864354
DEL         4.262827
ESTQTY     12.857605
NOSTOP      3.346037
dtype: float64


In [8]:
# Single-regime OLS baseline for log(WW) on the prepared regressors plus an intercept.
model = sm.OLS(reg1['WW'],sm.add_constant(reg1[['ESC', 'COOLER', 'FMO','NOSTOP','ESTQTY']]))
result = model.fit()
# single-argument call form prints the same text on Python 2 and Python 3
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                     WW   R-squared:                       0.051
Model:                            OLS   Adj. R-squared:                  0.048
Method:                 Least Squares   F-statistic:                     16.07
Date:                Sun, 21 Apr 2019   Prob (F-statistic):           1.85e-15
Time:                        18:26:34   Log-Likelihood:                 1957.9
No. Observations:                1493   AIC:                            -3904.
Df Residuals:                    1487   BIC:                            -3872.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -2.4310      0.097    -25.003      0.0

In [111]:
#working on clusters
# two-component mixture regression of log(WW) on ESC and log(FMO) plus an intercept

model = Clusters(reg1['WW'],sm.add_constant(reg1[['ESC', 'FMO']]))
result = model.fit()
# single-argument call form prints the same text on Python 2 and Python 3
print(result.summary())



[EM convergence achieved] 

weights: [0.4997528 0.5002472]

Optimization terminated successfully.
         Current function value: 0.921971
         Iterations: 390
         Function evaluations: 619
                               Clusters Results                               
Dep. Variable:                     WW   Log-Likelihood:                -1376.5
Model:                       Clusters   AIC:                             2759.
Method:            Maximum Likelihood   BIC:                             2775.
Date:                Sun, 21 Apr 2019                                         
Time:                        18:25:38                                         
No. Observations:                1493                                         
Df Residuals:                    1490                                         
Df Model:                           2                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
----------

