Implement Hidden Markov Model with categorical observations.
Use Expectation-Maximization to find the parameters.

The expectation step uses forward-backward algorithm

Notations follow Chapter 17, MLaPP, Kevin Murphy

Example: use historical `bullish to bearish ratio` obtained from American Association of Individual Investors website as an example data set


Formula used:
    
Forward:
$
\mathbf{\alpha}_t \propto \mathbf{\phi}_t \odot (A^T \mathbf{\alpha}_{t-1})  \hspace{500pt}
$

Backward:
$
\mathbf{\beta}_t = A (\mathbf{\phi}_{t+1} \odot \mathbf{\beta}_{t+1})   \hspace{500pt}
$

Forward-Backward:

1) $\gamma_t(j) \equiv P(z_t=j | x_{1:T})$  \hsapce{500pt}

$
\mathbf{\gamma}_t \propto \alpha_t \odot \beta_t   \hspace{500pt}
$
    
2) two-slice marginal

$
\xi_{t,t+1} \propto A \odot \big(\alpha_t ( \phi_{t+1} \odot \beta_{t+1})^T \big)   \hspace{500pt}
$


In [1]:
import numpy as np
import pandas as pd

In [75]:
class HMM(object):
    
    def __init__(self, obs, num_states):
        self.obs = obs
        self.K = num_states
        self.m = obs.max() + 1
        
    def normalize(self, p):
        norm = np.sum(p)
        p = p/norm
        return p, norm
    
    def forward(self, pi, A, B, obs):
        # pi: lenght-K, prior distribution over K states
        # A: KxK transition matrix, A[i, j] is the transition probability from state i to state j
        # B: mxK, each column is a state specific distribution over observations
        # obs: one given observation        
        K = A.shape[0]
        T = obs.size
        alpha = np.zeros((K, T))
        norms = np.zeros(T)
        
        alpha[:, 0] = B[obs[0], :] * pi
        alpha[:, 0], norms[0] = self.normalize(alpha[:, 0])
        for t in range(1, T):
            alpha[:, t] = B[obs[t], :] * (A.T.dot(alpha[:, t-1]))
            alpha[:, t], norms[t] = self.normalize(alpha[:, t])
        return alpha, norms
    
    def backward(self, pi, A, B, obs, alpha, norms):
        # pi: lenght-K, prior distribution over K states
        # A: KxK transition matrix, A[i, j] is the transition probability from state i to state j
        # B: mxK, each column is a state specific distribution over observations
        # obs: one given observation        
        # alpha: from the forward algorithm        
        K = A.shape[0]
        T = obs.size
        beta = np.zeros((K, T))
        beta[:, T-1] = 1
        beta[:, T-1] = beta[:, T-1] / norms[T-1]
        for t in range(T-2, -1, -1):
            beta[:, t] = A.dot(B[obs[t+1], : ] * beta[:, t+1])
            beta[:, t] = beta[:, t] / norms[t]
            # beta[:, t] is normalized with the same factor that normalized alpha[:, t], this
            #  ensures that gamma[:, t] is normalized
        return beta
    
    def forward_backward(self, A, B, obs, alpha, beta):
        # A: KxK transition matrix, A[i, j] is the transition probability from state i to state j
        # B: mxK, each column is a state specific distribution over observations
        # obs: one given observation
        # alpha: from the forward algorithm        
        # beta: from the backward algorithm
        K = A.shape[0]
        T = obs.size
        gamma = np.zeros((K, T))
        for t in range(T):
            gamma[:, t] = alpha[:, t] * beta[:, t]
            gamma[:, t], _ = self.normalize(gamma[:, t])
        
        Xi = np.zeros((K, K))
        for t in range(T-1):
            tmp = A * np.outer(alpha[:, t], B[obs[t+1], :] * beta[:, t+1])
            tmp = tmp/np.sum(tmp)
            Xi += tmp
        
        return gamma, Xi
    
    def expectation(self, pi, A, B, obs):
        alpha, norms = self.forward(pi, A, B, obs)
        beta = self.backward(pi, A, B, obs, alpha, norms)
        gamma, Xi = self.forward_backward(A, B, obs, alpha, beta)
        return alpha, norms, beta, gamma, Xi
    
    def maximization(self, gamma, Xi, obs):
        K = self.K
        m = self.m
        T = obs.size
        
        pi = gamma[:, 0].copy()
        
        A = Xi.copy()
        for i in range(K):
            A[i, :], _ = self.normalize(A[i, :])
            
        B = np.zeros((m, K))
        for l in range(m):
            for t in range(T):
                if obs[t] == l:
                    B[l, :] += gamma[:, t]
        for j in range(K):
            B[:, j], _ = self.normalize(B[:, j])
        
        return pi, A, B
    
    def train_EM(self, n_iters=1000):
        
        printing_frequency = n_iters // 10
        
        # initialization
        K = self.K
        m = self.m
        obs = self.obs
        T = obs.size
        
        # Initialization
        pi = np.random.random(K)
        pi, _ = self.normalize(pi)
        
        A = np.random.random((K, K))
        for j in range(K):
            A[j, :], _ = self.normalize(A[j, :])

        B = np.random.random((m, K))
        for j in range(K):
            B[:, j], _ = self.normalize(B[:, j])
            
        # EM iterations    
        for counter in range(n_iters):            
            alpha, norms, beta, gamma, Xi = self.expectation(pi, A, B, obs)
            pi, A, B = self.maximization(gamma, Xi, obs)
            if counter % printing_frequency == 0:
                cost_function = -np.sum(np.log(norms))
                print('iteration: {0:}, cost function: {1:}'.format(counter, cost_function))
        return pi, A, B
            

In [2]:
# http://www.aaii.com/sentimentsurvey/sent_results
data = pd.read_csv('AAII_raw.csv', header=0)

data['ratio'] = data.eval('Bullish / Bearish')
obs_max = data['ratio'].max()
obs_min = data['ratio'].min()
obs_delta = obs_max - obs_min

data['scaled_obs'] = data['ratio'].apply(lambda x: (x - obs_min)/obs_delta)
buckets = np.linspace(0, 1, 6)
data['digitized_obs'] = np.digitize(data['scaled_obs'], buckets) - 1

In [3]:
data.head()

Unnamed: 0,Date,Bullish,Neutral,Bearish,ratio,scaled_obs,digitized_obs
0,7-24-87,0.4,0.5,0.1,4.0,0.490909,2
1,7-31-87,0.3,0.5,0.3,1.0,0.109091,0
2,8-7-87,0.6,0.2,0.3,2.0,0.236364,1
3,8-14-87,0.5,0.4,0.2,2.5,0.3,1
4,8-21-87,0.7,0.3,0.1,7.0,0.872727,4


In [14]:
obs = data['digitized_obs'].values

In [76]:
hmm = HMM(obs, 5)
pi, A, B = hmm.train_EM(1000)

iteration: 0, cost function: 2844.8770392144806
iteration: 100, cost function: 922.5006993878073
iteration: 200, cost function: 921.9363268923867
iteration: 300, cost function: 921.9220978905303
iteration: 400, cost function: 921.9094239988787
iteration: 500, cost function: 921.894498857609
iteration: 600, cost function: 921.8751773086403
iteration: 700, cost function: 921.8488170020198
iteration: 800, cost function: 921.8111983590127
iteration: 900, cost function: 921.7537610625726


In [77]:
pi

array([ 0.,  0.,  0.,  1.,  0.])

In [78]:
for i in range(A.shape[0]):
    row = ''
    for e in A[i, :]:
        e_str =  str(round(e, 4))
        row += e_str + '   '
    print(row)
    

0.7431   0.0292   0.2277   0.0   0.0   
0.1848   0.7334   0.0019   0.0   0.0799   
0.5936   0.3478   0.0586   0.0   0.0   
0.0   0.0038   0.0797   0.0   0.9164   
0.0   0.1623   0.0   0.8106   0.0271   


In [79]:
for i in range(B.shape[0]):
    row = ''
    for e in B[i, :]:
        e_str =  str(round(e, 4))
        row += e_str + '   '
    print(row)
    

0.9867   0.4484   1.0   0.1572   0.2152   
0.0105   0.5516   0.0   0.5257   0.6973   
0.0   0.0   0.0   0.1102   0.0   
0.0028   0.0   0.0   0.1672   0.0645   
0.0   0.0   0.0   0.0227   0.023   
0.0   0.0   0.0   0.017   0.0   
