<a href="https://colab.research.google.com/github/cicattzo/mit_advanced_nlp/blob/main/HA1_P3_NLP_MIT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
%%bash
!(stat -t /usr/local/lib/*/dist-packages/google/colab > /dev/null 2>&1) && exit 
rm -rf 6864-hw1
git clone https://github.com/mit-6864/hw1.git

Cloning into 'hw1'...


In [3]:
import sys
sys.path.append("/content/hw1")

import csv
import itertools as it
import numpy as np
np.random.seed(0)

import torch
from scipy.special import logsumexp
import scipy

import lab_util

## Hidden Markov Models

In the remaining part of the lab (containing part 3) you'll use the Baum--Welch algorithm to learn _categorical_ representations of words in your vocabulary. Answers to questions in this lab should go in the same report as the initial release.

As before, we'll start by loading up a dataset:

In [4]:
data = []
n_positive = 0
n_disp = 0
with open("/content/hw1/reviews.csv") as reader:
  csvreader = csv.reader(reader)
  next(csvreader)
  for id, review, label in csvreader:
    label = int(label)

    # hacky class balancing
    if label == 1:
      if n_positive == 2000:
        continue
      n_positive += 1
    if len(data) == 4000:
      break

    data.append((review, label))
    
    if n_disp > 5:
      continue
    n_disp += 1
    print("review:", review)
    print("rating:", label, "(good)" if label == 1 else "(bad)")
    print()

print(f"Read {len(data)} total reviews.")
np.random.shuffle(data)
reviews, labels = zip(*data)
train_reviews = reviews[:3000]
train_labels = labels[:3000]
val_reviews = reviews[3000:3500]
val_labels = labels[3000:3500]
test_reviews = reviews[3500:]
test_labels = labels[3500:]

review: I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.
rating: 1 (good)

review: Product arrived labeled as Jumbo Salted Peanuts...the peanuts were actually small sized unsalted. Not sure if this was an error or if the vendor intended to represent the product as "Jumbo".
rating: 0 (bad)

review: This is a confection that has been around a few centuries.  It is a light, pillowy citrus gelatin with nuts - in this case Filberts. And it is cut into tiny squares and then liberally coated with powdered sugar.  And it is a tiny mouthful of heaven.  Not too chewy, and very flavorful.  I highly recommend this yummy treat.  If you are familiar with the story of C.S. Lewis' "The Lion, The Witch, and The Wardrobe" - this is the treat that seduces Edmund into selling out his Brother an

Next, implement the forward--backward algorithm for HMMs like we saw in class.

**IMPORTANT NOTE**: if you directly multiply probabilities as shown on the class slides, you'll get underflow errors. You'll probably want to work in the log domain (remember that `log(ab) = log(a) + log(b)`, `log(exp(a) + exp(b)) = logaddexp(a, b)`). In general, we recommend either `np.logaddexp` or `scipy.special.logsumexp` as safe ways to compute the necessary quantities.

In [5]:
# hmm model
class HMM(object):
    def __init__(self, num_states, num_words):
        self.num_states = num_states
        self.num_words = num_words

        self.states = range(num_states)
        self.symbols = range(num_words)

        """
        Initialize the matrix A with random transition probabilities p(j|i)
        A should be a matrix of size `num_states x num_states` with rows that
        sum to 1.
        """
         # your code here
        self.A = np.random.rand(self.num_states,self.num_states)
        self.A = self.A/self.A.sum(axis=1)[:,None]
        # self.A = torch.from_numpy(self.A)
        """
        Initialize the matrix B with random emission probabilities p(o|i). B 
        should be a matrix of size `num_states x num_words` with rows that sum 
        to 1.
        """
       # your code here
        self.B = np.random.rand(self.num_states,self.num_words) 
        self.B = self.B/self.B.sum(axis=1)[:,None]
        # self.B = torch.from_numpy(self.B)
        """
        Initialize the vector pi with a random starting distribution. pi should
        be a vector of size `num_states` with entries that sum to 1.
        """
        # your code here
        self.pi = np.random.random(self.num_states)
        self.pi /= self.pi.sum()
        # self.pi = torch.from_numpy(self.pi)
        # self.pi = None ```


    def generate(self, n):
        """randomly sample the HMM to generate a sequence.
        """
        # we'll give you this one

        sequence = []
        # initialize the first state
        state = np.random.choice(self.states, p=self.pi)
        for i in range(n):
            # get the emission probs for this state
            b = self.B[state, :]
            # emit a word
            word = np.random.choice(self.symbols, p=b)
            sequence.append(word)
            # get the transition probs for this state
            a = self.A[state, :]
            # update the state
            state = np.random.choice(self.states, p=a)
        return sequence

    def forward(self, obs):
        """
        Runs the forward algorithm. This function should return a 
        `len(obs) x  num_states` matrix where the (t, i)th entry contains 
        log p(obs[:t], hidden_state_t = i)
        """

        # your code here!

        alpha = np.zeros((len(obs), self.num_states))

        for s in self.states:
          alpha[0][s] = np.log(self.pi[s])+np.log(self.B[s][obs[0]])

        for t in range(1, len(obs)):

          for s in self.states:

            temp = []

            for s_i in self.states:

              temp.append(alpha[t-1][s_i] + np.log(self.A[s_i][s]) + np.log(self.B[s][obs[t]]))

            temp = np.array(temp)

            # alpha[t][s] = sum((alpha[t-1][s_i] * self.A[s_i][s] * self.B[s][obs[t]]) for s_i in self.states)

            alpha[t][s] = logsumexp(temp)

        # alpha = np.log(alpha)

        return alpha

    def backward(self, obs):
        """
        Run the backward algorithm. This function should return a
        `len(obs) x num_states` matrix where the (t, i)th entry contains
        log p(obs[t+1:] | hidden_state_t = i)
        """

        beta = np.zeros((len(obs), self.num_states))

        # your code here!

        for s in self.states:
          beta[len(obs)-1][s] = 1.0

        for t in reversed(range(len(obs)-1)):

          for s in self.states:

            # temp = []

            # for s_i in self.states:
              
              # temp.append(np.log(beta[t+1][s_i]) + np.log(self.A[s][s_i]) + np.log(self.B[s_i][obs[t+1]]))

            # temp = np.array(temp)

            beta[t][s] = sum((beta[t+1][s_i] * self.A[s][s_i] * self.B[s_i][obs[t+1]]) for s_i in self.states)

            # beta[t][s] = logsumexp(temp)

        beta = np.log(beta)

        return beta
        
    def forward_backward(self, obs):
        """
        Compute forward-backward scores

        logprob is the total log-probability of the sequence obs (marginalizing
        over hidden states).

        gamma is a matrix of size `len(obs) x num_states1. It contains the
        marginal probability of being in state i at time t

        xi is a tensor of size `len(obs) x num_states x num_states`. It contains
        the marginal probability of transitioning from i to j at t.
        """
        #create forward and backward
        forward = self.forward(obs)
        forward_exp = np.exp(forward)
        backward = self.backward(obs)
        backward_exp = np.exp(backward)

        #calculate logprob
        logprob = logsumexp(np.array(forward[len(obs)-1]))
        #calculate gamma
        gamma = np.zeros((len(obs), self.num_states))

        xi = np.zeros((len(obs), self.num_states, self.num_states))

        for t in range(len(obs)):

          for s in self.states:

            gamma[t][s] = np.exp(forward[t][s]+backward[t][s]-logprob)

            if t == len(obs)-1:
              continue

            for s_i in self.states:

              xi[t][s][s_i] = np.exp(forward[t][s] + np.log(self.A[s][s_i]) + np.log(self.B[s_i][obs[t+1]]) + backward[t+1][s_i] - logprob)

        # your code here!

        return logprob, xi, gamma

        """
        SANITY CHECK

        The most straightforward way of implementing the forward, backward, and 
        forward_backward methods would be to iterate through all the values and 
        use the formulas in the slides to calculate the corresponding values.

        However, this may not be fast enough. If your model is taking too long
        to train, consider how you may speed up your code by reducing the number
        of for loops involved. How can you reformulate your code using matrix
        operations?

        Hint: we were able to implement each of the forward, backward, and
        forward_backward operations using only one for loop.
        """

    def learn_unsupervised(self, corpus, num_iters, print_every=10):
        """Run the Baum Welch EM algorithm
        
        corpus: the data to learn from
        num_iters: the number of iterations to run the algorithm
        print_every: how often to print the log-likelihood while the model is
        updating its parameters.
        """

        for i_iter in range(num_iters):
            


            # expected_si = None # your code here
            # expected_sij = None # your code here
            # expected_sjwk = None # your code here
            # expected_q1 = None # your code here

            A_new = np.zeros((self.num_states,self.num_states))
            B_new = np.zeros((self.num_states,self.num_words))
            pi_new = np.zeros(self.num_states)

            expected_si = np.zeros(self.num_states) 
            expected_sij = np.zeros((self.num_states, self.num_states)) 
            expected_sjwk = np.zeros((self.num_states, self.num_words)) 
            expected_q1 = np.zeros(self.num_states)
            expected_number_of_times_sj = np.zeros(self.num_states)

            total_logprob = 0
            
            for review in corpus:

                logprob, xi, gamma = self.forward_backward(review)

                total_logprob = total_logprob+logprob 


                for s_i in self.states:

                  expected_q1[s_i] += gamma[0, s_i]

                  expected_number_of_times_sj[s_i] += sum(gamma[t, s_i] for t in range(len(review)))

                  for s_j in self.states:

                    expected_sij[s_i,s_j] += sum(xi[t][s_i][s_j] for t in range(len(review)-1))

                  expected_si[s_i] += sum(expected_sij[s_i, s_x] for s_x in self.states)

                  for t in range(len(review)):

                    expected_sjwk[s_i, review[t]] += gamma[t,s_i]


                for s_i in self.states:

                  pi_new[s_i] = expected_q1[s_i]
                  
                  for s_j in self.states:
                    
                    A_new[s_i, s_j] = expected_sij[s_i,s_j] / expected_si[s_i]

                  for w in range(self.num_words):

                    B_new[s_i, w] = expected_sjwk[s_i, w] / expected_number_of_times_sj[s_i]
            
            A_row_sums = A_new.sum(axis=1)
            A_new = A_new / A_row_sums[:, np.newaxis]

            B_row_sums = B_new.sum(axis=1)
            B_new = B_new / B_row_sums[:, np.newaxis]

            pi_sum = sum(pi_new)

            pi_new = pi_new / pi_sum





            if i_iter % print_every == 0:
              print("log-likelihood", total_logprob)

            """
            The following variables should be the new values of self.A, self.B,
            and self.pi after the values are updated.
            """
            # A_new = None # your code here
            # B_new = None # your code here
            # pi_new = None # your code here

            self.A = A_new
            self.B = B_new
            self.pi = pi_new


In [6]:
def init_test():

    num_states = np.random.randint(100)
    num_words = np.random.randint(100)
    model = HMM(num_states, num_words)

    assert model.A.shape == (num_states, num_states)
    assert model.B.shape == (num_states, num_words)
    assert model.pi.shape == (num_states, )

    assert np.linalg.norm(np.sum(model.A, axis=1) - np.ones(num_states)) < 1e-10
    assert np.linalg.norm(np.sum(model.B, axis=1) - np.ones(num_states)) < 1e-10
    assert np.linalg.norm(np.sum(model.pi) - 1) < 1e-10

def forward_test():
    model = HMM(2, 10)
    model.A = np.array([[0.79034887, 0.20965113],
                        [0.66824331, 0.33175669]])
    model.B = np.array([[0.08511814, 0.06627238, 0.08487461, 0.15607959, 0.00124582, 0.12984083, 0.11164849, 0.11591902, 0.15232716, 0.09667395],
                        [0.18425462, 0.14326559, 0.14026994, 0.0215989,  0.17687124, 0.04681278, 0.05857451, 0.17451212, 0.00473382, 0.04910648]])
    model.pi = np.array([0.77480039, 0.22519961])
    obs = [1, 8, 0, 0, 3, 4, 5, 2, 6, 3, 7, 9]
    alpha = model.forward(obs)

    print("The result of the forward function should be", np.array([[-2.96913, -3.43382],
                                                                    [ -4.66005, -9.19418],
                                                                    [ -7.35001, -7.89695],
                                                                    [ -9.65069, -9.95363],
                                                                    [-11.25815, -14.27392],
                                                                    [-18.14079, -14.4781 ],
                                                                    [-16.89275, -18.62696],
                                                                    [-19.45549, -20.17289],
                                                                    [-21.53772, -23.283  ],
                                                                    [-23.4927, -26.69119],
                                                                    [-25.84891, -26.73817],
                                                                    [-28.12237, -29.92402]]))
    print("Your value of alpha is:", np.round(alpha, 5))

def backward_test():
    model = HMM(2, 10)
    model.A = np.array([[0.79034887, 0.20965113],
                        [0.66824331, 0.33175669]])
    model.B = np.array([[0.08511814, 0.06627238, 0.08487461, 0.15607959, 0.00124582, 0.12984083, 0.11164849, 0.11591902, 0.15232716, 0.09667395],
                        [0.18425462, 0.14326559, 0.14026994, 0.0215989,  0.17687124, 0.04681278, 0.05857451, 0.17451212, 0.00473382, 0.04910648]])
    model.pi = np.array([0.77480039, 0.22519961])
    obs = [1, 8, 0, 0, 3, 4, 5, 2, 6, 3, 7, 9]
    beta = model.backward(obs)

    print("The result of the backward function should be", np.array([[-25.42937, -25.58918], 
                                                                     [-23.32164, -23.19959],
                                                                     [-21.11007, -21.02033],
                                                                     [-18.82215, -18.94381],
                                                                     [-16.78523, -16.33951],
                                                                     [-13.42847, -13.51924],
                                                                     [-11.24815, -11.19161],
                                                                     [ -8.88679,  -8.96441],
                                                                     [ -6.57374,  -6.70985],
                                                                     [ -4.51873,  -4.47419],
                                                                     [ -2.44529,  -2.51463],
                                                                     [  0, 0]]))

    print("Your value of beta is:", np.round(beta, 5))


def forward_backward_test():
    model = HMM(2, 10)
    model.A = np.array([[0.79034887, 0.20965113],
                        [0.66824331, 0.33175669]])
    model.B = np.array([[0.08511814, 0.06627238, 0.08487461, 0.15607959, 0.00124582, 0.12984083, 0.11164849, 0.11591902, 0.15232716, 0.09667395],
                        [0.18425462, 0.14326559, 0.14026994, 0.0215989,  0.17687124, 0.04681278, 0.05857451, 0.17451212, 0.00473382, 0.04910648]])
    model.pi = np.array([0.77480039, 0.22519961])
    obs = [1, 8, 0, 0, 3, 4, 5, 2, 6, 3, 7, 9]
    logprob, xi, gamma = model.forward_backward(obs)

    print("The value of logprob should be:", -27.9693)
    print("Your value of logprob is:", np.round(logprob, 5))

    print("The value of xi should be:", np.array([[[0.64523, 0.00601],
                                                  [0.34278, 0.00598]],

                                                 [[0.60684, 0.38117],
                                                  [0.00551, 0.00648]],

                                                 [[0.40595, 0.2064 ],
                                                  [0.19863, 0.18902]],

                                                 [[0.5718,  0.03278],
                                                  [0.35711, 0.03831]],

                                                 [[0.02625, 0.90266],
                                                  [0.00109, 0.07   ]],

                                                 [[0.02482, 0.00251],
                                                  [0.81777, 0.15489]],

                                                 [[0.59943, 0.24316],
                                                  [0.08947, 0.06793]],

                                                 [[0.6143,  0.07461],
                                                  [0.25347, 0.05762]],

                                                 [[0.8357,  0.03207],
                                                  [0.12337, 0.00886]],

                                                 [[0.69872, 0.26034],
                                                  [0.02412, 0.01682]],

                                                 [[0.63701, 0.08583],
                                                  [0.22134, 0.05582]]]))
    print("Your value of xi is:", np.round(xi, 5))

    print("The value of gamma should be:", np.array([[0.65124, 0.34876],
                                                    [0.98802, 0.01198],
                                                    [0.61235, 0.38765],
                                                    [0.60458, 0.39542],
                                                    [0.92891, 0.07109],
                                                    [0.02733, 0.97267],
                                                    [0.8426,  0.1574 ],
                                                    [0.68891, 0.31109],
                                                    [0.86777, 0.13223],
                                                    [0.95906, 0.04094],
                                                    [0.72284, 0.27716],
                                                    [0.85835, 0.14165]]))

    print("Your value of gamma is:", np.round(gamma, 5))

def baum_welch_update_test():
    model = HMM(4, 10)
    
    model.A = np.array([[0.05263151, 0.62161178, 0.06683182, 0.25892489],
                        [0.26993274, 0.13114741, 0.32305468, 0.27586517],
                        [0.2951958,  0.14576492, 0.22474111, 0.33429817],
                        [0.29586018, 0.26065884, 0.1977772,  0.24570378]])
    
    model.B = np.array([[0.01800425, 0.09767131, 0.17824799, 0.12586453, 0.19514548, 0.05433139, 0.01995667, 0.12985343, 0.01884263, 0.16208232],
                        [0.04512782, 0.09469685, 0.1426164,  0.13851362, 0.08717793, 0.17152532, 0.08746939, 0.04900339, 0.05315859, 0.13071069],
                        [0.11055806, 0.10592473, 0.0051817,  0.07721441, 0.21761783, 0.20323146, 0.18881598, 0.00584989, 0.00682669, 0.07877924],
                        [0.08711377, 0.16703645, 0.0706214,  0.05297571, 0.10486868, 0.16794587, 0.13562053, 0.15729142, 0.03345308, 0.02307309]])
    
    model.pi = np.array([0.21186864, 0.27156561, 0.37188523, 0.14468051])
    
    corpus = np.array([[7,3,2,5,0,3,2,9,4,2], [7,3,2,4,2,8,7,5,0,8], [7,3,2,3,1,7,3,8,6,7], [7,3,2,6,4,4,3,4,0,0]])

    model.learn_unsupervised(corpus, 200)

    print("hmm.A should be", np.array([[0, 1, 0, 0], 
                                     [0.14122, 0, 0.27099, 0.58779], 
                                     [0.20671, 0, 0, 0.79329], 
                                     [0, 0.90909, 0.09091, 0]]))
    print("Your implementation has hmm.A to be", np.round(model.A, 5))

    print("hmm.B should be", np.array([[0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
                                              [0.0625, 0, 0, 0.5, 0, 0.125, 0.125, 0, 0.125, 0.0625],
                                              [0, 0.20671, 0, 0, 0.79329, 0, 0, 0, 0, 0],
                                              [0.24667, 0, 0.57555, 0, 0.09556, 0, 0, 0, 0.08222, 0]]))
    print("Your implementation has hmm.B to be", np.round(model.B, 5))

    print("hmm.pi should be", np.array([1, 0, 0, 0]))

    print("Your implementation has hmm.pi to be", np.round(model.pi, 5))

def end_to_end_test():
    # Test Case 1

    corpus = np.array([[0,3,0,3,0,3,0,3,0,3,0,3], [0,2,0,2,0,2,0,2,0,2,0,2,0], [1,2,1,2,1,2,1,2,1,2,1,2],[1,3,1,3,1,3,1,3,1,3]])
    hmm = HMM(num_states=2,num_words=4)
    hmm.learn_unsupervised(corpus, 10)
    print("After this test case, hmm.A should either be approximately,",  np.array([[0, 1], [1, 0]]))
    print("This is your current value of hmm.A: ", np.round(hmm.A, 5))

    print("After this test case, hmm.B should either be approximately,", np.array([[0, 0, 0.5, 0.5], [0.5, 0.5, 0, 0]]), " or it should be ", np.array([[0.5, 0.5, 0, 0], [0, 0, 0.5, 0.5]]))
    print("This is your current value of hmm.B: ", np.round(hmm.B, 5))

    # Test Case 2

    corpus = np.array([[0,0,0,0,0,0,0,0,0,0], [1,1,1,1,1,1,1,1,1,1], [2,2,2,2,2,2,2,2,2,2]])
    hmm = HMM(num_states=3, num_words=3)
    hmm.learn_unsupervised(corpus, 100)
    print("After this test case, hmm.A should be the identity matrix", np.eye(3))
    print("This is your current value of hmm.A: ", np.round(hmm.A, 5))

    print("After this test case, hmm.B should be some 3 by 3 permutation matrix")
    print("This is your current value of hmm.B: ", np.round(hmm.B, 5))

## Test Cases

The following are test cases that are meant to help you debug your code. The code involves six test suites - an initialization test, a forward test, a backward test, a forward_backward test, a baum_welch_update test, and a final end_to_end test.

## Test

To actually run the test cases, run the cell below:

In [7]:
init_test()
forward_test()
backward_test()
forward_backward_test()
baum_welch_update_test()
end_to_end_test()

"""
Note: The end_to_end_test is not as robustg due to it using random starts. Try
running the test case a few times to see if you get a good result at least a few
times before deciding that your code is buggy.
"""

The result of the forward function should be [[ -2.96913  -3.43382]
 [ -4.66005  -9.19418]
 [ -7.35001  -7.89695]
 [ -9.65069  -9.95363]
 [-11.25815 -14.27392]
 [-18.14079 -14.4781 ]
 [-16.89275 -18.62696]
 [-19.45549 -20.17289]
 [-21.53772 -23.283  ]
 [-23.4927  -26.69119]
 [-25.84891 -26.73817]
 [-28.12237 -29.92402]]
Your value of alpha is: [[ -2.96913  -3.43382]
 [ -4.66005  -9.19418]
 [ -7.35001  -7.89695]
 [ -9.65069  -9.95363]
 [-11.25815 -14.27392]
 [-18.14079 -14.4781 ]
 [-16.89275 -18.62697]
 [-19.45549 -20.17289]
 [-21.53772 -23.283  ]
 [-23.4927  -26.69119]
 [-25.84891 -26.73817]
 [-28.12237 -29.92402]]
The result of the backward function should be [[-25.42937 -25.58918]
 [-23.32164 -23.19959]
 [-21.11007 -21.02033]
 [-18.82215 -18.94381]
 [-16.78523 -16.33951]
 [-13.42847 -13.51924]
 [-11.24815 -11.19161]
 [ -8.88679  -8.96441]
 [ -6.57374  -6.70985]
 [ -4.51873  -4.47419]
 [ -2.44529  -2.51463]
 [  0.        0.     ]]
Your value of beta is: [[-25.42937 -25.58918]
 [-23.32



log-likelihood -59.65907612785483
log-likelihood -58.78006168233148
log-likelihood -58.05638622904577
log-likelihood -58.047573998672526




log-likelihood -58.04756024248874
log-likelihood -58.04756014666195
log-likelihood -58.04756014303575
log-likelihood -58.04756014156317
log-likelihood -58.04756014077336
log-likelihood -58.047560140347095
log-likelihood -58.04756014011701
log-likelihood -58.04756013999282
log-likelihood -58.047560139925785
log-likelihood -58.04756013988958
log-likelihood -58.04756013987006
log-likelihood -58.04756013985951
log-likelihood -58.04756013985383
log-likelihood -58.04756013985076
hmm.A should be [[0.      1.      0.      0.     ]
 [0.14122 0.      0.27099 0.58779]
 [0.20671 0.      0.      0.79329]
 [0.      0.90909 0.09091 0.     ]]
Your implementation has hmm.A to be [[0.      1.      0.      0.     ]
 [0.14122 0.      0.27099 0.58779]
 [0.20671 0.      0.      0.79329]
 [0.      0.90909 0.09091 0.     ]]
hmm.B should be [[0.      0.      0.      0.      0.      0.      0.      1.      0.
  0.     ]
 [0.0625  0.      0.      0.5     0.      0.125   0.125   0.      0.125
  0.0625 ]
 [0.     



log-likelihood -16.777820841440715
log-likelihood -15.77248611608586
log-likelihood -15.772486116083344
log-likelihood -15.772486116083343
log-likelihood -15.772486116083341
log-likelihood -15.772486116083341
log-likelihood -15.772486116083343
log-likelihood -15.772486116083344
log-likelihood -15.772486116083343
After this test case, hmm.A should be the identity matrix [[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]
This is your current value of hmm.A:  [[1.      0.      0.     ]
 [0.      0.1047  0.8953 ]
 [0.      0.58468 0.41532]]
After this test case, hmm.B should be some 3 by 3 permutation matrix
This is your current value of hmm.B:  [[1.  0.  0. ]
 [0.  0.5 0.5]
 [0.  0.5 0.5]]


'\nNote: The end_to_end_test is not as robustg due to it using random starts. Try\nrunning the test case a few times to see if you get a good result at least a few\ntimes before deciding that your code is buggy.\n'

## Training

Train a model:

In [None]:
tokenizer = lab_util.Tokenizer()
tokenizer.fit(train_reviews)
train_reviews_tk = tokenizer.tokenize(train_reviews)
print(tokenizer.vocab_size)

hmm = HMM(num_states=10, num_words=tokenizer.vocab_size)
hmm.learn_unsupervised(train_reviews_tk, 10)

Let's look at some of the words associated with each hidden state:

In [16]:
for i in range(hmm.num_states):
    most_probable = np.argsort(hmm.B[i, :])[:10]
    print(f"state {i}")
    for o in most_probable:
        print(tokenizer.token_to_word[o], hmm.B[i, o])
    print()

state 0
states 3.1126533519085174e-08
ramen 5.5676898592441945e-08
75 7.898861623322146e-08
watered 2.656283188552594e-07
chunks 3.72611444393929e-07
sparkling 3.9483849173647634e-07
anyway 4.5119796180180973e-07
soaked 4.795532431975632e-07
ice 5.285933133650525e-07
wont 5.7574604262563e-07

state 1
wide 1.5247667858903364e-07
companies 1.9194562658638927e-07
shows 2.822460146045568e-07
yellow 3.5496644457109357e-07
measure 3.986421068729611e-07
bodied 4.0022421217557886e-07
support 5.618912749824626e-07
sweetness 6.076348488701063e-07
homemade 6.14790060145347e-07
supply 6.807566447895067e-07

state 2
common 2.060077135398386e-08
puck 9.8955626830216e-08
talk 1.6479360901811192e-07
wife 1.805859425492689e-07
flax 2.0091580608788177e-07
york 2.449951020895544e-07
blueberries 3.753501949884209e-07
cheese 4.1513537600984343e-07
apart 4.5944605776258e-07
important 4.958409694512951e-07

state 3
possible 1.3885248976967314e-07
stated 1.8242212805906483e-07
offer 3.364614389774708e-07
lb 3

We can also look at some samples from the model!

In [17]:
for i in range(10):
    print(tokenizer.de_tokenize([hmm.generate(10)]))

['crisp . to sell <unk> baby beans <unk> . ,']
['as stick . of you delicious the but a you']
['water this bad selling <unk> anyone very of it due']
['know of 100 have , <unk> best with blend as']
['able know and cheaper to enjoyed is not single surprised']
["cracker . spent <unk> i've br <unk> to coffee ,"]
['salt but . for review you ones like i peanuts']
["flavors they reviewers it 50 won't <unk> rice <unk> few"]
["it recipe the . br tolerate . . there's sandwich"]
["way <unk> didn't in readily added <unk> find many claim"]


Finally, let's repeat the classification experiment from Parts 1 and 2, using the _vector of expected hidden state counts_ as a sentence representation.

(Warning! results may not be the same as in earlier versions of this experiment.)

In [18]:
def train_model(xs_featurized, ys):
  import sklearn.linear_model
  model = sklearn.linear_model.LogisticRegression()
  model.fit(xs_featurized, ys)
  return model

def eval_model(model, xs_featurized, ys):
  pred_ys = model.predict(xs_featurized)
  print("test accuracy", np.mean(pred_ys == ys))

def training_experiment(name, featurizer, n_train):
    print(f"{name} features, {n_train} examples")
    train_xs = np.array([
        hmm_featurizer(review) 
        for review in tokenizer.tokenize(train_reviews[:n_train])
    ])
    train_ys = train_labels[:n_train]
    test_xs = np.array([
        hmm_featurizer(review)
        for review in tokenizer.tokenize(test_reviews)
    ])
    test_ys = test_labels
    model = train_model(train_xs, train_ys)
    eval_model(model, test_xs, test_ys)
    print()

def hmm_featurizer(review):
    _, _, gamma = hmm.forward_backward(review)
    return gamma.sum(axis=0)

training_experiment("hmm", hmm_featurizer, n_train=100)

hmm features, 100 examples




test accuracy 0.504



## Experiments for Part 3

In [43]:
# (a)

hmm = HMM(num_states=8,num_words=4)

hmm.B = np.array([[1.0, 0.0, 0.0, 0.0],
                  [0.0, 0.0, 1.0, 0.0],
                  [1.0, 0.0, 0.0, 0.0],
                  [0.0, 0.0, 0.0, 1.0],
                  [0.0, 1.0, 0.0, 0.0],
                  [0.0, 0.0, 1.0, 0.0],
                  [0.0, 1.0, 0.0, 0.0],
                  [0.0, 0.0, 0.0, 1.0]])

hmm.A = np.array([[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
                  [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
                  [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0],
                  [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
                  [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0],
                  [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0],
                  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
                  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0]])
    
hmm.pi = np.array([0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0])

corpus = np.array([[0,2,0,2,0,2,0,2,0,2,0,2,0], [0,3,0,3,0,3,0,3,0,3,0,3] , [1,2,1,2,1,2,1,2,1,2,1,2] , [1,3,1,3,1,3,1,3,1,3]])
hmm.learn_unsupervised(corpus, 10)

for i in range(10):
    print([hmm.generate(12)])



log-likelihood -5.545177444479562
[[0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3]]
[[1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2]]
[[0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2]]
[[0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2]]
[[0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3]]
[[0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3]]
[[0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2]]
[[1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3]]
[[0, 3, 0, 3, 0, 3, 0, 3, 0, 3, 0, 3]]
[[0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2]]


In [None]:
# (b)

hmm = HMM(num_states=2, num_words=tokenizer.vocab_size)
hmm.learn_unsupervised(train_reviews_tk, 10)


In [344]:
for i in range(hmm.num_states):
    most_probable = np.argsort(hmm.B[i, :])[:10]
    print(f"state {i}")
    for o in most_probable:
        print(tokenizer.token_to_word[o], hmm.B[i, o])
    print()

state 0
brewers 2.1576134035867762e-07
wasted 4.590910697092427e-07
supply 6.81768673954738e-07
gummy 6.894822864827506e-07
starting 7.038081493451297e-07
watchers 9.385700292123929e-07
cons 1.0843458349946028e-06
clam 1.3275130925162067e-06
alive 1.6246034620661128e-06
rica 1.6621196323391283e-06

state 1
specifically 8.823358162344053e-08
york 8.449136389790493e-07
picture 9.550954392705242e-07
classic 1.069369174615933e-06
floor 1.1870732085541305e-06
missing 1.2749401201574024e-06
costa 1.4059376265785708e-06
consistency 1.4399323515847144e-06
bean 1.6671276422830663e-06
mg 1.7301852453922772e-06



In [None]:
hmm = HMM(num_states=10, num_words=tokenizer.vocab_size)
hmm.learn_unsupervised(train_reviews_tk, 10)


In [308]:
for i in range(hmm.num_states):
    most_probable = np.argsort(hmm.B[i, :])[:10]
    print(f"state {i}")
    for o in most_probable:
        print(tokenizer.token_to_word[o], hmm.B[i, o])
    print()

state 0
hold 3.1692767845658185e-07
piece 3.7320842677484414e-07
meals 3.832428174569911e-07
ordering 5.514287899882275e-07
besides 5.851982333238589e-07
decaffeinated 6.869341162627688e-07
carried 7.238168477727419e-07
version 7.879504452803143e-07
therefore 8.578346259394604e-07
okay 1.0260794750811618e-06

state 1
hardly 1.379604538870301e-08
instant 3.6003081527778506e-07
constantly 3.601173470155586e-07
amazon's 4.880752114344526e-07
guy 6.358949992651551e-07
palatable 6.441858922017754e-07
gatorade 6.502888513414543e-07
forever 1.075177184993607e-06
point 1.1667411817019264e-06
unique 1.2256078580851735e-06

state 2
packages 2.2513328451349182e-08
backyard 9.678987402616077e-08
lightly 1.273592440885765e-07
crush 1.7122761581226615e-07
flakes 1.8336624775500517e-07
buyer 2.1448159407158483e-07
careful 2.1747838958950626e-07
substitute 2.7476010196510193e-07
individually 3.7533524336208485e-07
consumed 3.9856736921414855e-07

state 3
hadn't 4.673765526814337e-08
switched 1.6889309

In [None]:
hmm = HMM(num_states=100, num_words=tokenizer.vocab_size)
hmm.learn_unsupervised(train_reviews_tk, 10)

In [310]:
for i in range(hmm.num_states):
    most_probable = np.argsort(hmm.B[i, :])[:10]
    print(f"state {i}")
    for o in most_probable:
        print(tokenizer.token_to_word[o], hmm.B[i, o])
    print()

state 0
concerned 7.834324874966821e-07
http 9.736727369970197e-07
she 1.00838422700543e-06
flax 1.706360580137425e-06
say 1.7292599519737597e-06
breath 2.048424775461733e-06
sad 2.974206666336067e-06
affordable 3.7242045490273987e-06
cleaning 3.738589207538128e-06
meal 4.222284252848407e-06

state 1
mean 1.1005367718001801e-07
forget 2.2774155583861302e-07
major 1.2828608568920328e-06
before 1.3146743601225886e-06
shows 2.1636252340084515e-06
seemed 2.373701986497841e-06
make 3.321781253893971e-06
support 3.5586848923976144e-06
mouth 3.817575094135159e-06
over 4.056026084873231e-06

state 2
similar 2.7096778434621694e-07
perfect 5.443023611312213e-07
seemed 6.168712164311385e-07
although 1.1821268421254969e-06
truffles 1.2413704902493538e-06
treats 1.9551209431559845e-06
able 2.75696963625808e-06
watch 2.9002734310761193e-06
especially 3.3934385338790422e-06
blend 3.415247725107145e-06

state 3
you've 2.5639895246226733e-07
name 5.100975783159069e-07
alternative 8.920324166163526e-07


In [None]:
# (c)

hmm = HMM(num_states=10, num_words=tokenizer.vocab_size)
hmm.learn_unsupervised(train_reviews_tk, 10)

In [None]:
# Your code here!
training_experiment("hmm", hmm_featurizer, n_train=100)

In [19]:
# Your code here!
training_experiment("hmm", hmm_featurizer, n_train=500)

hmm features, 500 examples




test accuracy 0.6



In [20]:
# Your code here!
training_experiment("hmm", hmm_featurizer, n_train=1000)

hmm features, 1000 examples




test accuracy 0.602



In [21]:
# Your code here!
training_experiment("hmm", hmm_featurizer, n_train=2000)

hmm features, 2000 examples




test accuracy 0.63



In [23]:
# Your code here!
training_experiment("hmm", hmm_featurizer, n_train=5000)

hmm features, 5000 examples




test accuracy 0.632

