## Part-of-Speech Tagger

In [14]:
import pandas as pd
import numpy as np
import os
from numpy import genfromtxt

## Create transition probabilities

In [16]:
# Local Dropbox folder; the trailing slash lets relative paths be appended directly.
dropbox = os.path.expanduser('~/Dropbox/')

# Read the transition table from CSV into a float array.
tran = genfromtxt(
    fname=dropbox + 'NLP Readings/hw 1/test_POS_book_example/test_transitions.csv',
    delimiter=',',
)

In [17]:
# Display the loaded transition table for inspection.
tran

array([[  2.76700000e-01,   6.00000000e-04,   3.10000000e-03,
          4.53000000e-02,   4.49000000e-02,   5.10000000e-02,
          2.02600000e-01],
       [  3.77700000e-01,   1.10000000e-02,   9.00000000e-04,
          8.40000000e-03,   5.84000000e-02,   9.00000000e-03,
          2.50000000e-03],
       [  8.00000000e-04,   2.00000000e-04,   7.96800000e-01,
          5.00000000e-04,   8.00000000e-04,   1.69800000e-01,
          4.10000000e-03],
       [  3.22000000e-02,   5.00000000e-04,   5.00000000e-03,
          8.37000000e-02,   6.15000000e-02,   5.14000000e-02,
          2.23100000e-01],
       [  3.66000000e-02,   4.00000000e-04,   1.00000000e-04,
          7.33000000e-02,   4.50900000e-01,   3.60000000e-03,
          3.60000000e-03],
       [  9.60000000e-03,   1.76000000e-02,   1.40000000e-03,
          8.60000000e-03,   1.21600000e-01,   1.77000000e-02,
          6.80000000e-03],
       [  6.80000000e-03,   1.02000000e-02,   1.01100000e-01,
          1.01200000e-01,   1.20

## Create the observation likelihoods. Include all of them, and then index the ones you want

In [44]:
# Read the observation-likelihood table from CSV into a float array.
observations = genfromtxt(
    fname=dropbox + 'NLP Readings/hw 1/test_POS_book_example/test_observations.csv',
    delimiter=',',
)

In [45]:
# Swap the axes so that rows index observations and columns index states
# (the Viterbi cell reads observations[event, state]).
# NOTE(review): this rebinds `observations` to its own transpose, so the cell is
# not idempotent: re-running it without re-running the load cell flips the array back.
observations = observations.T
observations

array([[  3.20000000e-05,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00],
       [  0.00000000e+00,   3.08431000e-01,   2.80000000e-05,
          0.00000000e+00,   2.00000000e-04,   0.00000000e+00,
          0.00000000e+00],
       [  0.00000000e+00,   0.00000000e+00,   6.72000000e-04,
          3.40000000e-04,   2.23000000e-04,   1.04460000e-02,
          0.00000000e+00],
       [  4.80000000e-05,   0.00000000e+00,   0.00000000e+00,
          9.70000000e-05,   6.00000000e-06,   0.00000000e+00,
          5.06099000e-01],
       [  0.00000000e+00,   0.00000000e+00,   2.80000000e-05,
          0.00000000e+00,   2.33700000e-03,   0.00000000e+00,
          0.00000000e+00]])

## Create an empty Viterbi table (a NumPy array, not a DataFrame) with one row per hidden state and one column per observation

In [47]:
# Zero-filled table with the shape of the transposed observation matrix;
# the Viterbi cell further down writes the path probabilities into it.
df = np.zeros(observations.T.shape)

In [48]:
# Check the shape of the (already transposed) observation matrix.
observations.shape

(5, 7)

In [49]:
# Display the zero-initialised table.
df

array([[ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.]])

In [50]:
# Same shape check as in an earlier cell (duplicate).
observations.shape

(5, 7)

In [51]:
# NOTE(review): this cell (In [51]) displays `unobserved` before the cell that
# assigns it (In [52]). Unless the name is defined earlier, running the notebook
# top-to-bottom on a fresh kernel raises NameError here.
unobserved

['NNP', 'MD', 'VB', 'JJ', 'NN', 'RB', 'DT']

In [52]:
# Hidden states (part-of-speech tags). The Viterbi cell uses a tag's position in
# this list as its column index into `tran` and `observations`.
unobserved = [
    'NNP',
    'MD',
    'VB',
    'JJ',
    'NN',
    'RB',
    'DT',
]

# Observation sequence as 1-based row numbers of `observations`.
events = list(range(1, 6))

In [74]:
# Viterbi decoding: find the most likely sequence of hidden states (POS tags)
# for the observation sequence `events`.
#
# Inputs, all defined in earlier cells:
#   tran         - row 0 holds the start probabilities; row k+1 holds the
#                  transition probabilities out of state k (columns index states).
#   observations - observations[e, j] is the likelihood of observation e in state j.
#   unobserved   - list of hidden-state names; position == state index.
#   events       - 1-based row numbers into `observations`, one per time step.
n_states = len(unobserved)
n_steps = len(events)

# df is the Viterbi table: df[j, t] is the probability of the best path that is
# in state j at time t. It is (re)allocated here, with the same shape as in the
# earlier cell, so this cell does not depend on values left over from a previous run.
df = np.zeros(observations.transpose().shape)

# best_ind has the same shape as df.
# Each column corresponds to a time step t; each row corresponds to a state.
# best_ind[j, t] stores the index of the most likely previous state for state j at time t.
best_ind = np.zeros(observations.transpose().shape)

# Initialisation: in the first column the Viterbi value of each state is the
# product of its start probability and its observation likelihood.
first_event = events[0] - 1  # minus 1 because `events` is 1-based
for i in range(n_states):
    df[i, 0] = tran[0, i] * observations[first_event, i]
    # There is no previous state at t = 0; the backpointer is a placeholder that
    # backtracking never follows.
    best_ind[i, 0] = 0

# Recursion: move on column by column (observation by observation).
for t in range(1, n_steps):
    event = events[t] - 1  # minus 1 because the index into observations should start at 0
    for j in range(n_states):
        # the state observation likelihood of the current observation in state j
        obs_prob = observations[event, j]

        # Probability of arriving in state j from every possible previous state k:
        # previous Viterbi value * transition k -> j * observation likelihood.
        # k+1 because the first row of tran is the start probabilities.
        probs = [obs_prob * df[k, t - 1] * tran[k + 1, j] for k in range(n_states)]

        # Store the largest probability of moving into state j ...
        df[j, t] = max(probs)

        # ... and remember which previous state produced it.
        index = np.argmax(probs)
        best_ind[j, t] = index

# Termination step: `tran` only has one column per state (no end-of-sequence
# column), so the score of a complete path is simply its Viterbi value at the
# last time step. (Indexing tran with the time index T, as before, picked an
# unrelated state column.)
T = n_steps - 1
final_prob = [df[k, T] for k in range(n_states)]

# backtrack:
state_path = []
back = np.argmax(final_prob)  # The most likely state for our final observation
for i in reversed(range(n_steps)):
    # Add the state corresponding to back to the beginning of state_path list:
    state_path.insert(0, unobserved[back])

    # backtrack: get the most likely state for the previous
    # step based on this one:
    back = int(best_ind[back, i])

print("Path of most likely states:")
print(" -> ".join(state_path))
print()
print("Not that this is NOT (necessarily) the same as taking the max of each individual state column:")
# For comparison: the individually most probable state at each time step.
test = df.transpose()
best_unobserved = []
for t in range(n_steps):
    maximum = np.argmax(test[t])
    best_unobserved.append(unobserved[maximum])
print(" -> ".join(best_unobserved))


Path of most likely states:
NNP -> MD -> VB -> DT -> NN

Not that this is NOT (necessarily) the same as taking the max of each individual state column:
NNP -> MD -> RB -> DT -> NN
