In [1]:
# You should not modify code in this cell
import sys
import nltk
from nltk.corpus import treebank

# Get numsents POS-tagged sentences from the treebank corpus
def get_pos_data(numsents):

    # Extract required number of sentences
    sentences = treebank.tagged_sents()[:numsents]

    # Initialize
    sequences = []
    symbols = set()
    tag_set = set()
    
    # Go over each extracted sentence ...
    for sentence in sentences:
        for i in range(len(sentence)):
            word, tag = sentence[i]
            word = word.lower()  # normalize case
            symbols.add(word)    # add this word
            tag_set.add(tag)
            sentence[i] = (word, tag)  # store tagged token
        sequences.append(sentence)

    # Return sequences, the list of tags and all the words that we saw
    return sequences, list(tag_set), list(symbols)

# Train the transition and emission probabilities
def train():
    print('Training HMM...')

    # Use the first 5000 sentences from treebank corpus
    labelled_sequences, states, symbols = get_pos_data(5000)
    
    # Define the estimator to be used for probability computation
    estimator = lambda fd, bins: nltk.LidstoneProbDist(fd, 0.1, bins)
    
    # count occurences of starting states, transitions out of each state
    # and output symbols observed in each state
    freq_starts = nltk.FreqDist()
    freq_transitions = nltk.ConditionalFreqDist()
    freq_emissions = nltk.ConditionalFreqDist()
    for sequence in labelled_sequences:
        lasts = None
        for token in sequence:
            state = token[1]
            symbol = token[0]
            if lasts == None:
                freq_starts[state] += 1
            else:
                freq_transitions[lasts][state] += 1
            freq_emissions[state][symbol] += 1
            lasts = state

            # update the state and symbol lists
            if state not in states:
                states.append(state)
            if symbol not in symbols:
                symbols.append(symbol)

    # create probability distributions (with smoothing)
    N = len(states)
    starts = estimator(freq_starts, N)
    transitions = nltk.ConditionalProbDist(freq_transitions, estimator, N)
    emissions = nltk.ConditionalProbDist(freq_emissions, estimator, len(symbols))
                               
    # Return the transition and emissions probabilities along with 
    # the list of all the states and output symbols
    return starts, transitions, emissions, states, symbols

In [2]:
from numpy import zeros, array, float32, int16, argmax
from math import log, exp

# call the train function
priors, transitions, emissions, states, symbols = train()
# suggestion: inspect these five variables to get a sense of the data and data structures

Training HMM...


In [3]:
def decode(states, symbols):
        # VITERBI DECODING
        T = len(symbols) # sentence
        N = len(states)
        V = zeros((T, N), float32)
        B = {}


        for t in range(T):
            symbol = symbols[t]
            
            #initialization step
            if t == 0:
                for i in xrange(N):
                    state = self.states[i]
                    V[t, i] = self.priors.prob(state) * \
                              self.emissions[state].prob(symbol)
                    B[t, state] = None
            else:
                #recursion step
                for j in xrange(N):
                    sj = self.states[j]
                    best = None
                    for i in range(N):
                        si = self.states[i]
                        va = V[t-1, i] * self.transitions[si].prob(sj)
                        if not best or va > best[0]:
                            best = (va, si)
                    #termination steps
                    V[t, j] = best[0] * self.emissions[sj].prob(symbol)
                    B[t, sj] = best[1]

        best = None
        for i in xrange(N):
            val = V[T-1, i]
            if not best or val > best[0]:
                best = (val, self.states[i])


        current = best[1]
        sequence = [current]
        for t in xrange(T-1, 0, -1):
            last = B[t, current]
            sequence.append(last)
            current = last

        sequence.reverse()
        return sequence

In [5]:
list= decode(states, 'if it were a hollywood movie , you never believe it .')
list

AttributeError: 'list' object has no attribute 'states'