In [1]:
# You should not modify code in this cell
import sys
import nltk
from nltk.corpus import treebank

# Get numsents POS-tagged sentences from the treebank corpus
def get_pos_data(numsents):

    # Extract required number of sentences
    sentences = treebank.tagged_sents()[:numsents]

    # Initialize
    sequences = []
    symbols = set()
    tag_set = set()
    
    # Go over each extracted sentence ...
    for sentence in sentences:
        for i in range(len(sentence)):
            word, tag = sentence[i]
            word = word.lower()  # normalize case
            symbols.add(word)    # add this word
            tag_set.add(tag)
            sentence[i] = (word, tag)  # store tagged token
        sequences.append(sentence)

    # Return sequences, the list of tags and all the words that we saw
    return sequences, list(tag_set), list(symbols)

# Train the transition and emission probabilities
def train():
    print('Training HMM...')

    # Use the first 5000 sentences from treebank corpus
    labelled_sequences, states, symbols = get_pos_data(5000)
    
    # Define the estimator to be used for probability computation
    estimator = lambda fd, bins: nltk.LidstoneProbDist(fd, 0.1, bins)
    
    # count occurences of starting states, transitions out of each state
    # and output symbols observed in each state
    freq_starts = nltk.FreqDist()
    freq_transitions = nltk.ConditionalFreqDist()
    freq_emissions = nltk.ConditionalFreqDist()
    for sequence in labelled_sequences:
        lasts = None
        for token in sequence:
            state = token[1]
            symbol = token[0]
            if lasts == None:
                freq_starts[state] += 1
            else:
                freq_transitions[lasts][state] += 1
            freq_emissions[state][symbol] += 1
            lasts = state

            # update the state and symbol lists
            if state not in states:
                states.append(state)
            if symbol not in symbols:
                symbols.append(symbol)

    # create probability distributions (with smoothing)
    N = len(states)
    starts = estimator(freq_starts, N)
    transitions = nltk.ConditionalProbDist(freq_transitions, estimator, N)
    emissions = nltk.ConditionalProbDist(freq_emissions, estimator, len(symbols))
                               
    # Return the transition and emissions probabilities along with 
    # the list of all the states and output symbols
    return starts, transitions, emissions, states, symbols

In [2]:
from numpy import zeros, array, float32, int16, argmax
from math import log, exp

# call the train function
priors, transitions, emissions, states, symbols = train()
# suggestion: inspect these five variables to get a sense of the data and data structures



Training HMM...


In [3]:
states

['NNP',
 '-RRB-',
 'RBR',
 'DT',
 'NNS',
 'RP',
 'SYM',
 'WRB',
 'POS',
 '$',
 'LS',
 'VBP',
 'VBZ',
 "''",
 'WP',
 'PRP',
 'WDT',
 'EX',
 'CC',
 '.',
 'PDT',
 '-NONE-',
 'RB',
 'JJS',
 'VBN',
 'JJ',
 'MD',
 'VBD',
 '#',
 'VBG',
 'JJR',
 'UH',
 'RBS',
 ',',
 ':',
 'TO',
 'WP$',
 'CD',
 'NN',
 'PRP$',
 'FW',
 'VB',
 'IN',
 '``',
 '-LRB-',
 'NNPS']

In [19]:
for s in states:
    print(priors.logprob(s))

-2.339746459930586
-15.25805069363052
-10.303854383243644
-2.1141892117626604
-4.419634617833182
-15.25805069363052
-15.25805069363052
-7.286507139679747
-15.25805069363052
-9.585625351659024
-9.108303574125836
-15.25805069363052
-8.750256053431823
-11.798619074993221
-8.118499341231725
-3.998895924763679
-10.86573327085176
-7.840198178744622
-4.284353327325167
-15.25805069363052
-10.303854383243644
-5.576812281852715
-4.4840873251969615
-9.327313356067632
-9.108303574125836
-4.775242736903851
-11.798619074993221
-11.798619074993221
-15.25805069363052
-7.840198178744622
-8.339187456355925
-11.798619074993221
-10.86573327085176
-15.25805069363052
-8.463634827280414
-9.585625351659024
-15.25805069363052
-6.887363286823302
-4.492350205979418
-7.073175350722234
-15.25805069363052
-10.303854383243644
-2.955697367199316
-3.726181918452965
-9.108303574125836
-8.599839210878724


In [25]:
x= 0
if x==1:
    print('ass')

In [38]:
# write your viterbi code here
import re
import numpy as np
def decode(sentences):
    wordlist= re.findall(r"[\w']+|[.,!?;]", sentences)
    matrix= [[0 for x in range(len(sentences))] for y in range(len(states))]
    matrix=np.array(matrix, dtype=int)
    backtracker= [[0 for x in range(len(sentences))] for y in range(len(states))]
    backtracker=np.array(backtracker, dtype=int)
    i=0
    for s in states:
        matrix[i, 0]= priors.logprob(s) + emissions[s].logprob(wordlist[0])
        backtracker= i
        i+=1
    t=1
    for w in wordlist[1:]:
        c=0
        for s1 in states:
            j=0
            for s0 in states: 
                #a= matrix.max(axis=0)[t-1]
                val= matrix[j, 0] + transitions[s0].logprob(s1) + emissions[s1].logprob(w)
                if matrix[c, t] == 0:
                    matrix[c, t]= val
                if val > matrix[c, t]:
                    matrix[c, t]= val
                    backtracker[c, t]= j
                j+=1
            c+=1
        t+=1
    backL= []
    w= len(wordlist)
    b= len(states)
    
    maxT= -10000
    val=-10000
    for r in range(b):
        if maxT< matrix[r, w-1]:
            maxT= matrix[r, w-1]
            #print(maxT)
            val= r
    
    #a= matrix.max(axis=0)[wordlist[w-1]]
    #print(val)
    a= val
    for d in range((len(wordlist)-1), 0, -1):
        backL.insert(0, states[a])
        a= matrix[a, d]
    print(matrix)
    return backL
    
    


In [39]:
list= decode('if it were a hollywood movie , you never believe it .')
list

TypeError: 'int' object does not support item assignment

In [40]:
# open test-sentences
testSentenceFile = open('test-sentences.txt')
tagViterbi(testSentenceFile) # you need to define a tagViterbi function

NameError: name 'tagViterbi' is not defined

In [None]:
# eventually, open the hw-sentences
hwSentenceFile = open('hw-sentences.txt')
tagViterbi(hwSentenceFile) 