The purpose of this notebook is to experiment with part-of-speech tagging using a Hidden Markov Model and the Viterbi algorithm. We train the model on the Brown Corpus.

In [84]:
import numpy as np
import pandas as pd
import nltk
import math

# Download the Brown corpus and the universal tagset resource used below.
for resource_id in ('brown', 'universal_tagset'):
    nltk.download(resource_id)
from nltk.corpus import brown

# Tagged sentences of the corpus: each sentence is a sequence of (word, tag)
# pairs, with tags taken from the 'universal' tagset.
brown_tagged_sents = brown.tagged_sents(tagset='universal')

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\keega\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\keega\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


# Brown corpus


In [85]:
# Collect every distinct part-of-speech tag that occurs in the tagged corpus.
parts_of_speech = {
    pos_tag
    for tagged_sentence in brown_tagged_sents
    for _token, pos_tag in tagged_sentence
}
print(parts_of_speech)

{'X', 'PRON', 'CONJ', '.', 'ADV', 'VERB', 'NUM', 'DET', 'ADP', 'NOUN', 'ADJ', 'PRT'}


See [universal POS tags readme](https://github.com/slavpetrov/universal-pos-tags/blob/master/README.md)

In [86]:
# Reload the tagged sentences and wrap each one in sentinel pairs:
# ('START', 'START') in front and ('END', 'END') behind. 'START' and 'END'
# therefore act both as pseudo-words and as pseudo-tags.
brown_tagged_sents = [
    [('START', 'START'), *tagged_sentence, ('END', 'END')]
    for tagged_sentence in brown.tagged_sents(tagset='universal')
]

In [87]:
# Flatten the padded sentences into one long list of (word, tag) pairs.
brown_word_tags = [
    pair
    for tagged_sentence in brown_tagged_sents
    for pair in tagged_sentence
]
# Same pairs with the order swapped to (tag, word), so the tag comes first.
tag_word_pairs = [(pos_tag, token) for token, pos_tag in brown_word_tags]


In [88]:
# Conditional frequency distribution built from the (tag, word) pairs:
# the tag is the condition, the word is the counted sample.
cfd_word_given_tag = nltk.ConditionalFreqDist(
    tag_word_pairs
)
# Emission probabilities P(word | tag), estimated from those counts with
# nltk's MLEProbDist.
cpd_emission = nltk.ConditionalProbDist(
    cfd_word_given_tag,
    nltk.MLEProbDist,
)


In [89]:
# Flat sequence of tags, in corpus order, taken from the (word, tag) pairs.
tags = [tag for word, tag in brown_word_tags]

# Pairs of consecutive tags (previous_tag, next_tag) used to estimate the
# transition probabilities. Because `tags` is the whole corpus flattened, the
# 'END' of one sentence is directly followed by the 'START' of the next one;
# such pairs are not real transitions, so every pair whose first tag is 'END'
# is skipped.
tag_pairs = [
    (prev_tag, next_tag)
    for prev_tag, next_tag in zip(tags, tags[1:])
    if prev_tag != 'END'
]


In [90]:
# Conditional frequency distribution over consecutive tag pairs:
# the previous tag is the condition, the following tag is the counted sample.
cfd_tag_transitions = nltk.ConditionalFreqDist(
    tag_pairs
)

# Transition probabilities P(next_tag | previous_tag), estimated from those
# counts with nltk's MLEProbDist. This is the transition model passed to
# viterbi() further down.
cpd_tag_transitions = nltk.ConditionalProbDist(
    cfd_tag_transitions,
    nltk.MLEProbDist,
)


# Viterbi Algorithm

In [91]:

def viterbi(observed_words, cpd_tag_transitions, cpd_emission, states,
            epsilon=1e-100, start_state='START', end_state='END'):
    """Find the highest-scoring tag sequence for a sentence with Viterbi decoding.

    Scores are sums of natural-log probabilities. ``epsilon`` is added to
    every probability before the log is taken, so a zero probability yields a
    very negative score instead of a math domain error.

    Args:
        observed_words: Sequence of words to tag (indexed and measured with
            ``len``).
        cpd_tag_transitions: Mapping ``previous_tag -> dist`` where
            ``dist.prob(next_tag)`` returns the transition probability.
        cpd_emission: Mapping ``tag -> dist`` where ``dist.prob(word)``
            returns the emission probability.
        states: Candidate tags. ``start_state`` and ``end_state`` may be
            present; they are filtered out and never assigned to a word.
        epsilon: Small constant added to each probability before ``log``.
        start_state: Marker used as the predecessor of the first word.
        end_state: Marker used as the successor of the last word.

    Returns:
        A tuple ``(log_prob, tag_sequence)``: the log-score of the best path
        (including the final transition to ``end_state``) and the list of
        tags, one per observed word. For an empty ``observed_words`` the
        sequence is ``[]`` and the score is the log of the direct
        ``start_state -> end_state`` transition.

    Raises:
        ValueError: If ``states`` contains no tag besides the two markers.
    """
    def _log(probability):
        # Smoothed log so that log(0) never occurs.
        return math.log(probability + epsilon)

    # Only real tags may label a word; the markers are handled separately.
    tag_states = [s for s in states if s not in (start_state, end_state)]
    if not tag_states:
        raise ValueError(
            "states must contain at least one tag besides the start/end markers"
        )

    # Empty sentence: nothing to tag, only the start -> end transition remains.
    if len(observed_words) == 0:
        return (_log(cpd_tag_transitions[start_state].prob(end_state)), [])

    # Base case (t == 0): transition out of the start marker plus emission.
    first_word = observed_words[0]
    scores = {
        tag: _log(cpd_tag_transitions[start_state].prob(tag))
        + _log(cpd_emission[tag].prob(first_word))
        for tag in tag_states
    }
    paths = {tag: [tag] for tag in tag_states}

    # Recursion (t > 0): extend the best path into every current state.
    for t in range(1, len(observed_words)):
        word = observed_words[t]
        new_scores = {}
        new_paths = {}

        for cur_state in tag_states:
            # The emission term does not depend on the previous state, so it
            # is computed once per (word, state) instead of once per pair.
            log_emission = _log(cpd_emission[cur_state].prob(word))

            # Ties on the score fall back to comparing the state names.
            best_score, best_prev = max(
                (
                    scores[prev_state]
                    + _log(cpd_tag_transitions[prev_state].prob(cur_state))
                    + log_emission,
                    prev_state,
                )
                for prev_state in tag_states
            )

            new_scores[cur_state] = best_score
            new_paths[cur_state] = paths[best_prev] + [cur_state]

        # Only the most recent column is needed for the next step.
        scores, paths = new_scores, new_paths

    # Termination: account for the transition from the last tag to the end marker.
    best_score, best_last = max(
        (scores[tag] + _log(cpd_tag_transitions[tag].prob(end_state)), tag)
        for tag in tag_states
    )
    return (best_score, paths[best_last])


In [92]:
nltk.download('punkt')
from nltk.tokenize import word_tokenize
# Set of all word strings in the Brown corpus, used for membership checks
# (string comparison, so the check is case-sensitive).
brown_words = set(brown.words())

# Two hand-tokenised demo sentences.
sentence1 = "the cat jumped over it".split()
sentence2 = "she quickly ran towards the old house near the river bank".split()

# Confirm that every token of each demo sentence occurs in the corpus.
all_in_corpus = brown_words.issuperset(sentence1)
print(f"First Sentance all in brown corpus: {all_in_corpus}")
all_in_corpus = brown_words.issuperset(sentence2)
print(f"Second Sentance all in brown corpus: {all_in_corpus}")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\keega\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


First Sentance all in brown corpus: True
Second Sentance all in brown corpus: True


In [93]:
# Candidate states: every tag observed in the corpus plus the two sentence
# markers. Deriving the list from `parts_of_speech` keeps it complete; a
# hand-written list previously left out the '.' (punctuation) tag.
states = sorted(parts_of_speech) + ['START', 'END']

# Tag each demo sentence. The value printed as "Probability" is the
# log-probability returned by viterbi(), hence the negative numbers.
for sentence_index, demo_sentence in enumerate((sentence1, sentence2)):
    if sentence_index > 0:
        print("-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-")
    (prob, sequence) = viterbi(demo_sentence, cpd_tag_transitions, cpd_emission, states)
    print(f"Probability of the best tag sequence: {prob}")
    print(f"Best tag sequence: {sequence}")

Probability of the best tag sequence: -43.43242339319502
Best tag sequence: ['DET', 'NOUN', 'VERB', 'ADP', 'PRON']
-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
Probability of the best tag sequence: -82.27443959781921
Best tag sequence: ['PRON', 'ADV', 'VERB', 'ADP', 'DET', 'ADJ', 'NOUN', 'ADP', 'DET', 'NOUN', 'NOUN']
