In [None]:
import sys
import os

# Make the helper modules in ../scripts importable (adjust the path if the notebook moves).
# The membership check keeps this cell idempotent: re-running it must not keep
# appending duplicate entries to sys.path.
scripts_dir = os.path.abspath("../scripts")
if scripts_dir not in sys.path:
    sys.path.append(scripts_dir)

In [None]:
import pandas as pd
import numpy as np
from load_data import *
from preprocessing import *

In [None]:
# Tagged corpus that is read by load_tagged_sentences and split into train/test.
data_file = "../data/brown-universal.txt"
# Tag inventory file that is read by load_tags.
tags_file = "../data/tags-universal.txt"
# Output file the Viterbi predictions are written to (one formatted sentence per line).
predictions_file = "../results/pred-viterbi-tags.txt"

# Load Data from File

In [None]:
# Read the tagged corpus and split it into train and test sentences.
# NOTE(review): split=0.8 is presumably the training fraction — confirm in load_data.
train, test = load_tagged_sentences(data_file, split=0.8)
# Load the tag inventory. NOTE: `tags` is rebound further down from the trained
# HMM's states, so this value is only valid until that cell runs.
tags = load_tags(tags_file)

In [None]:
# Report how many sentences landed on each side of the train/test split.
train_count, test_count = len(train), len(test)
print("There are {} sentences in the training set.".format(train_count))
print("There are {} sentences in the testing set.".format(test_count))

In [None]:
# Sentences used to estimate the HMM probabilities.
# Currently this is the *entire* training set (no partitioning happens here);
# assign a subset of `train` instead to fit on fewer samples.
train_sample = train

# Train HMM with NLTK

In [None]:
! pip install -U nltk

In [None]:
import nltk

def laplace_smoothing(freq_dist, bins):
    """Estimator callback: wrap a frequency distribution in an NLTK LaplaceProbDist.

    Args:
        freq_dist: frequency distribution to smooth.
        bins: number of bins forwarded to ``nltk.LaplaceProbDist``.

    Returns:
        The ``nltk.LaplaceProbDist`` built from ``freq_dist`` and ``bins``.
    """
    smoothed = nltk.LaplaceProbDist(freq_dist, bins=bins)
    return smoothed

def trainer(data):
    """Fit a supervised HMM tagger on labelled sentences.

    Args:
        data: training sentences, passed unchanged to
            ``HiddenMarkovModelTrainer.train_supervised``.

    Returns:
        The tagger returned by ``train_supervised``, estimated with
        ``laplace_smoothing``.
    """
    # Named differently from the enclosing function to avoid shadowing it.
    hmm_trainer = nltk.tag.hmm.HiddenMarkovModelTrainer()
    return hmm_trainer.train_supervised(data, estimator=laplace_smoothing)

# Turn every training sentence into a list of (word, tag) tuples, then fit the HMM on them.
data = []
for sentence in train_sample:
    data.append([(token.get_word(), token.get_pos()) for token in sentence])

hmm_tagger = trainer(data)

In [None]:
# Persist the trained HMM so it does not have to be re-trained every session.
import pickle

# Serialize the fitted tagger to disk in binary mode.
hmm_model_path = "../results/hmm_tagger.pkl"
with open(hmm_model_path, "wb") as model_file:
    pickle.dump(hmm_tagger, model_file)

# Predict using trained HMM

In [None]:
# TODO: load the pickled tagger from ../results/hmm_tagger.pkl here instead of retraining on every run

Get parameters of trained HMM

In [None]:
# Pull the learned parameters out of the trained NLTK tagger.
# NOTE: this reads the tagger's private attributes and rebinds the global `tags`.
tags = hmm_tagger._states    # all tags (hidden states) known to the model
words = hmm_tagger._symbols  # all words (observed symbols) known to the model

# Flatten the tagger's probability distributions into plain nested dictionaries.

# transitions[prev_tag][next_tag] = P(next_tag | prev_tag)
transitions = {
    prev_state: {
        next_state: hmm_tagger._transitions[prev_state].prob(next_state)
        for next_state in tags
    }
    for prev_state in hmm_tagger._transitions
}

# emissions[tag][word] = P(word | tag)
emissions = {
    state: {word: hmm_tagger._outputs[state].prob(word) for word in words}
    for state in tags
}

# initial[tag] = P(tag starts a sentence)
initial = {state: hmm_tagger._priors.prob(state) for state in tags}

## Viterbi Algorithm

In [None]:
class Predictor:
    """Part-of-speech tagger that decodes an HMM with the Viterbi algorithm.

    Args:
        tags: iterable of hidden states (POS tags).
        transitions: nested dict, ``transitions[prev_tag][next_tag]`` is the
            probability of moving from ``prev_tag`` to ``next_tag``.
        emissions: nested dict, ``emissions[tag][word]`` is the probability of
            ``tag`` emitting ``word``.
        initial: dict, ``initial[tag]`` is the probability of ``tag`` starting
            a sentence.
        unseen_prob: fallback probability used for any entry missing from the
            tables above (e.g. out-of-vocabulary words). Defaults to ``1e-5``.
    """

    def __init__(self, tags, transitions, emissions, initial, unseen_prob=1e-5):
        self.tags = tags
        self.transitions = transitions
        self.emissions = emissions
        self.initial = initial
        self.unseen_prob = unseen_prob

    def viterbi(self, sequence):
        """Predict the Part of Speech tags for a sentence using the Viterbi algorithm.

        Scores are accumulated as sums of log-probabilities. Multiplying raw
        probabilities underflows to 0.0 on long sentences, after which every
        path ties and the decoded tags become meaningless.

        Args:
            sequence: list of words (observations) to tag.

        Returns:
            A list of ``(word, tag)`` tuples in sentence order; an empty list
            for an empty ``sequence``.
        """
        import math  # local import keeps this cell runnable on its own

        # An empty sentence has nothing to tag.
        if len(sequence) == 0:
            return []

        tags = list(self.tags)
        unseen = self.unseen_prob
        neg_inf = float('-inf')

        def log_prob(p):
            # log(0) is undefined; -inf marks an impossible event.
            return math.log(p) if p > 0 else neg_inf

        def log_emission(tag, word):
            # Missing words (OOV) or missing tag rows fall back to unseen_prob.
            return log_prob(self.emissions.get(tag, {}).get(word, unseen))

        # Transitions do not depend on the observation, so take their logs once.
        log_trans = {
            prev: {
                cur: log_prob(self.transitions.get(prev, {}).get(cur, unseen))
                for cur in tags
            }
            for prev in tags
        }

        # Initialisation: prev_scores[tag] = log P(start in tag) + log P(word_0 | tag)
        prev_scores = {
            tag: log_prob(self.initial.get(tag, unseen)) + log_emission(tag, sequence[0])
            for tag in tags
        }

        # backpointers[k][tag] = best predecessor of `tag` at position k + 1.
        # Storing one pointer per state avoids copying a full path at every step.
        backpointers = []

        # Recursion
        for t in range(1, len(sequence)):
            cur_scores = {}
            cur_back = {}
            for cur in tags:
                best_prev = None
                best_score = neg_inf
                # max / argmax over predecessors of score[t-1][i] + log a[i][j]
                for prev in tags:
                    score = prev_scores[prev] + log_trans[prev][cur]
                    # Strict '>' keeps the first tag on ties; the `is None` test
                    # guarantees a predecessor even when every score is -inf.
                    if best_prev is None or score > best_score:
                        best_prev = prev
                        best_score = score
                # The emission term is identical for every predecessor, so it is
                # added once after the max instead of inside the inner loop.
                cur_scores[cur] = best_score + log_emission(cur, sequence[t])
                cur_back[cur] = best_prev
            backpointers.append(cur_back)
            prev_scores = cur_scores

        # Termination: best final state, then follow the back-pointers to the start.
        best_tag = max(prev_scores, key=prev_scores.get)
        best_path = [best_tag]
        for back in reversed(backpointers):
            best_tag = back[best_tag]
            best_path.append(best_tag)
        best_path.reverse()

        return list(zip(sequence, best_path))

Code for saving predictions into a file

In [80]:
def format_tagged_sentence(tagged_sentence):
    """Render (word, tag) pairs as a single line: ``word (tag) word (tag) ...``.

    Args:
        tagged_sentence: iterable of ``(word, tag)`` pairs.

    Returns:
        The pairs formatted as ``"word (tag)"`` and joined by single spaces.
    """
    pieces = []
    for word, tag in tagged_sentence:
        pieces.append(f"{word} ({tag})")
    return " ".join(pieces)

In [81]:
# Build the Viterbi tagger from the extracted HMM parameters.
tagger = Predictor(tags, transitions, emissions, initial)

# Strip the gold tags: keep only the words of every test sentence.
test_sample = [[token.get_word() for token in sentence] for sentence in test]

# Tag each sentence and write it to the predictions file, one sentence per line.
with open(predictions_file, 'w') as f:
    for sentence in test_sample:
        tagged_sentence = tagger.viterbi(sentence)
        # print() appends the trailing newline for us.
        print(format_tagged_sentence(tagged_sentence), file=f)