In [None]:
import sys
import os

# Make the helper modules in ../scripts importable. The path is resolved
# relative to the current working directory, so adjust it if the notebook moves.
# The membership guard keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
scripts_dir = os.path.abspath("../scripts")  # Adjust the path accordingly
if scripts_dir not in sys.path:
    sys.path.append(scripts_dir)

In [None]:
import pandas as pd
import numpy as np
from load_data import *
import pickle

In [None]:
# Corpus file; passed to load_brown_data in the evaluation section.
data_file = "../data/brown-universal.txt"
# Tag-set file. NOTE(review): not referenced anywhere in the visible notebook —
# confirm it is still needed.
tags_file = "../data/tags-universal.txt"
# Pickled HMM taggers that are read back with pickle.load further down.
NLTK_model = "../results/hmm_tagger-NLTK.pkl"
BW_model = "../results/hmm_tagger-BW.pkl"

# Viterbi Algorithm

In [None]:
class Predictor:
    """Part-of-speech tagger that decodes a Hidden Markov Model with Viterbi.

    Parameters
    ----------
    tags : iterable
        The hidden states (POS tags). Iteration order is used to break ties.
    transitions : mapping
        ``transitions[prev_tag]`` is a mapping from next tag to probability.
    emissions : mapping
        ``emissions[tag]`` is a mapping from word to probability.
    initial : mapping
        ``initial[tag]`` is the probability of starting a sentence in ``tag``.
    unknown_prob : float, optional
        Probability substituted for a word or transition that is missing from
        the corresponding inner mapping (out-of-vocabulary smoothing).
        Defaults to ``1e-5``, the value that was previously hard-coded.
    """

    def __init__(self, tags, transitions, emissions, initial, unknown_prob=1e-5):
        self.tags = tags
        self.transitions = transitions
        self.emissions = emissions
        self.initial = initial
        self.unknown_prob = unknown_prob

    def viterbi(self, sequence):
        """
        Predict the Part of Speech tags for a given sentence using the Viterbi algorithm.

        The recursion runs in log space: multiplying raw probabilities
        underflows to 0.0 on long sentences (or sentences with several unknown
        words), after which every state ties and the decoded path is arbitrary.

        Parameters
        ----------
        sequence : sequence of observations (words); must support ``len`` and indexing.

        Returns
        -------
        list
            The most likely tag for each observation, in order. An empty
            ``sequence`` yields an empty list.

        Raises
        ------
        KeyError
            If a tag is missing as an outer key of ``transitions``,
            ``emissions`` or ``initial``.
        """
        # Local import so the class does not depend on another notebook cell.
        import math

        if len(sequence) == 0:
            return []

        states = list(self.tags)
        transitions = self.transitions
        emissions = self.emissions
        initial = self.initial
        unknown = self.unknown_prob
        neg_inf = float("-inf")

        def log(prob):
            # math.log(0) raises; an impossible event is represented by -inf.
            return math.log(prob) if prob > 0 else neg_inf

        # scores[j] = log-probability of the best path that accounts for the
        # observations seen so far and ends in state j (the "max").
        scores = {
            state: log(initial[state]) + log(emissions[state].get(sequence[0], unknown))
            for state in states
        }
        # backpointers[t - 1][j] = best predecessor of state j at step t (the
        # "argmax"). Storing one pointer per cell instead of a full path copy
        # keeps memory linear in the sentence length.
        backpointers = []

        if len(sequence) > 1:
            # Transition log-probabilities do not depend on the time step, so
            # compute them once per sentence (with smoothing for missing ones).
            log_trans = {
                prev: {cur: log(transitions[prev].get(cur, unknown)) for cur in states}
                for prev in states
            }

        # recursion
        for t in range(1, len(sequence)):
            observation = sequence[t]
            step_scores = {}
            step_pointers = {}

            for cur_state in states:
                best_prev = None
                best_score = neg_inf
                # max / argmax of scores[i] + log_trans[i][j] over all states i
                for prev_state in states:
                    score = scores[prev_state] + log_trans[prev_state][cur_state]
                    # "is None" keeps the first state as predecessor when every
                    # candidate is impossible (-inf), so a pointer always exists.
                    if best_prev is None or score > best_score:
                        best_prev = prev_state
                        best_score = score

                # The emission term is the same for every predecessor, so it is
                # added after the max rather than inside the inner loop.
                step_scores[cur_state] = best_score + log(
                    emissions[cur_state].get(observation, unknown)
                )
                step_pointers[cur_state] = best_prev

            scores = step_scores
            backpointers.append(step_pointers)

        # termination + backtracking
        best_state = max(scores, key=scores.get)
        best_path = [best_state]
        for step_pointers in reversed(backpointers):
            best_state = step_pointers[best_state]
            best_path.append(best_state)
        best_path.reverse()

        return best_path

# Make Predictions

With the NLTK-trained HMM

In [None]:
# Load the pickled NLTK-trained tagger from the path in NLTK_model.
# NOTE: pickle.load can execute arbitrary code while unpickling; only open
# model files that you produced yourself or otherwise trust.
with open(NLTK_model, 'rb') as pickle_file:
    hmm_tagger_NLTK = pickle.load(pickle_file)

In [None]:
# Pull the trained parameters out of the NLTK tagger's private attributes.
tags = hmm_tagger_NLTK._states  # all possible tags
words = hmm_tagger_NLTK._symbols  # all possible words

# Flatten the probability-distribution objects into nested dicts of plain numbers.

# transitions[prev_tag][next_tag]; the outer keys are the conditions stored in the model.
transitions = {
    source: {target: hmm_tagger_NLTK._transitions[source].prob(target) for target in tags}
    for source in hmm_tagger_NLTK._transitions
}

# emissions[tag][word]
emissions = {
    tag: {token: hmm_tagger_NLTK._outputs[tag].prob(token) for token in words}
    for tag in tags
}

# initial[tag]
initial = {tag: hmm_tagger_NLTK._priors.prob(tag) for tag in tags}

In [None]:
# Wrap the parameters extracted from the NLTK model in the Viterbi decoder.
NLTK_tagger = Predictor(
    tags=tags,
    transitions=transitions,
    emissions=emissions,
    initial=initial,
)

In [None]:
# One-row table with the initial probability of every tag (one column per tag).
initial_df = pd.DataFrame([initial], columns=tags)
# Bare expression on the last line so the notebook renders the table.
initial_df

In [None]:
# from_dict (default orient) turns the outer keys (tags) into columns and the
# inner keys (words) into the index.
emissions_df = pd.DataFrame.from_dict(emissions)
# Sanity check: column 0 is the emission distribution of the first tag over
# the vocabulary, so the printed sum is expected to be ~1 if it is normalised.
print(sum(emissions_df.iloc[:, 0]))

In [None]:
# from_dict (default orient) turns the outer keys (previous tags) into columns
# and the inner keys (next tags) into the index.
transitions_df = pd.DataFrame.from_dict(transitions)
# Sanity check: column 0 is the outgoing transition distribution of the first
# previous tag, so the printed sum is expected to be ~1 if it is normalised.
print(sum(transitions_df.iloc[:, 0]))

With the HMM trained by our own Baum-Welch implementation

In [None]:
# Load the pickled Baum-Welch model from the path in BW_model.
# NOTE: pickle.load can execute arbitrary code while unpickling; only open
# model files that you produced yourself or otherwise trust.
with open(BW_model, 'rb') as pickle_file:
    hmm_tagger_BW = pickle.load(pickle_file)

In [None]:
# Pull the trained parameters out of the Baum-Welch model dictionary.
tags = hmm_tagger_BW["states"]  # all possible tags
words = hmm_tagger_BW["vocab"]  # all possible words

# Convert the index-addressed probability tables into nested dicts keyed by
# tag / word, using the position of each tag and word as the table index.

# transitions[prev_tag][next_tag] <- transition_probs[prev_index, next_index]
transition_probs = hmm_tagger_BW["transition_probs"]
transitions = {
    source: {target: transition_probs[row, col] for col, target in enumerate(tags)}
    for row, source in enumerate(tags)
}

# emissions[tag][word] <- emission_probs[tag_index, word_index]
emission_probs = hmm_tagger_BW["emission_probs"]
emissions = {
    tag: {token: emission_probs[row, col] for col, token in enumerate(words)}
    for row, tag in enumerate(tags)
}

# initial[tag] <- initial_probs[tag_index]
initial_probs = hmm_tagger_BW["initial_probs"]
initial = {tag: initial_probs[row] for row, tag in enumerate(tags)}

In [None]:
# Wrap the parameters extracted from the Baum-Welch model in the Viterbi decoder.
BW_tagger = Predictor(
    tags=tags,
    transitions=transitions,
    emissions=emissions,
    initial=initial,
)

In [None]:
# One-row table with the initial probability of every tag (one column per tag).
initial_df = pd.DataFrame([initial], columns=tags)
# Bare expression on the last line so the notebook renders the table.
initial_df

In [None]:
# from_dict (default orient) turns the outer keys (tags) into columns and the
# inner keys (words) into the index.
emissions_df = pd.DataFrame.from_dict(emissions)
# Sanity check: column 0 is the emission distribution of the first tag over
# the vocabulary, so the printed sum is expected to be ~1 if it is normalised.
print(sum(emissions_df.iloc[:, 0]))

In [None]:
# from_dict (default orient) turns the outer keys (previous tags) into columns
# and the inner keys (next tags) into the index.
transitions_df = pd.DataFrame.from_dict(transitions)
# Sanity check: column 0 is the outgoing transition distribution of the first
# previous tag, so the printed sum is expected to be ~1 if it is normalised.
# (Row 0, i.e. `.iloc[0]`, would instead add up P(first tag | prev) over all
# previous tags, which is not a probability distribution.)
print(sum(transitions_df.iloc[:, 0]))

# Evaluate

Define the accuracy metric used to compare predicted tags with the gold tags

In [None]:
def compute_accuracy(actuals, targets):
    """Compute the token-level accuracy of predicted tags against gold tags.

    Parameters
    ----------
    actuals : list of list
        Predicted tags, one inner list per sentence,
        e.g. ``[['VERB', 'NOUN'], ['DET']]``.
    targets : list of list
        Gold tags from the test/validation set, in the same layout.

    Returns
    -------
    float or int
        Fraction of tags that match, in ``[0, 1]``.
        ``-1`` if the number of sentences differs, or if any pair of sentences
        has a different number of tags.
        ``0.0`` if there are no tags to compare at all (avoids dividing by zero).
    """
    if len(actuals) != len(targets):
        return -1  # the number of sentences must match

    correct_count = 0
    total_tags = 0
    for actual_tags, target_tags in zip(actuals, targets):
        if len(actual_tags) != len(target_tags):
            return -1  # every sentence must have one prediction per gold tag
        total_tags += len(actual_tags)
        correct_count += sum(
            actual_value == target_value
            for actual_value, target_value in zip(actual_tags, target_tags)
        )

    if total_tags == 0:
        return 0.0

    return correct_count / total_tags

Data for evaluation

In [None]:
# Split the corpus into train and test parts.
# NOTE(review): presumably split=0.8 is the training fraction — confirm in load_data.
train, test = load_brown_data(data_file, split=0.8)

test_sample = test # can split test to test on a smaller sample
# Replace every token by its word so each sentence becomes a plain list of
# observations for Predictor.viterbi.
# NOTE(review): the evaluation cells build `targets` from the full `test`, so
# if test_sample is shortened above, compute_accuracy will see a different
# number of sentences and return -1.
test_sample = [[token.get_word() for token in sentence] for sentence in test_sample]

Evaluate the NLTK-trained model

In [None]:
# Decode every sentence of the evaluation sample with the NLTK-trained tagger.
predictions = list(map(NLTK_tagger.viterbi, test_sample))
# Gold tags, read from the full test split (one list of tags per sentence).
targets = [[tok.get_pos() for tok in sent] for sent in test]

In [None]:
# Token-level accuracy of the NLTK-trained tagger; compute_accuracy returns -1
# when predictions and targets do not have matching lengths.
print(compute_accuracy(predictions, targets))

Evaluate the HMM trained with our own Baum-Welch implementation

In [None]:
# Decode every sentence of the evaluation sample with the Baum-Welch tagger.
predictions = list(map(BW_tagger.viterbi, test_sample))
# Gold tags, read from the full test split (one list of tags per sentence).
targets = [[tok.get_pos() for tok in sent] for sent in test]

In [None]:
# Token-level accuracy of the Baum-Welch tagger; compute_accuracy returns -1
# when predictions and targets do not have matching lengths.
print(compute_accuracy(predictions, targets))