This file should look very similar to notebooks 1 and 2

Main idea: train an HMM with NLTK to obtain good initial probabilities, then fine-tune those probabilities on the data with EM (Baum-Welch).

In [None]:
import sys
import os

# Put the sibling "scripts" folder on the import path so its modules can be imported.
scripts_dir = os.path.abspath("../scripts")  # adjust if the notebook is moved
sys.path.append(scripts_dir)

In [None]:
# Notebook shell escape: install NLTK, upgrading it if already present (-U).
! pip install -U nltk

In [None]:
import pandas as pd
import numpy as np
import pickle
from load_data import *
from preprocessing import *
from baum_welch import *
import nltk
import matplotlib.pyplot as plt

In [None]:
# Input files: the corpus and the list of tags.
data_file, tags_file = (
    "../data/brown-universal.txt",
    "../data/tags-universal.txt",
)
# Pickled model files: model_file is the "SS" tagger path, NLTK_model the NLTK-trained tagger.
model_file, NLTK_model = (
    "../results/hmm_tagger-SS.pkl",
    "../results/hmm_tagger-NLTK.pkl",
)

# Load Data From File

In [None]:
# Load the corpus as a (train, test) pair, then the tag inventory.
# split=0.8 presumably sends 80% of the sentences to train — confirm in load_data.
brown_split = load_brown_data(data_file, split=0.8)
train, test = brown_split
tags = load_tags(tags_file)

In [None]:
# Report how many sentences ended up in each split.
print(f"There are {len(train)} sentences in the training set.")
print(f"There are {len(test)} sentences in the testing set.")

In [None]:
# Supervised-HMM partition: the first 10,000 training sentences.
sup_train_sample = train[0:10_000]

In [None]:
# Unsupervised-HMM (EM) partition: training sentences 10,000 up to (not including) 20,000.
# NOTE(review): this slice is disjoint from the supervised partition (the first 10,000
# sentences), although it was described as "a superset of the previous" — confirm intent.
unsup_train_sample = train[10_000:20_000]

In [None]:
# Turn the unsupervised sample into plain word lists and build the vocabulary.
#
# unsup_train_sentences: one list of words per sentence, in corpus order.
# words: every DISTINCT word of those sentences, in first-seen order. Keeping it
#        duplicate-free gives each word exactly one index (and one emission-matrix
#        column) instead of one per token occurrence.
#
# The two-character quote tokens `` and '' are both normalised to a single '"'.
quote_tokens = ("``", "''")

unsup_train_sentences = []
for sentence in unsup_train_sample:
    raw_words = (token.get_word() for token in sentence)
    unsup_train_sentences.append(
        ['"' if raw in quote_tokens else raw for raw in raw_words]
    )

# dict.fromkeys drops repeats while preserving insertion order.
words = list(
    dict.fromkeys(word for sent in unsup_train_sentences for word in sent)
)

In [None]:
# Map every word and every tag to its position in `words` / `tags`.
# If an entry occurs more than once, the index of its last occurrence is kept.
words_lookup = dict(zip(words, range(len(words))))
tags_lookup = dict(zip(tags, range(len(tags))))

# Semi-Supervised Training

## 1 Load the HMM trained with NLTK

In [None]:
# Restore the pickled NLTK HMM tagger from disk.
# NOTE: unpickling can execute arbitrary code — only load model files you trust.
with open(NLTK_model, mode='rb') as model_fh:
    hmm_tagger_NLTK = pickle.load(model_fh)

### Extract Features of NLTK-trained model

In [None]:
# Extract the NLTK model's parameters as dense numpy arrays, indexed through
# tags_lookup (tag -> row/column) and words_lookup (word -> column).
tags_nltk = hmm_tagger_NLTK._states  # tags known to the NLTK model
words_nltk = hmm_tagger_NLTK._symbols  # words known to the NLTK model
# Membership is tested once per (tag, word) pair below; a set keeps each test O(1).
known_words_nltk = set(words_nltk)

# Transition matrix: transitions_nltk[i][j] = P(tag j follows tag i).
transitions_nltk = np.zeros((len(tags), len(tags)))
for prev_state in hmm_tagger_NLTK._transitions:
    i = tags_lookup[prev_state]
    transition_dist = hmm_tagger_NLTK._transitions[prev_state]
    for next_state in tags_nltk:
        j = tags_lookup[next_state]
        transitions_nltk[i][j] = transition_dist.prob(next_state)


# Emission matrix: emissions_nltk[i][k] = P(word k | tag i).
emissions_nltk = np.zeros((len(tags), len(words)))
for state in tags_nltk:
    i = tags_lookup[state]
    output_dist = hmm_tagger_NLTK._outputs[state]
    # Probability of one randomly generated emission of this tag; used as the
    # stand-in probability for words the NLTK model has never seen.
    # generate() is called here, so this value can change from run to run.
    rand_emission_prob = output_dist.prob(output_dist.generate())
    # words_lookup has one entry per distinct word, so each column is written once.
    for word, k in words_lookup.items():
        if word in known_words_nltk:
            emissions_nltk[i][k] = output_dist.prob(word)
        else:
            emissions_nltk[i][k] = rand_emission_prob

# Normalize emission probabilities so every row sums to 1, and keep the result in
# emissions_nltk: that is the array the later cells verify and hand to Baum-Welch.
row_sums = emissions_nltk.sum(axis=1)
# A tag absent from the NLTK model leaves an all-zero row; divide it by 1, not 0.
safe_row_sums = np.where(row_sums > 0, row_sums, 1.0)
emissions_nltk = emissions_nltk / safe_row_sums[:, np.newaxis]
emissions = emissions_nltk  # alias kept for code that used the old name

# Initial-state distribution: initial_nltk[i] = P(sentence starts with tag i).
initial_nltk = np.zeros(len(tags))
for state in tags_nltk:
    i = tags_lookup[state]
    initial_nltk[i] = hmm_tagger_NLTK._priors.prob(state)

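Verify that the initial probabilities are sound: every row of the transition and emission matrices, and the initial distribution as a whole, should sum to 1.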

In [None]:
# Sanity check: compute and print the per-row totals of both matrices and the
# total of the initial distribution.
trans_row_sums = transitions_nltk.sum(axis=1)
emissions_row_sums = emissions_nltk.sum(axis=1)
initials_sum = sum(initial_nltk)
for total in (trans_row_sums, emissions_row_sums, initials_sum):
    print(total)

## 2 Train HMM with Baum-Welch

In [None]:
# Create the Baum-Welch tagger for the batch-size-500 run, built from the tag list
# and the word list collected from the unsupervised sample.
bw_tagger_500 = BaumWelch(tags, words)

In [None]:
# Seed the tagger with the transition, emission and initial probabilities taken
# from the NLTK model.
# NOTE(review): log=False presumably means the arrays are plain (not log-space)
# probabilities — confirm against BaumWelch.initialize_probabilities.
bw_tagger_500.initialize_probabilities(transitions_nltk, emissions_nltk, initial_nltk, log=False)

In [None]:
# Fine-tune the tagger with EM, one batch of sentences at a time.
#
# train_em returns a (tagger, logprobs) pair; the returned tagger is carried into
# the next batch and each batch's logprobs are collected for plotting.
num_samples = len(unsup_train_sentences)
batch_size = 500
batch_logprobs = []
# range() yields every batch start below num_samples, so the final batch
# (possibly shorter than batch_size) is trained on too instead of being skipped.
for batch_start in range(0, num_samples, batch_size):
    batch_end = min(batch_start + batch_size, num_samples)
    bw_tagger_500, logprobs = bw_tagger_500.train_em(
        unsup_train_sentences[batch_start:batch_end], max_iterations=20
    )
    batch_logprobs.append(logprobs)

In [None]:
# Write the batch-size-500 tagger to the results folder.
# (save_hmm presumably pickles the model, given the .pkl filename — confirm.)
bw_tagger_500.save_hmm(filename="../results/hmm_tagger-BW-500.pkl")

In [None]:
# Draw one curve per batch: the negated log-probability values recorded for it.
plt.figure(figsize=(10, 6))
for batch_no, batch_trace in enumerate(batch_logprobs, start=1):
    plt.plot([-value for value in batch_trace], label=f'Batch {batch_no}')

# Axis labels and title.
plt.xlabel('Iteration')
plt.ylabel('Negative Log Probability')
plt.title('Log Probability by Iteration by Batch')

# Legend tells the batches apart; the dashed grid helps reading values off.
plt.legend()
plt.grid(True, linestyle='--', alpha=0.7)

# Fit the layout and render the figure.
plt.tight_layout()
plt.show()

In [None]:
# Create a second, separate Baum-Welch tagger (same tags and words) for another
# training run with a different batch size.
bw_tagger_1000 = BaumWelch(tags, words)

In [None]:
# Seed this tagger with the same NLTK-derived transition, emission and initial
# probabilities as the first run.
# NOTE(review): log=False presumably means the arrays are plain (not log-space)
# probabilities — confirm against BaumWelch.initialize_probabilities.
bw_tagger_1000.initialize_probabilities(transitions_nltk, emissions_nltk, initial_nltk, log=False)

In [None]:
# Fine-tune the second tagger with EM, one batch of sentences at a time.
#
# train_em returns a (tagger, logprobs) pair; the returned tagger is carried into
# the next batch and each batch's logprobs are collected for plotting.
num_samples = len(unsup_train_sentences)
# 1000 sentences per batch, matching the tagger's name (bw_tagger_1000).
# NOTE(review): this was 100, which looks like a typo — confirm.
batch_size = 1000
batch_logprobs = []
# range() yields every batch start below num_samples, so the final batch
# (possibly shorter than batch_size) is trained on too instead of being skipped.
for batch_start in range(0, num_samples, batch_size):
    batch_end = min(batch_start + batch_size, num_samples)
    bw_tagger_1000, logprobs = bw_tagger_1000.train_em(
        unsup_train_sentences[batch_start:batch_end], max_iterations=20
    )
    batch_logprobs.append(logprobs)

In [None]:
# Write this tagger to its own file so it does not overwrite the model saved
# as hmm_tagger-BW-500.pkl by the batch-size-500 run.
bw_tagger_1000.save_hmm(filename="../results/hmm_tagger-BW-1000.pkl")

In [None]:
# Draw one curve per batch: the negated log-probability values recorded for it.
plt.figure(figsize=(10, 6))
for batch_no, batch_trace in enumerate(batch_logprobs, start=1):
    plt.plot([-value for value in batch_trace], label=f'Batch {batch_no}')

# Axis labels and title.
plt.xlabel('Iteration')
plt.ylabel('Negative Log Probability')
plt.title('Log Probability by Iteration by Batch')

# Legend tells the batches apart; the dashed grid helps reading values off.
plt.legend()
plt.grid(True, linestyle='--', alpha=0.7)

# Fit the layout and render the figure.
plt.tight_layout()
plt.show()