In [1]:
import sys
import os

# Make the project's scripts folder importable. The path is resolved relative to the
# current working directory, so adjust "../scripts" if the notebook is launched from
# somewhere else. The membership check keeps the cell idempotent: re-running it does
# not append the same directory to sys.path again.
scripts_dir = os.path.abspath("../scripts")  # Adjust the path accordingly
if scripts_dir not in sys.path:
    sys.path.append(scripts_dir)

In [2]:
import pandas as pd
import numpy as np
import pickle
from load_data import *
from viterbi import *
from preprocessing import *
import matplotlib.pyplot as plt
from collections import defaultdict
from factored_hmm import FactoredHMM

In [3]:
# Input/output locations, all relative to the notebook's working directory.
# Tagged corpus passed to load_brown_data().
data_file = "../data/brown-universal.txt"
# Tag inventory passed to load_tags().
tags_file = "../data/tags-universal.txt"
# Pickle paths for saved taggers. NOTE(review): neither of the two names below is
# referenced by the cells visible in this notebook — confirm whether they are still needed.
model_file = "../results/hmm_tagger-SS.pkl"
NLTK_model = "../results/hmm_tagger-NLTK.pkl"

In [4]:
# Load the tagged corpus and split it into train/test portions. `split=0.8` is handed
# to load_brown_data; the sizes printed by the next cell (45872 vs 11468) are an 80/20 split.
train, test = load_brown_data(data_file, split=0.8)
# Load the tag inventory; a later cell enumerates it to build the tag <-> index maps.
tags = load_tags(tags_file)

In [5]:
# Sanity check: report how many sentences ended up in each split.
print(f"There are {len(train)} sentences in the training set.")
print(f"There are {len(test)} sentences in the testing set.")

There are 45872 sentences in the training set.
There are 11468 sentences in the testing set.


In [6]:
# The training split is used as loaded (the next cell slices it into a supervised and
# an unsupervised portion). Only the test split is cut down, to its first
# NUM_TEST_SENTENCES sentences. Re-running this cell is harmless: slicing an
# already-truncated list yields the same list.
# NOTE(review): the original comment here said the unsupervised data "should be a
# superset of the previous" — confirm that intent against the slices taken below.
NUM_TEST_SENTENCES = 100
test = test[:NUM_TEST_SENTENCES]

In [7]:
# Carve two consecutive windows out of the training split:
#   sup_train   -> sentences [0, 10000), used with their gold tags
#   unsup_train -> sentences [10000, 20000), used as raw word sequences for EM
# NOTE(review): the two windows are disjoint, not nested — confirm this is intended.
n_supervised = 10000
n_unsupervised = 10000
sup_train = train[:n_supervised]
unsup_train = train[n_supervised:n_supervised + n_unsupervised]

In [8]:
# Flatten the whole training split into one list of word tokens, in corpus order and
# with repeats kept. Quote normalisation ('``' / "''" -> '"') was tried and is
# deliberately left disabled, so words are stored exactly as get_word() returns them.
words = [token.get_word() for sentence in train for token in sentence]

In [9]:
# Forward map: tag -> position of that tag in `tags`.
tag_to_idx = dict((tag, position) for position, tag in enumerate(tags))
# Reverse map: position -> tag, derived from the forward map.
idx_to_tag = dict((position, tag) for tag, position in tag_to_idx.items())

In [10]:
# Convert string tags to integers
def _encode_tagged_sentences(sentences, tag_index):
    """Convert token sentences into lists of (word, integer-tag) pairs.

    Args:
        sentences: iterable of sentences; each sentence is an iterable of tokens
            exposing get_word() and get_pos().
        tag_index: mapping from a token's POS tag to its integer index.

    Returns:
        A list holding, for every input sentence, a list of
        (word, integer tag index) tuples in token order.

    Raises:
        KeyError: if a token's POS tag is not a key of tag_index.
    """
    return [
        [(token.get_word(), tag_index[token.get_pos()]) for token in sentence]
        for sentence in sentences
    ]


# Convert string tags to integers. Both splits go through the same helper so the
# encoding cannot drift between training and test data.
int_training_data = _encode_tagged_sentences(sup_train, tag_to_idx)
int_test_data = _encode_tagged_sentences(test, tag_to_idx)

In [11]:
# Strip the tags from the unsupervised portion: each sentence becomes a plain list of
# its words, which is what the EM training cell feeds to the model.
sequences = [[token.get_word() for token in sentence] for sentence in unsup_train]

In [12]:
# Model dimensions for the factored HMM.
states_per_tag = 3  # hidden states allotted to each POS tag
num_pos_tags = len(tag_to_idx)  # one entry per tag in the tag inventory

# Construct the (untrained) model. `words` is the flat token list built from the
# training split. Gotcha: a later cell rebinds `words` to hmm.vocab, so re-running
# this cell after that one passes a different object.
hmm = FactoredHMM(num_pos_tags, states_per_tag, words)

In [13]:
# Seed the model from the integer-tagged supervised sentences (built from sup_train).
# What exactly gets initialised is defined inside FactoredHMM and is not visible here.
hmm.initialize_from_tagged_data(int_training_data)

In [14]:
# Run EM over the unlabeled word sequences one mini-batch at a time.
#
# range(0, num_samples, batch_size) yields the start index of every batch, including
# the final one (which may be shorter than batch_size), so no sentences are left
# untrained. When `sequences` is empty the loop body never runs and the model is
# left untouched.
#
# Note: `hmm` is rebound on every batch, so re-running this cell continues training
# from the current parameters rather than starting over.
num_samples = len(sequences)
batch_size = 100
batch_logprobs = []  # one entry per batch: whatever train_em returns as `logprobs`
for batch_start in range(0, num_samples, batch_size):
    batch_end = min(batch_start + batch_size, num_samples)
    # train_em hands back the model together with its log-probabilities; keeping the
    # returned model carries the updated parameters into the next batch.
    (hmm, logprobs) = hmm.train_em(sequences[batch_start:batch_end], max_iterations=100)
    batch_logprobs.append(logprobs)

iteration 0 logprob -375.31145158955746
iteration 1 logprob -162.9105377276921
iteration 2 logprob -158.39508547507322
iteration 3 logprob -157.3659213761136
iteration 4 logprob -157.04780893407343
iteration 5 logprob -156.9294278348261
iteration 6 logprob -156.87896333734219
iteration 7 logprob -156.8550330254664
iteration 8 logprob -156.84266060557786
iteration 9 logprob -156.83578858614396
iteration 10 logprob -156.83173446860732
iteration 11 logprob -156.82921693244157
iteration 12 logprob -156.82758329767347
iteration 13 logprob -156.82648219088213
iteration 14 logprob -156.82571513026403
iteration 0 logprob -356.7431906496996
iteration 1 logprob -181.90721328354203
iteration 2 logprob -179.96846393447635
iteration 3 logprob -179.5411789075788
iteration 4 logprob -179.4106203603169
iteration 5 logprob -179.36223601698995
iteration 6 logprob -179.34164467149765
iteration 7 logprob -179.33188751289956
iteration 8 logprob -179.3268447265735
iteration 9 logprob -179.3240443549587
iter

KeyboardInterrupt: 

In [15]:
# Pull the trained parameters off the model and re-express them as nested dicts keyed
# by state / word, the form handed to Predictor in the next cell.
tags_bw = hmm.states  # the model's states; their enumeration order indexes the arrays below
# Gotcha: this rebinds `words`, which earlier held the flat token list of the training
# split; from here on it refers to the model's vocabulary.
words = hmm.vocab

# transitions[prev_state][next_state] = transition_probs[prev position, next position]
transition_probs = hmm.transition_probs
transitions = {
    src_state: {
        dst_state: transition_probs[src_pos, dst_pos]
        for dst_pos, dst_state in enumerate(tags_bw)
    }
    for src_pos, src_state in enumerate(tags_bw)
}

# emissions[state][word] = emission_probs[state position, word position]
emission_probs = hmm.emission_probs
emissions = {
    hidden_state: {
        vocab_word: emission_probs[state_pos, word_pos]
        for word_pos, vocab_word in enumerate(words)
    }
    for state_pos, hidden_state in enumerate(tags_bw)
}

# initial[state] = initial_probs[state position]
initial_probs = hmm.initial_probs
initial = {
    hidden_state: initial_probs[state_pos]
    for state_pos, hidden_state in enumerate(tags_bw)
}

In [16]:
# Build the decoder from the dict-form parameters extracted above: the model's states
# plus the transition, emission and initial probability tables.
tagger = Predictor(tags_bw, transitions, emissions, initial)

In [17]:
# Decode the supervised sentences only. The next cell pairs these predictions with
# int_training_data, which was built from sup_train, so the two lists must be
# parallel (same sentences, same order). Decoding all of `train` would produce a
# longer, misaligned list and spend time on sentences that are never compared.
# NOTE(review): sentences are passed as token objects, not plain word lists —
# confirm that Predictor.viterbi extracts the words itself.
predictions = [tagger.viterbi(sequence) for sequence in sup_train]

In [18]:
# Refresh the model's state -> POS-tag mapping from the gold-tagged supervised
# sentences and the Viterbi predictions. The two arguments are presumably matched up
# sentence by sentence — verify against FactoredHMM.update_state_to_tag_mapping.
hmm.update_state_to_tag_mapping(int_training_data, predictions)

In [19]:
# Persist the trained model. No path is passed, so the destination is whatever
# FactoredHMM.save_hmm chooses by default (not visible in this notebook).
hmm.save_hmm()


In [20]:

# # use the model for prediction
# new_sentence = ["The", "cat", "jumps", "over", "the", "fence", "."]
# predicted_tag_indices = hmm.decode_to_pos_tags(new_sentence)

# # Convert numeric tags back to readable format
# predicted_tags = [idx_to_tag[idx] for idx in predicted_tag_indices]

# # Print the results
# print("\nPrediction for new sentence:")
# for word, tag in zip(new_sentence, predicted_tags):
#     print(f"{word}: {tag}")

# # print the state-to-tag mapping
# print("\nState-to-tag mapping:")
# for state, tag_idx in sorted(hmm.state_to_tag_mapping.items()):
#     print(f"State {state} maps to {idx_to_tag[tag_idx]}")

