In [None]:
import sys
import os

# Add the scripts folder to the Python path
sys.path.append(os.path.abspath("../scripts"))  # Adjust the path accordingly

In [None]:
import pandas as pd
import numpy as np
from preprocessing import *

In [None]:
# Load the tagged corpus as a (train, test) pair.
# NOTE(review): split=0.8 is presumably the fraction assigned to train — confirm in preprocessing.
train, test = load_tagged_sentences("../data/brown-universal.txt", split=0.8)
# Load the tag inventory; later cells sort it and use it to label the rows/columns of the tables.
tags = load_tags("../data/tags-universal.txt")

In [None]:
# Report how many sentences ended up in each partition.
for template, dataset in (
    ("There are {} sentences in the training set.", train),
    ("There are {} sentences in the testing set.", test),
):
    print(template.format(len(dataset)))

In [None]:
# Keep only the first sentences of the training set; this sample is what the
# tag and tag-transition counts (and hence the initial probabilities) are built from.
SAMPLE_SIZE = 500
train_sample = train[:SAMPLE_SIZE]

In [None]:
print_results = False  # set to True to print the sorted count tables once they are built


def create_count_dictionaries(data):
    """
    Count tags and tag transitions, the raw counts behind these probability tables:
    P(Tag)
    P(Tag_{i} | Tag_{i-1})

    Emission counts, P(Word | Tag), are NOT produced here.

    Parameters:
        data: iterable of sentences; each sentence is an iterable of items
            exposing a get_pos() method that returns the item's tag.

    Returns:
        (tag_counts, tag_transition_counts)
        tag_counts: dict mapping tag -> number of occurrences.
        tag_transition_counts: dict mapping (previous_tag, tag) -> number of
            occurrences. Every sentence contributes a transition out of the
            start delimiter "<s>" and a transition into the end delimiter
            "<s/>"; an empty sentence contributes only ("<s>", "<s/>").
    """
    tag_counts = {}  # P(Tag)
    tag_transition_counts = {}  # P(Tag_{i} | Tag_{i-1})

    for sentence in data:
        prev_tag = "<s>"  # all sentences start with delimiter
        # Single pass over the sentence: only the tag is needed, so the words are not read.
        for word in sentence:
            tag = word.get_pos()

            # P(Tag)
            tag_counts[tag] = tag_counts.get(tag, 0) + 1

            # P(Tag_{i} | Tag_{i-1}): key is (previous tag, current tag)
            tag_transition = (prev_tag, tag)
            tag_transition_counts[tag_transition] = tag_transition_counts.get(tag_transition, 0) + 1
            prev_tag = tag

        # Close the sentence: transition from the last tag into the end delimiter
        tag_transition = (prev_tag, "<s/>")
        tag_transition_counts[tag_transition] = tag_transition_counts.get(tag_transition, 0) + 1

    return tag_counts, tag_transition_counts

# Build the count tables from the training sample.
tag_counts, tag_transition_counts = create_count_dictionaries(train_sample)

if print_results:
    # Print sorted views (most frequent first) WITHOUT rebinding the names:
    # later cells call .get() on tag_counts / tag_transition_counts, so they must stay dicts.
    print(sorted(tag_counts.items(), key=lambda item: item[1], reverse=True))
    print(sorted(tag_transition_counts.items(), key=lambda item: item[1], reverse=True))


## Make actual probability tables out of counts

Create a matrix for Tag -> Tag transitions

In [None]:
tags = sorted(tags)  # fixed ordering used for both the rows and the columns
num_tags = len(tags)

tag_to_index = {tag: j for j, tag in enumerate(tags)}

# tags_matrix[i, j] = count(tags[i] -> tags[j]) / count(tags[i]),
# i.e. the estimate of P(tags[j] | tags[i]) from the counted sample.
tags_matrix = np.zeros((num_tags, num_tags), dtype='float32')

for tag_1 in tags:
    tag_1_count = tag_counts.get(tag_1, 0)
    if tag_1_count == 0:
        # Tag never seen in the counted sample: there is nothing to divide by.
        # Leave its row at zero so it receives the floor value below
        # (previously this divided by None and raised a TypeError).
        continue
    i = tag_to_index[tag_1]
    for tag_2 in tags:
        j = tag_to_index[tag_2]
        count_of_transition = tag_transition_counts.get((tag_1, tag_2), 0)
        tags_matrix[i, j] = count_of_transition / tag_1_count

# Replace exact zeros by a small floor so the logarithm stays finite,
# then store the table as log-probabilities.
tags_matrix = np.where(tags_matrix == 0.0, 1e-6, tags_matrix)
tags_matrix = np.log(tags_matrix)


In [None]:
# Undo the log first, then label both axes with the tags so the table reads as probabilities.
tags_matrix_df = pd.DataFrame(np.exp(tags_matrix), index=tags, columns=tags)
tags_matrix_df

Create a matrix for Tag -> Word probabilities

In [None]:
# tags_dict is iterated below as word -> {tag: count}
tags_dict = word_to_tag_counts(train)
words = list(tags_dict.keys())  # columns
# rows are "tags" defined in a previous cell

# create a mapping from each word to its column index so that we can
# write to the correct tag/word cell every time we are updating the matrix
word_to_index = {word: i for i, word in enumerate(words)}

# emission_matrix[t, w] starts as count(word w tagged with tag t)
emission_matrix = np.zeros((len(tags), len(words)))

for word, counter in tags_dict.items():
    column = word_to_index[word]
    for tag, count in counter.items():
        emission_matrix[tag_to_index[tag], column] = count

# Normalise every tag row into a distribution over words. A tag with no
# emissions has a row total of 0; dividing would give NaN (0/0), which the
# zero-floor below would not catch, so such rows are left at zero instead.
tag_totals = emission_matrix.sum(axis=1, keepdims=True)
emission_matrix = np.divide(
    emission_matrix,
    tag_totals,
    out=np.zeros_like(emission_matrix),
    where=tag_totals > 0,
)

# Replace exact zeros by a small floor so the logarithm stays finite,
# then store the table as log-probabilities.
emission_matrix = np.where(emission_matrix == 0.0, 1e-6, emission_matrix)
emission_matrix = np.log(emission_matrix)

In [None]:
# Undo the log first, then label rows with tags and columns with words for display.
ems_matrix_df = pd.DataFrame(np.exp(emission_matrix), index=list(tags), columns=list(words))
ems_matrix_df

Create the initial probability vector (the probability that a sentence starts with each tag)

In [None]:
# Number of sentences that open with each tag, in the same order as `tags`
# (these are the transitions counted out of the "<s>" start delimiter).
initial_probs = np.array(
    [tag_transition_counts.get(('<s>', tag), 0) for tag in tags],
    dtype=float,
)

# Turn the counts into relative frequencies.
initial_probs = initial_probs / initial_probs.sum()

# Floor exact zeros so the logarithm stays finite, then keep log-probabilities.
initial_probs = np.where(initial_probs == 0.0, 1e-6, initial_probs)
initial_probs = np.log(initial_probs)

In [None]:
# One-row table of start probabilities (log undone), one column per tag.
initial_probs_df = pd.DataFrame([np.exp(initial_probs)], columns=tags)
initial_probs_df