In [None]:
import os
import sys

import numpy as np
import pandas as pd

# Add the scripts folder to the Python path so modules stored there can be imported.
# The membership check keeps the cell idempotent: re-running it does not append
# the same directory to sys.path again.
scripts_dir = os.path.abspath("../scripts")  # Adjust the path accordingly
if scripts_dir not in sys.path:
    sys.path.append(scripts_dir)

In [None]:
from load_preprocessed_data import load_ladino_pos

# Location of the Ladino POS file under data/weak, relative to this notebook.
weak_dataset_file_path = '../data/weak/ladino-pos.txt'
# load_ladino_pos returns two objects. Judging by how later cells use them:
#   weak_tags      - iterable of sentences whose items expose get_word() / get_pos()
#   weak_tags_dict - mapping of word -> {tag: count}
# NOTE(review): shapes inferred from usage only — confirm against load_preprocessed_data.
weak_tags, weak_tags_dict = load_ladino_pos(weak_dataset_file_path) # import ladino tokens into custom data definition

In [None]:
# check it worked: show the size and a small sample instead of dumping the
# whole lexicon into the cell output
PREVIEW_SIZE = 10
print(f"{len(weak_tags_dict)} entries loaded; showing up to {PREVIEW_SIZE}:")
print(dict(list(weak_tags_dict.items())[:PREVIEW_SIZE]))

In [None]:
from datasets import load_dataset

labeled_data = weak_tags

# Split boundaries, as row indices into the 'train' split of the article dataset.
TRAIN_END = 10300
VALIDATE_END = 10500
TEST_END = 10685

unlabeled_data = load_dataset("collectivat/salom-ladino-articles")['train']

# Slices are half-open (the end index is excluded), so each split must start
# exactly where the previous one ended. Starting at END + 1 would silently
# drop rows 10300 and 10500 from every split.
train_texts = unlabeled_data[:TRAIN_END]['text']
validate_texts = unlabeled_data[TRAIN_END:VALIDATE_END]['text']
test_texts = unlabeled_data[VALIDATE_END:TEST_END]['text']

# Whitespace-tokenise every text into a list of word strings.
train = [text.split() for text in train_texts]
validate = [text.split() for text in validate_texts]
test = [text.split() for text in test_texts]

## Count tags and tag transitions

In [None]:
print_results = False


def create_count_dictionaries(data):
    """
    Count tags and tag-to-tag transitions over tagged sentences.

    The counts are the raw material for two probability tables:
        P(Tag)                   <- tag_counts
        P(Tag_{i} | Tag_{i-1})   <- tag_transition_counts
    Emission counts, P(Word | Tag), are not collected here.

    Args:
        data: iterable of sentences; each sentence is an iterable of items
            exposing a ``get_pos()`` method that returns the item's tag.

    Returns:
        (tag_counts, tag_transition_counts) where
        tag_counts maps tag -> number of occurrences, and
        tag_transition_counts maps (previous_tag, tag) -> number of occurrences.
        Every sentence contributes a transition out of "<s>" at its start and
        a transition into "<s/>" at its end; an empty sentence contributes the
        single transition ("<s>", "<s/>").
    """
    tag_counts = {} # P(Tag)
    tag_transition_counts = {} # P(Tag_{i} | Tag_{i-1})
    # go through each sentence in the data
    for sentence in data:
        prev_tag = "<s>" # all sentences start with delimiter
        for word in sentence:
            tag = word.get_pos()

            # P(Tag)
            tag_counts[tag] = tag_counts.get(tag, 0) + 1

            # P(Tag_{i} | Tag_{i-1}): key is (previous tag, current tag)
            tag_transition = (prev_tag, tag)
            tag_transition_counts[tag_transition] = tag_transition_counts.get(tag_transition, 0) + 1
            prev_tag = tag

        # P(Tag_{i} | Tag_{i-1}) only for the end of the sentence
        tag_transition = (prev_tag, "<s/>") # all sentences end with delimiter
        tag_transition_counts[tag_transition] = tag_transition_counts.get(tag_transition, 0) + 1
    return tag_counts, tag_transition_counts

tag_counts, tag_transition_counts = create_count_dictionaries(weak_tags)

if print_results:
    # Sort into separate names. The count dictionaries themselves must stay
    # dicts because later cells look values up with .get(); rebinding them to
    # sorted lists of tuples would break those cells.
    tag_counts_sorted = sorted(tag_counts.items(), key=lambda item: item[1], reverse=True)
    tag_transition_counts_sorted = sorted(tag_transition_counts.items(), key=lambda item: item[1], reverse=True)
    print(tag_counts_sorted)
    print(tag_transition_counts_sorted)


## Make actual probability tables out of counts

Create a matrix for Tag -> Tag transitions

In [None]:
print_results = True

# Tag inventory (row and column labels): every tag that appears in the weak lexicon.
tags = sorted({tag for counter in weak_tags_dict.values() for tag in counter}) # columns
tag_to_index = {tag: j for j, tag in enumerate(tags)}

# Size the matrix by the same inventory that indexes it. Sizing it by
# len(tag_counts) breaks as soon as the lexicon and the tagged sentences
# disagree on the set of tags.
tags_matrix = np.zeros((len(tags), len(tags)), dtype='float32')

# tags_matrix[i, j] = count(tag_i -> tag_j) / count(tag_i)
for tag_1 in tags:
    tag_1_total = tag_counts.get(tag_1, 0)
    if not tag_1_total:
        # Tag is in the lexicon but never occurs in a tagged sentence:
        # there is nothing to normalise by, so its row stays all zeros.
        continue
    i = tag_to_index[tag_1]
    for tag_2 in tags:
        j = tag_to_index[tag_2]
        count_of_transition = tag_transition_counts.get((tag_1, tag_2), 0)
        tags_matrix[i, j] = count_of_transition / tag_1_total

# The emission table is built separately, in the Tag -> Word cell further down.


In [None]:
# Wrap the transition matrix in a DataFrame labelled by tag on both axes
# (rows: previous tag, columns: next tag) and display it.
tags_matrix_df = pd.DataFrame(data=tags_matrix, index=tags, columns=tags)
tags_matrix_df

Create a matrix for Tag -> Word probabilities

In [None]:
words = list(weak_tags_dict.keys())  # rows
# columns are "tags" defined in previous cell

# create mapping of words to a row index so that we can
# add to the correct tag/word every time we are updating the matrix
word_to_index = {word: i for i, word in enumerate(words)}

# Raw counts: emission_matrix[w, t] = count stored for tag t under word w.
emission_matrix = np.zeros((len(words), len(tags)))

for word, counter in weak_tags_dict.items():
    for tag, count in counter.items():
        emission_matrix[word_to_index[word], tag_to_index[tag]] = count

# Value written into every cell of a row whose counts sum to zero.
# due to processing of data, some rows have no counts at all; give them a
# tiny constant so they don't affect later calculations
# TODO fix data processing so this doesn't happen
EMPTY_ROW_FILL = 1e-6

# Normalise each row by its total. Rows with a zero total are skipped by
# `where` and keep the fill value from `out`, so no 0/0 division (and no
# RuntimeWarning / NaN clean-up) happens.
# NOTE(review): this normalises per word (axis=1), i.e. each row is
# P(tag | word). An HMM emission table is usually P(word | tag), which would
# normalise per tag column (axis=0). Left unchanged because the consumer
# (hmm.HMMTagger) is not visible here — confirm which one it expects.
row_totals = emission_matrix.sum(axis=1, keepdims=True)
emission_matrix = np.divide(
    emission_matrix,
    row_totals,
    out=np.full_like(emission_matrix, EMPTY_ROW_FILL),
    where=row_totals != 0,
)

In [None]:
# Wrap the emission matrix in a DataFrame (rows: words, columns: tags) and display it.
ems_matrix_df = pd.DataFrame(data=emission_matrix, index=words, columns=tags)
ems_matrix_df

Create initial probabilities matrix (the probability a sentence starts with a tag)

In [None]:
# For every tag, look up how often it directly follows the sentence-start
# delimiter '<s>' (0 when that transition was never counted).
sentence_start_counts = [tag_transition_counts.get(('<s>', tag), 0) for tag in tags]

# Turn the counts into a distribution by dividing by their total.
initial_probs = np.array(sentence_start_counts, dtype=float)
initial_probs = initial_probs / initial_probs.sum()

print(tags)
print(initial_probs)

In [None]:
# One-row DataFrame of the initial probabilities, one column per tag, for display.
initial_probs_df = pd.DataFrame(data=[initial_probs], columns=tags)
initial_probs_df

# Create HMM

In [None]:
import hmm

# Build the tagger over the tag inventory and word vocabulary defined in earlier cells.
# NOTE(review): smoothing=2.0 is passed straight through to HMMTagger; what it
# controls is defined in the hmm module — confirm before tuning.
hmm_tagger = hmm.HMMTagger(tags, words, smoothing=2.0)
# Hand the tagger the transition, emission and initial-tag tables computed above.
hmm_tagger.initialize_probabilities(tags_matrix, emission_matrix, initial_probs)

## Train HMM

In [None]:
# Run train_em on the first 500 tokenised texts of the training split only.
# NOTE(review): the 500 cap is presumably there to keep the run short — confirm.
hmm_tagger.train_em(train[:500])

In [None]:
# Decode the first tokenised test text with the tagger, then show the
# input tokens followed by whatever viterbi returned for them.
first_test_text = test[0]
result = hmm_tagger.viterbi(first_test_text)
print(first_test_text)
print(result)