In [35]:
import sys
import os

# Add the scripts folder to the Python path
sys.path.append(os.path.abspath("../scripts"))  # Adjust the path accordingly

In [36]:
import numpy as np
import pandas as pd

In [37]:
from load_preprocessed_data import load_ladino_pos

weak_dataset_file_path = '../data/weak/ladino-pos.txt'
weak_tags, weak_tags_dict = load_ladino_pos(weak_dataset_file_path) # import ladino tokens into custom data definition

../data/weak/ladino-pos.txt


In [46]:
# check it worked
print(weak_tags_dict)

defaultdict(<class 'collections.Counter'>, {'¡': Counter({'PUNCT': 1}), 'Venidos': Counter({'NOUN': 1}), 'buenos': Counter({'ADJ': 1}), '!': Counter({'PUNCT': 35}), 'Kim': Counter({'PROPN': 1}), 'bive': Counter({'VERB': 1}), 'kon': Counter({'ADP': 25}), 'Ken': Counter({'PRON': 3, 'PROPN': 1}), '.': Counter({'PUNCT': 556}), 'Me': Counter({'PRON': 22, 'DET': 1}), 'yamo': Counter({'VERB': 6}), 'Ichiro': Counter({'PROPN': 1}), 'Tanaka': Counter({'PROPN': 1}), 'Eyos': Counter({'PRON': 3, 'AUX': 1}), 'me': Counter({'PRON': 11}), 'yaman': Counter({'VERB': 2}), 'Bob': Counter({'PROPN': 2}), 'Wang': Counter({'PROPN': 1}), 'Si': Counter({'INTJ': 2}), ',': Counter({'PUNCT': 27, 'NUM': 3, 'CCONJ': 1}), 'Karen': Counter({'PROPN': 1}), 'Smith': Counter({'PROPN': 1}), 'Ninguno': Counter({'ADV': 1, 'PROPN': 1}), 'no': Counter({'ADV': 53}), 'vino': Counter({'VERB': 2, 'PRON': 1}), 'Avlo': Counter({'PROPN': 3, 'VERB': 1}), 'japanez': Counter({'ADJ': 1}), 'inglez': Counter({'NOUN': 4, 'ADJ': 4}), 'i': Co

In [39]:
from datasets import load_dataset

labeled_data = weak_tags
unlabeled_data = load_dataset("collectivat/salom-ladino-articles")['train']
train = unlabeled_data[:10300]['text']
validate = unlabeled_data[10301:10500]['text']
test = unlabeled_data[10501:10685]['text']

# Prepare data

## Count tags and emissions

In [56]:
print_results = False

"""
Count tags, tag transitions, and emissions of words to create the proper probability tables:
P(Tag)
P(Tag_{i} | Tag_{i-1})
P(Word | Tag)
"""
def create_count_dictionaries(data):
    tag_counts = {} # P(Tag)
    tag_transition_counts = {} # P(Tag_{i} | Tag_{i-1})
    # emission_counts = {} # P(Word | Tag)
    # go through each sentence in the data
    for sentence in data:
        tags_sequence = [word.get_pos() for word in sentence]
        words_sequence = [word.get_word() for word in sentence]
        prev_tag = "<s>" # all sentences start with delimiter
        # go through each word and tag
        for word, tag in zip(words_sequence, tags_sequence):
            # P(Tag)
            tag_counts[tag] = tag_counts.get(tag, 0) + 1

            # P(Tag_{i} | Tag_{i-1})
            tag_transition = (prev_tag, tag) # make key to indicate transitioning from the previous tag to current
            tag_transition_counts[tag_transition] = tag_transition_counts.get(tag_transition, 0) + 1
            prev_tag = tag

            # # P(Word | Tag)
            # emission = (tag, word)
            # emission_counts[emission] = emission_counts.get(emission, 0) + 1
        
        # P(Tag_{i} | Tag_{i-1}) only for the end of the sentence
        tag_transition = (prev_tag, "<s/>") # all sentences end with delimiter
        tag_transition_counts[tag_transition] = tag_transition_counts.get(tag_transition, 0) + 1
    return tag_counts, tag_transition_counts

tag_counts, tag_transition_counts = create_count_dictionaries(weak_tags)

if print_results:
    tag_counts = sorted(tag_counts.items(), key=lambda item: item[1], reverse=True)
    tag_transition_counts = sorted(tag_transition_counts.items(), key=lambda item: item[1], reverse=True)
    # emission_counts = sorted(emission_counts.items(), key=lambda item: item[1], reverse=True)
    print(tag_counts)
    print(tag_transition_counts)
    # print(emission_counts)


## Make actual probability tables out of counts

Create a matrix for Tag -> Tag transitions

In [57]:
print_results = True

tags_matrix = np.zeros((len(tag_counts), len(tag_counts)), dtype='float32')

tags = sorted({tag for counter in weak_tags_dict.values() for tag in counter}) # columns
tag_to_index = {tag: j for j, tag in enumerate(tags)}

for tag_1 in tags:
    for tag_2 in tags:
        i = tag_to_index[tag_1]
        j = tag_to_index[tag_2]
        count_of_transition = tag_transition_counts.get((tag_1, tag_2), 0)
        tags_matrix[i, j] = count_of_transition/tag_counts.get(tag_1)

# need to create table for emission probabilities too? TODO


In [61]:
tags_matrix_df = pd.DataFrame(tags_matrix, columns = list(tags), index=list(tags))
tags_matrix_df

Unnamed: 0,ADJ,ADP,ADV,AUX,CCONJ,DET,INTJ,NOUN,NUM,PREP,PRON,PROPN,PUNCT,SCONJ,SYM,VERB
ADJ,0.043307,0.055118,0.019685,0.003937,0.027559,0.0,0.0,0.055118,0.0,0.0,0.007874,0.0,0.771654,0.007874,0.0,0.007874
ADP,0.017143,0.022857,0.0,0.011429,0.0,0.285714,0.0,0.205714,0.0,0.0,0.108571,0.285714,0.011429,0.0,0.0,0.051429
ADV,0.098039,0.006536,0.039216,0.30719,0.0,0.006536,0.0,0.0,0.0,0.0,0.124183,0.006536,0.196078,0.006536,0.0,0.20915
AUX,0.288344,0.046012,0.052147,0.015337,0.0,0.328221,0.0,0.104294,0.0,0.0,0.01227,0.027607,0.01227,0.003067,0.0,0.110429
CCONJ,0.103448,0.034483,0.0,0.0,0.0,0.62069,0.0,0.137931,0.0,0.0,0.068966,0.034483,0.0,0.0,0.0,0.0
DET,0.077434,0.004425,0.0,0.00885,0.0,0.017699,0.0,0.798673,0.011062,0.0,0.004425,0.05531,0.004425,0.0,0.0,0.017699
INTJ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
NOUN,0.113462,0.075,0.044231,0.163462,0.034615,0.001923,0.0,0.003846,0.0,0.0,0.003846,0.009615,0.501923,0.009615,0.001923,0.036538
NUM,0.0,0.0,0.0,0.222222,0.0,0.0,0.0,0.055556,0.333333,0.0,0.0,0.0,0.388889,0.0,0.0,0.0
PREP,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Create a matrix for Tag -> Word probabilities

In [59]:
words = list(weak_tags_dict.keys())  # rows
# columns are "tags" defined in previous cell

# create mapping of words and tags to an index so that we can
# add to the correct tag/word every time we are updating the matrix5
word_to_index = {word: i for i, word in enumerate(words)}

emission_matrix = np.zeros((len(words), len(tags)))

for word, counter in weak_tags_dict.items():
    for tag, count in counter.items():
        emission_matrix[word_to_index[word], tag_to_index[tag]] = count


emission_matrix = emission_matrix / emission_matrix.sum(axis=1, keepdims=True)

# due to processing of data, some rows are NaN, replace them 
# so they don't affect later calculations
# TODO fix data processing so this doesn't happen
emission_matrix = np.nan_to_num(emission_matrix, nan=1e-6)

  emission_matrix = emission_matrix / emission_matrix.sum(axis=1, keepdims=True)


In [60]:
ems_matrix_df = pd.DataFrame(emission_matrix, columns = list(tags), index=list(words))
ems_matrix_df

Unnamed: 0,ADJ,ADP,ADV,AUX,CCONJ,DET,INTJ,NOUN,NUM,PREP,PRON,PROPN,PUNCT,SCONJ,SYM,VERB
¡,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000
Venidos,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
buenos,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
!,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000
Kim,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Kaminos,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000
leche,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
miel,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
anarkista,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


Create initial probabilities matrix (the probability a sentence starts with a tag)

In [91]:
initial_probs = np.zeros(len(tags))
for i in range(len(tags)):
    prob = tag_transition_counts.get(('<s>', tags[i]), 0)
    initial_probs[i] = prob

initial_probs = initial_probs / initial_probs.sum()

print(tags)
print(initial_probs)

['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PREP', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB']
[0.00298063 0.0342772  0.0804769  0.09985097 0.00149031 0.22652757
 0.00894188 0.00894188 0.         0.         0.18777943 0.23099851
 0.01788376 0.00447094 0.         0.09538003]


In [92]:
initial_probs_df = pd.DataFrame([initial_probs], columns = tags)
initial_probs_df

Unnamed: 0,ADJ,ADP,ADV,AUX,CCONJ,DET,INTJ,NOUN,NUM,PREP,PRON,PROPN,PUNCT,SCONJ,SYM,VERB
0,0.002981,0.034277,0.080477,0.099851,0.00149,0.226528,0.008942,0.008942,0.0,0.0,0.187779,0.230999,0.017884,0.004471,0.0,0.09538


# Create HMM