In [16]:
import sys
import os

# Add the scripts folder to the Python path
sys.path.append(os.path.abspath("../scripts"))  # Adjust the path accordingly

In [23]:
from load_preprocessed_data import load_ladino_pos

weak_dataset_file_path = '../data/weak/ladino-pos.txt'
weak_tags, weak_tags_dict = load_ladino_pos(weak_dataset_file_path) # import ladino tokens into custom data definition

../data/weak/ladino-pos.txt


In [24]:
# check it worked
for item in weak_tags[0]:
    print(item)

¡ (PUNCT)
Venidos (NOUN)
buenos (ADJ)
! (PUNCT)


In [20]:
from datasets import load_dataset

labeled_data = weak_tags
unlabeled_data = load_dataset("collectivat/salom-ladino-articles")['train']
train = unlabeled_data[:10300]['text']
validate = unlabeled_data[10301:10500]['text']
test = unlabeled_data[10501:10685]['text']

In [None]:
# import spacy

# nlp = spacy.load("es_core_news_sm")
# weakly_labeled = []

# for sentence in train:
#     doc = nlp(sentence)
#     tagged_sentence = [(token.text, token.pos_) for token in doc]
#     weakly_labeled.append(tagged_sentence)


# Prepare probabilities of tags to pass into HMM

In [None]:
verify_results = False

"""
Count tags, tag transitions, and emissions of words to create the proper probability tables:
P(Tag)
P(Tag_{i} | Tag_{i-1})
P(Word | Tag)
"""
def create_count_dictionaries(data):
    tag_counts = {} # P(Tag)
    tag_transition_counts = {} # P(Tag_{i} | Tag_{i-1})
    emission_counts = {} # P(Word | Tag)
    # go through each sentence in the data
    for sentence in data:
        tags_sequence = [word.get_pos() for word in sentence]
        words_sequence = [word.get_word() for word in sentence]
        prev_tag = "<s>" # all sentences start with delimiter
        # go through each word and tag
        for word, tag in zip(words_sequence, tags_sequence):
            # P(Tag)
            tag_counts[tag] = tag_counts.get(tag, 0) + 1

            # P(Tag_{i} | Tag_{i-1})
            tag_transition = (prev_tag, tag) # make key to indicate transitioning from the previous tag to current
            tag_transition_counts[tag_transition] = tag_transition_counts.get(tag_transition, 0) + 1
            prev_tag = tag

            # P(Word | Tag)
            emission = (tag, word)
            emission_counts[emission] = emission_counts.get(emission, 0) + 1
        
        # P(Tag_{i} | Tag_{i-1}) only for the end of the sentence
        tag_transition = (prev_tag, "<s/>") # all sentences end with delimiter
        tag_counts[tag_transition] = tag_transition_counts.get(tag_transition, 0) + 1
    return tag_counts, tag_transition_counts, emission_counts

tag_counts, tag_transition_counts, emission_counts = create_count_dictionaries(weak_tags)

if verify_results:
    tag_counts = sorted(tag_counts.items(), key=lambda item: item[1], reverse=True)
    tag_transition_counts = sorted(tag_transition_counts.items(), key=lambda item: item[1], reverse=True)
    emission_counts = sorted(emission_counts.items(), key=lambda item: item[1], reverse=True)
    print(tag_counts)
    print(tag_transition_counts)
    print(emission_counts)


[('PUNCT', 719), ('NOUN', 520), ('DET', 452), ('VERB', 341), ('AUX', 326), ('PROPN', 301), ('ADJ', 254), ('PRON', 215), ('ADP', 175), ('ADV', 153), ('SCONJ', 31), ('CCONJ', 29), ('NUM', 18), ('INTJ', 6), ('SYM', 2), (('PUNCT', '<s/>'), 1), ('PREP', 1)]
[(('DET', 'NOUN'), 361), (('NOUN', 'PUNCT'), 261), (('ADJ', 'PUNCT'), 196), (('<s>', 'PROPN'), 155), (('<s>', 'DET'), 152), (('<s>', 'PRON'), 126), (('AUX', 'DET'), 107), (('PROPN', 'PUNCT'), 100), (('PRON', 'VERB'), 96), (('AUX', 'ADJ'), 94), (('NOUN', 'AUX'), 85), (('VERB', 'DET'), 84), (('VERB', 'PUNCT'), 68), (('<s>', 'AUX'), 67), (('<s>', 'VERB'), 64), (('PROPN', 'AUX'), 62), (('NOUN', 'ADJ'), 59), (('VERB', 'ADP'), 59), (('<s>', 'ADV'), 54), (('ADP', 'PROPN'), 50), (('ADP', 'DET'), 50), (('PRON', 'AUX'), 47), (('ADV', 'AUX'), 47), (('NOUN', 'ADP'), 39), (('VERB', 'NOUN'), 38), (('ADP', 'NOUN'), 36), (('AUX', 'VERB'), 36), (('PROPN', 'VERB'), 35), (('DET', 'ADJ'), 35), (('AUX', 'NOUN'), 34), (('ADV', 'VERB'), 32), (('ADV', 'PUNCT'),