In [16]:
import sys
import os

# Add the scripts folder to the Python path
sys.path.append(os.path.abspath("../scripts"))  # Adjust the path accordingly

In [23]:
from load_preprocessed_data import load_ladino_pos

weak_dataset_file_path = '../data/weak/ladino-pos.txt'
weak_tags, weak_tags_dict = load_ladino_pos(weak_dataset_file_path) # import ladino tokens into custom data definition

../data/weak/ladino-pos.txt


In [24]:
# check it worked
for item in weak_tags[0]:
    print(item)

¡ (PUNCT)
Venidos (NOUN)
buenos (ADJ)
! (PUNCT)


In [20]:
from datasets import load_dataset

labeled_data = weak_tags
unlabeled_data = load_dataset("collectivat/salom-ladino-articles")['train']
train = unlabeled_data[:10300]['text']
validate = unlabeled_data[10301:10500]['text']
test = unlabeled_data[10501:10685]['text']

# Prepare data

## Count tags and emissions

In [51]:
print_results = False

"""
Count tags, tag transitions, and emissions of words to create the proper probability tables:
P(Tag)
P(Tag_{i} | Tag_{i-1})
P(Word | Tag)
"""
def create_count_dictionaries(data):
    tag_counts = {} # P(Tag)
    tag_transition_counts = {} # P(Tag_{i} | Tag_{i-1})
    emission_counts = {} # P(Word | Tag)
    # go through each sentence in the data
    for sentence in data:
        tags_sequence = [word.get_pos() for word in sentence]
        words_sequence = [word.get_word() for word in sentence]
        prev_tag = "<s>" # all sentences start with delimiter
        # go through each word and tag
        for word, tag in zip(words_sequence, tags_sequence):
            # P(Tag)
            tag_counts[tag] = tag_counts.get(tag, 0) + 1

            # P(Tag_{i} | Tag_{i-1})
            tag_transition = (prev_tag, tag) # make key to indicate transitioning from the previous tag to current
            tag_transition_counts[tag_transition] = tag_transition_counts.get(tag_transition, 0) + 1
            prev_tag = tag

            # P(Word | Tag)
            emission = (tag, word)
            emission_counts[emission] = emission_counts.get(emission, 0) + 1
        
        # P(Tag_{i} | Tag_{i-1}) only for the end of the sentence
        tag_transition = (prev_tag, "<s/>") # all sentences end with delimiter
        tag_transition_counts[tag_transition] = tag_transition_counts.get(tag_transition, 0) + 1
    return tag_counts, tag_transition_counts, emission_counts

tag_counts, tag_transition_counts, emission_counts = create_count_dictionaries(weak_tags)

if print_results:
    tag_counts = sorted(tag_counts.items(), key=lambda item: item[1], reverse=True)
    tag_transition_counts = sorted(tag_transition_counts.items(), key=lambda item: item[1], reverse=True)
    emission_counts = sorted(emission_counts.items(), key=lambda item: item[1], reverse=True)
    print(tag_counts)
    print(tag_transition_counts)
    print(emission_counts)


## Make actual probability tables out of counts

In [None]:
import numpy as np
import pandas as pd

print_results = True

tags_matrix = np.zeros((len(tag_counts), len(tag_counts)), dtype='float32')
tags_total_count = sum(tag_transition_counts.values())
tags = list(tag_counts.keys())
for i in range(len(tag_counts)):
    for j in range(len(tag_counts)):
        count_of_transition = tag_transition_counts.get((tags[i], tags[j]), 0)
        tags_matrix[i, j] = count_of_transition/tags_total_count


# need to create table for emission probabilities too? TODO


In [None]:
tags_matrix_df = pd.DataFrame(tags_matrix, columns = list(tag_counts), index=list(tag_counts))
tags_matrix_df

Unnamed: 0,PUNCT,NOUN,ADJ,PROPN,VERB,ADP,PRON,INTJ,ADV,CCONJ,AUX,DET,SCONJ,NUM,PREP,SYM
PUNCT,0.000712,0.001187,0.000237,0.002136,0.001424,0.000712,0.001187,0.0,0.000949,0.0,0.000475,0.000949,0.0,0.001424,0.0,0.0
NOUN,0.061936,0.000475,0.014001,0.001187,0.004509,0.009255,0.000475,0.0,0.005458,0.004271,0.020171,0.000237,0.001187,0.0,0.0,0.000237
ADJ,0.046512,0.003322,0.00261,0.0,0.000475,0.003322,0.000475,0.0,0.001187,0.001661,0.000237,0.0,0.000475,0.0,0.0,0.0
PROPN,0.02373,0.001187,0.003797,0.00617,0.008306,0.001898,0.001898,0.0,0.005221,0.000712,0.014713,0.003322,0.000237,0.0,0.000237,0.0
VERB,0.016137,0.009018,0.003322,0.00356,0.007119,0.014001,0.000949,0.0,0.002848,0.0,0.0,0.019934,0.00356,0.000237,0.0,0.000237
ADP,0.000475,0.008543,0.000712,0.011865,0.002136,0.000949,0.004509,0.0,0.0,0.0,0.000475,0.011865,0.0,0.0,0.0,0.0
PRON,0.00617,0.002373,0.000237,0.000475,0.022781,0.000949,0.003085,0.0,0.002373,0.0,0.011153,0.001187,0.000237,0.0,0.0,0.0
INTJ,0.001424,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ADV,0.007119,0.0,0.00356,0.000237,0.007594,0.000237,0.004509,0.0,0.001424,0.0,0.011153,0.000237,0.000237,0.0,0.0,0.0
CCONJ,0.0,0.000949,0.000712,0.000237,0.0,0.000237,0.000475,0.0,0.0,0.0,0.0,0.004271,0.0,0.0,0.0,0.0


# Create HMM

In [None]:
# Include some rules so unknown verbs can be classified easily
# Credit to https://aclanthology.org/C96-1011.pdf for some of the rules
rules = [
    (r'.*(ando|endo)$', 'VERB'), # verbs in gerund
    (r'.*(ido|ado|ida|ada)$', 'VERB'), # verbs in continuous
    (r'.*(er|ir|ar)$', 'VERB'), # verbs in infinitive
    (r'.*(erse|irse|arse)$', 'VERB'), # verbs in infinitive reflexive
    (r'.*mente$', 'ADV'), # -mente suffix is for adverbs 
    (r'^-?[0-9]+(.[0-9]+)?\.*$', 'NUM'), # numbers
    (r'(el|El|eya|Eya|Yo|yo)$', 'PRON'), # pronouns
    (r'[!\"#\$%&\'\(\)\*\+,\-.\/:;<=>\?@\[\\\]\^_`{\|}~]  ', 'PUNCT'), # punctuation   
]

## Initialize HMM with counts