In [3]:
import sys
import os

# Add the scripts folder to the Python path
sys.path.append(os.path.abspath("../scripts"))  # Adjust the path accordingly

### Tokenize Data and Prepare it for FastAlign

In [4]:
from preprocessing import preprocess


ladino_file_path = '../data/tatoeba.spa-lad.lad'
spanish_file_path = '../data/tatoeba.spa-lad.spa'
parallel_file_path = '../data/preprocessing/parallel_spa-lad.txt'
spanish_pos_file_path = '../data/preprocessing/spanish_pos.txt'
weak_dataset_file_path = '../data/weak/ladino-pos.txt'

load_data = True # set True if you don't have the preprocessed files, otherwise just load them to save time
if not load_data:
    preprocess(ladino_file_path, spanish_file_path, parallel_file_path, spanish_pos_file_path)

### Align Tokens using FastAlign

In [5]:
!git clone https://github.com/clab/fast_align.git
!cd fast_align && mkdir build && cd build && cmake .. && make

fatal: destination path 'fast_align' already exists and is not an empty directory.
mkdir: cannot create directory ‘build’: File exists


In [6]:
! fast_align/build/fast_align -i ../data/preprocessing/parallel_spa-lad.txt -d -o -v > ../data/preprocessing/alignments.txt

ARG=i
ARG=d
ARG=o
ARG=v
INITIAL PASS 
expected target length = source length * 1.04245
ITERATION 1
  log_e likelihood: -74562.3
  log_2 likelihood: -107571
     cross entropy: 29.8974
        perplexity: 1e+09
      posterior p0: 0.08
 posterior al-feat: -0.226178
       size counts: 49
ITERATION 2
  log_e likelihood: -8788.09
  log_2 likelihood: -12678.5
     cross entropy: 3.52377
        perplexity: 11.5017
      posterior p0: 0.0112896
 posterior al-feat: -0.18629
       size counts: 49
  1  model al-feat: -0.150993 (tension=4)
  2  model al-feat: -0.171948 (tension=3.29405)
  3  model al-feat: -0.181367 (tension=3.00721)
  4  model al-feat: -0.184724 (tension=2.90876)
  5  model al-feat: -0.185805 (tension=2.87743)
  6  model al-feat: -0.186141 (tension=2.86773)
  7  model al-feat: -0.186244 (tension=2.86475)
  8  model al-feat: -0.186276 (tension=2.86384)
     final tension: 2.86356
ITERATION 3
  log_e likelihood: -5551.92
  log_2 likelihood: -8009.73
     cross entropy: 2.22616


### Transfer POS Tags from Spanish to Ladino using alignments

Transfer tags for one sentence

In [7]:
def transfer_pos_tag(ladino_sent, alignment, pos_tags):
    """Takes in a sentence from Ladino, alignments with Spanish, and the Spanish Part of Speech tags
    and transfers the tags to the Ladino sentence
    """
    ladino_pos = ["UNK"] * len(ladino_sent) # tags are unknown by default

    for lad_idx, spa_idx in alignment:
        if spa_idx < len(pos_tags):  # Ensure alignment is within bounds
            ladino_pos[lad_idx] = pos_tags[spa_idx]
    
    return ladino_pos

Create file to keep track of Ladino POS tags

In [8]:
from load_preprocessed_data import load_alignments, load_parallel_data, load_spanish_pos
alignments = load_alignments('../data/preprocessing/alignments.txt')
ladino_tokens, spanish_tokens = load_parallel_data(parallel_file_path)
spanish_pos = load_spanish_pos(spanish_pos_file_path)

if not load_data:
    with open('../data/weak/ladino-pos.txt', "w", encoding="utf-8") as f:
            for sentence, alignment, pos_tags in zip(ladino_tokens, alignments, spanish_pos):
                pos_tags = transfer_pos_tag(sentence, alignment, pos_tags)
                formatted = " ".join(f"{word} ({tag})" for word, tag in zip(sentence, pos_tags)) + "\n"
                f.write(formatted)
            f.close()

In [9]:
from load_preprocessed_data import load_ladino_pos

ladino_tokens, tags_dict = load_ladino_pos(weak_dataset_file_path) # import ladino tokens into custom data definition

../data/weak/ladino-pos.txt


In [10]:
print(len(ladino_tokens)) # check it worked
for item in ladino_tokens[10]:
    print(item)

671
Eya (PRON)
es (AUX)
muy (ADJ)
yakishikliya (NOUN)
. (PUNCT)


Correct the dataset so there are as few "unknown" tags as possible

In [11]:
# create a mapping of each word to its most frequent tag (excluding "unknown")
most_common_tags = {word: tags.most_common(1)[0][0] for word, tags in tags_dict.items() if tags}

# replace "unknown" tags
for sentence in ladino_tokens:
    for token in sentence:
        if token.get_pos() == "UNK" and token.get_word() in most_common_tags:
            token.correct_pos(most_common_tags[token.get_word()])  # replace with most common tag

# save corrected tags into file
with open('../data/weak/ladino-pos.txt', "w", encoding="utf-8") as f:
            for sentence in ladino_tokens:
                formatted = ""
                for token in sentence:
                      formatted += str(token) + " "
                formatted = formatted.strip() + "\n"
                f.write(formatted)
            f.close()


Once done, delete some variables to free up memory