In [1]:
import sys
import os

# Add the scripts folder to the Python path
sys.path.append(os.path.abspath("../scripts"))  # Adjust the path accordingly

### Tokenize Data and Prepare it for FastAlign

In [2]:
from preprocessing import preprocess


ladino_file_path = '../data/tatoeba.spa-lad.lad'
spanish_file_path = '../data/tatoeba.spa-lad.spa'
parallel_file_path = '../data/preprocessing/parallel_spa-lad.txt'
spanish_pos_file_path = '../data/preprocessing/spanish_pos.txt'

preprocess(ladino_file_path, spanish_file_path, parallel_file_path, spanish_pos_file_path)

### Align Tokens using FastAlign

In [3]:
!git clone https://github.com/clab/fast_align.git
!cd fast_align && mkdir build && cd build && cmake .. && make

fatal: destination path 'fast_align' already exists and is not an empty directory.
mkdir: cannot create directory ‘build’: File exists


In [7]:
! fast_align/build/fast_align -i ../data/preprocessing/parallel_spa-lad.txt -d -o -v > ../data/preprocessing/alignments.txt

ARG=i
ARG=d
ARG=o
ARG=v
INITIAL PASS 
expected target length = source length * 1.04245
ITERATION 1
  log_e likelihood: -74562.3
  log_2 likelihood: -107571
     cross entropy: 29.8974
        perplexity: 1e+09
      posterior p0: 0.08
 posterior al-feat: -0.226178
       size counts: 49
ITERATION 2
  log_e likelihood: -8785.95
  log_2 likelihood: -12675.4
     cross entropy: 3.52291
        perplexity: 11.4948
      posterior p0: 0.0112663
 posterior al-feat: -0.186326
       size counts: 49
  1  model al-feat: -0.150993 (tension=4)
  2  model al-feat: -0.171971 (tension=3.29333)
  3  model al-feat: -0.181401 (tension=3.00622)
  4  model al-feat: -0.18476 (tension=2.90771)
  5  model al-feat: -0.185841 (tension=2.87638)
  6  model al-feat: -0.186177 (tension=2.86668)
  7  model al-feat: -0.186281 (tension=2.8637)
  8  model al-feat: -0.186312 (tension=2.86279)
     final tension: 2.86251
ITERATION 3
  log_e likelihood: -5552.03
  log_2 likelihood: -8009.89
     cross entropy: 2.22621
 

### Transfer POS Tags from Spanish to Ladino using alignments

Transfer tags for one sentence

In [29]:
def transfer_pos_tag(ladino_sent, alignment, pos_tags):
    """Takes in a sentence from Ladino, alignments with Spanish, and the Spanish Part of Speech tags
    and transfers the tags to the Ladino sentence
    """
    ladino_pos = ["UNK"] * len(ladino_sent) # tags are unknown by default

    for lad_idx, spa_idx in alignment:
        if spa_idx < len(pos_tags):  # Ensure alignment is within bounds
            ladino_pos[lad_idx] = pos_tags[spa_idx]
    
    return ladino_pos

Create file to keep track of Ladino POS tags

In [30]:
from load_preprocessed_data import load_alignments, load_parallel_data, load_spanish_pos
alignments = load_alignments('../data/preprocessing/alignments.txt')
ladino_tokens, spanish_tokens = load_parallel_data(parallel_file_path)
spanish_pos = load_spanish_pos(spanish_pos_file_path)

with open('../data/weak/ladino-pos.txt', "w", encoding="utf-8") as f:
        for sentence, alignment, pos_tags in zip(ladino_tokens, alignments, spanish_pos):
            pos_tags = transfer_pos_tag(sentence, alignment, pos_tags)
            formatted = " ".join(f"{word} ({tag})" for word, tag in zip(sentence, pos_tags)) + "\n"
            f.write(formatted)
        f.close()