In [None]:
import sys
import re
import time
import asyncio
import numpy as np
import pandas as pd
from itertools import chain

from tqdm import tqdm
from pathlib import Path
# from googletrans import Translator
from functools import partial
import importlib

sys.path.append("..")
from src import utils
from src.translator import Translator

# Every input file is resolved relative to the notebook's parent folder.
base_path = Path("..", "data")

# ParaCrawl EN/PT sample: one file per language inside the same folder.
paracrawl_dir = base_path.joinpath("parallel", "paracrawl.en-pt")
paracrawl_pt_data_path = paracrawl_dir / "paracrawl.en-pt.pt.sample"
paracrawl_en_data_path = paracrawl_dir / "paracrawl.en-pt.en.sample"

# Dictionary samples, one file per direction (these are handed to Translator below).
dicts_dir = base_path / "dicts"
en_pt_dict_path = dicts_dir / "en-pt.json.sample"
pt_en_dict_path = dicts_dir / "pt-en.json.sample"

# n_lines = 2809381  (presumably the size of the full corpus; confirm before re-enabling)
n_lines = 200

# Fixed seed so the randomly picked word positions are the same after every
# Restart & Run All; change the value to draw a different sample.
RANDOM_SEED = 42
random_generator = np.random.default_rng(RANDOM_SEED)

# Matches any single character that is neither a word character nor whitespace.
remove_punct_regex = re.compile(r"[^\w\s]")

# The replacement is bound positionally on purpose: binding it as `repl = ""`
# makes `remove_punct("some text")` fail, because the text would then also be
# passed as `repl`. Both `remove_punct(text)` and `remove_punct(string = text)` work.
remove_punct = partial(remove_punct_regex.sub, "")
# Shared translator used by every cell below; it is built from the two
# dictionary sample files (en-pt path first, pt-en path second).
translator = Translator(
    en_pt_dict_path,
    pt_en_dict_path,
)

In [None]:
chunk_size = 100

# Arguments for the chunked reader, spelled out once so they are easy to tweak.
reader_kwargs = dict(
    en_data_path = paracrawl_en_data_path,
    pt_data_path = paracrawl_pt_data_path,
    chunk_size = chunk_size,
)

# Only the first chunk is consumed here.
chunk = next(utils.read_paracrawl_in_chunks(**reader_kwargs))

# split / pad are switched off for this frame.
df = utils.filechunk_to_dataframe(chunk, split = False, pad = False)

In [None]:
# Quick look at the first five rows of the loaded chunk.
df.head(n = 5)

In [None]:
def create_crosslingual_phrase(phrases, src_lang, dest_lang, agressiveness = 0.5, enforce_match = True, rng = None, word_translator = None):
    """Return the source sentence with a random subset of its words replaced by translations.

    The source sentence is lower-cased and split on whitespace. A fraction
    `agressiveness` of the word positions is drawn without replacement, the
    words at those positions are translated in bulk, and each translation is
    written back (lower-cased) at its position.

    Parameters
    ----------
    phrases : mapping indexable by language code (e.g. a DataFrame row)
        Must provide the sentence strings under `src_lang` and `dest_lang`.
    src_lang, dest_lang : str
        Keys into `phrases`; also forwarded to the translator.
    agressiveness : float
        Fraction of source words to try to translate. The number of positions
        is `int(agressiveness * n_words)`; asking for more positions than
        there are words is rejected by the random generator.
    enforce_match : bool
        When True, a translation is only used if it occurs in the destination
        sentence as a whole word (or whole multi-word span), ignoring case.
        When False, every non-empty translation is used.
    rng : object with a NumPy-Generator-like `choice`, optional
        Defaults to the notebook-level `random_generator`.
    word_translator : object with `bulk_translate(words, src, dest)`, optional
        Defaults to the notebook-level `translator`.

    Returns
    -------
    str
        The mixed-language sentence, words joined by single spaces.
    """
    # Fall back to the notebook-level objects unless the caller injects its own
    # (useful for seeding a specific generator or swapping the dictionary).
    rng = random_generator if rng is None else rng
    word_translator = translator if word_translator is None else word_translator

    source_words = phrases[src_lang].lower().split()
    # Lower-cased so the comparison below is case-insensitive, matching the
    # lower-casing already applied to the source words and to the output.
    target_text = phrases[dest_lang].lower()
    new_phrase = source_words.copy()

    n_to_translate = int(agressiveness * len(source_words))
    indexes_to_translate = rng.choice(len(source_words), size = n_to_translate, replace = False)
    words_to_translate = [source_words[i] for i in indexes_to_translate]
    translations = word_translator.bulk_translate(words_to_translate, src_lang, dest_lang)

    for index, translation in zip(indexes_to_translate, translations):
        # An empty translation would "match" any sentence and blank the word out.
        # NOTE(review): assumes missing dictionary entries come back empty/None - confirm.
        if not translation:
            continue

        candidate = translation.lower()
        if enforce_match:
            # Whole-word match: a plain substring test would accept e.g. "a"
            # because it occurs inside "gato". The lookarounds still allow
            # adjacent punctuation and multi-word translations.
            whole_word = r"(?<!\w)" + re.escape(candidate) + r"(?!\w)"
            if re.search(whole_word, target_text) is None:
                continue

        new_phrase[index] = candidate

    return " ".join(new_phrase)

In [None]:
%%time
result = df.apply(lambda row: create_crosslingual_phrase(row, "en", "pt", agressiveness = 1, enforce_match = True), axis = 1)
# result = df.apply(lambda row: create_crosslingual_phrase(row, "pt", "en", agressiveness = 1, enforce_match = True), axis = 1)

In [None]:
# Show one sentence pair next to its crosslingual version.
idx = 2
print(df.loc[idx, "en"])
print(df.loc[idx, "pt"])
print(80*"-")
# Looked up with the same label as the two lines above, so changing `idx`
# keeps all three printed rows in sync (a row-wise apply keeps the frame's index).
print(result.loc[idx])

In [None]:
src_lang = "en"
dest_lang = "pt"
line_count = df.shape[0]
# NOTE(review): the "<pad_token>" filter below suggests this cell expects rows
# holding padded token lists, while `df` above is built with split = False,
# pad = False - confirm which frame is meant to feed this cell.
line_lenght = len(df.iloc[0, 0])
original = np.array(list(df[src_lang].values))
target = np.array(list(df[dest_lang].values))
tentative_translation = original.copy()
new_phrases = original.copy()

# Flatten once: calling original.flatten() inside the comprehensions copies
# the whole array again for every single element.
flat_original = original.flatten()
# Positions (in the flattened array) of everything that is not padding.
indexes_to_translate = [i for i, w in enumerate(flat_original) if w != "<pad_token>"]
words_to_translate = [flat_original[i] for i in indexes_to_translate]

# translations = translator.translate(words_to_translate, src = src_lang, dest = dest_lang)
translations = translator.bulk_translate(words_to_translate, from_lang = src_lang, to = dest_lang)

In [None]:
# Object dtype on purpose: a fixed-width unicode array silently truncates any
# translation longer than the longest source entry. astype() also returns a copy.
tentative_translation = original.flatten().astype(object)

# A plain loop, because assignment is not allowed inside a list comprehension.
# bulk_translate results are used as plain strings elsewhere in this notebook,
# so there is no `.text` attribute to unwrap here.
for i, t in zip(indexes_to_translate, translations):
    tentative_translation[i] = t

# Back to one row per line for display.
tentative_translation.reshape((line_count, line_lenght))

In [None]:
 try:
    # print(f"[ ] getting crosslingual rep for phrase {phrases[src_lang]}")
    original = phrases[src_lang].split() 
    target = phrases[dest_lang].split()
    new_phrase = original.copy()
    indexes_to_translate = random_generator.choice(len(original), size = int(agressiveness*len(original)), replace=False)
    words_to_translate = [original[i] for i in indexes_to_translate]
    translations = translator.translate(words_to_translate, src = src_lang, dest = dest_lang)
    # translations = translator.translate(words_to_translate, target_language = dest_lang)
    for index, translation in zip(indexes_to_translate, translations):
        if not enforce_match or translation.text in target:
            new_phrase[index] = translation.text.lower()
    # print(f"[-] done getting crosslingual rep for phrase {phrases[src_lang]}")

    # print(str.join(" ", new_phrase))
    return str.join(" ", new_phrase)
except Exception as ex:
    print(f"[+] {ex}")
    raise ex
    # return phrases[src_lang]