In [2]:
from openai import OpenAI
from dotenv import load_dotenv
from rapidfuzz import process, fuzz, utils
from typing import Dict, Literal, TypedDict
from tqdm import tqdm
from comet import download_model, load_from_checkpoint
import subprocess
import sacrebleu
import evaluate

  from .autonotebook import tqdm as notebook_tqdm
PyTorch version 2.4.1 available.


In [3]:
# Load variables from a local .env file into the process environment before
# the client is created (presumably this is where the OpenAI API key comes
# from - confirm against the project's .env).
load_dotenv()

# Shared client and chat model name used by make_openai_api_call.
client = OpenAI()
MODEL = "gpt-4o-mini"


def make_openai_api_call(context: str, prompt: str):
    """Ask the chat model for a completion and return the reply text.

    Args:
        context: Text sent as the "system" message.
        prompt: Text sent as the "user" message.

    Returns:
        The `content` of the first choice in the response.
    """
    system_message = {"role": "system", "content": context}
    user_message = {"role": "user", "content": prompt}

    completion = client.chat.completions.create(
        model=MODEL,
        messages=[system_message, user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [4]:
class TranslationExample(TypedDict):
    """One retrieved few-shot example: a source sentence and its translation.

    The fields mirror the keys that bind_fuzzy_matches_with_translations
    actually writes ("text", "score", "translation"); the previous
    declaration listed "line" and "confidence", which nothing in the
    notebook ever sets.
    """
    # Source-language sentence found by the fuzzy search (whitespace-stripped).
    text: str
    # Similarity score reported by the fuzzy matcher, rounded to 2 decimals.
    score: float
    # Line from the target-language file at the same position as `text`.
    translation: str

def find_n_fuzzy_matches(
        text: str,
        source_filename: str,
        n: int
    ) -> list[tuple[str,float, int]]:
    """Find the lines of a file that are most similar to `text`.

    Every line of `source_filename` (read as UTF-8) is a candidate. Candidates
    are scored with `fuzz.ratio` after `utils.default_process` preprocessing,
    and at most `n` results are requested from `process.extract`.

    Returns:
        The list produced by `process.extract`; callers in this notebook
        unpack each item as (line text, score, line index).
    """
    with open(source_filename, 'r', encoding='utf-8') as source_file:
        candidates = list(source_file)

    best = process.extract(
        query=text,
        choices=candidates,
        scorer=fuzz.ratio,
        limit=n,
        processor=utils.default_process,
    )
    return best

In [5]:
def read_line_n(filename: str, n: int) -> str:
    """Return line number `n` (1-based) of `filename`, or "" if it does not exist.

    The file is read as UTF-8 with Python's own text-mode line iteration, the
    same way find_n_fuzzy_matches reads the source file. This replaces the
    previous `sed -n '<n>p'` subprocess, which needed an external binary,
    decoded its output with the locale encoding and ignored sed's exit status.

    Args:
        filename: Path of the text file to read.
        n: 1-based line number.

    Returns:
        The requested line including its line terminator (if any), or an
        empty string when `n` is below 1 or past the end of the file.
    """
    # Local import: keeps this cell self-contained without touching the
    # notebook's import cell.
    from itertools import islice

    if n < 1:
        return ""
    with open(filename, 'r', encoding='utf-8') as f:
        # islice skips the first n-1 lines lazily, so the file is read only
        # up to the requested line.
        return next(islice(f, n - 1, n), "")

In [6]:
def bind_fuzzy_matches_with_translations(
    fuzzy_matches: list[tuple[str,float, int]],
    target_filename: str
    ) -> list[TranslationExample]:
    """Attach the target-language line to every fuzzy match.

    Each match is unpacked as (source text, score, 0-based line index). The
    translation is the line at the same position in `target_filename`, looked
    up through read_line_n with a 1-based line number (hence the `+ 1`).

    Returns:
        One TranslationExample per match, in the input order, with keys
        "text", "score" (rounded to 2 decimals) and "translation".
    """
    return [
        TranslationExample({
            "text": source_text.strip(),
            "score": round(similarity, 2),
            "translation": read_line_n(
                filename=target_filename,
                n=index + 1,
            ).strip(),
        })
        for source_text, similarity, index in fuzzy_matches
    ]

In [7]:
# Language names as they appear (in Polish) inside the prompts.
Language = Literal["polski", "angielski"]

TEMP_FILENAME = "./data/temp.txt"

# System message sent with every request.
context = (
    "Jesteś pomocnym bilingwalnym tłumaczem specjalizującym się w "
    "tłumaczeniach pomiędzy językiem polskim, a angielskim. "
    "Jako wynik zwracasz samo tłumaczenie."
)

# Training files searched for few-shot examples, keyed by language.
lang_to_filename: Dict[Language, str] = {
    "angielski": "./data/train.en.txt",
    "polski": "./data/train.pl.txt",
}

In [9]:
def translate(
    text: str,
    source_language: Language,
    target_language: Language,
    n_shots: int):
    """Translate `text` with a few-shot prompt built from fuzzy matches.

    Up to `n_shots` similar sentences are looked up in the source-language
    training file, paired with the lines at the same positions in the
    target-language file, and listed in the prompt ahead of `text`.

    Returns:
        The model's reply as returned by make_openai_api_call.
    """
    nearest = find_n_fuzzy_matches(
        text=text,
        source_filename=lang_to_filename[source_language],
        n=n_shots,
    )
    examples = bind_fuzzy_matches_with_translations(
        fuzzy_matches=nearest,
        target_filename=lang_to_filename[target_language],
    )

    # One "<source>: ...\n<target>: ..." pair per retrieved example.
    shots = "\n".join(
        f"{source_language}: {example['text']}\n{target_language}: {example['translation']}"
        for example in examples
    )

    instruction = (
        f"Przetłumacz zdania z języka {source_language}ego "
        f"na język {target_language}, biorąc pod uwagę "
        "przykłady tłumaczeń zdań przybliżonych.\n"
    )
    # The prompt ends with an open "<target>: " slot for the model to fill.
    question = f"\n{source_language}: {text}\n{target_language}: "

    return make_openai_api_call(context, instruction + shots + question)

In [10]:
def translate_batch(text: str, source_language: Language, target_language: Language, n_shots: list[int]):
    """Translate `text` once per entry of `n_shots`, sharing one fuzzy search.

    The search runs a single time for the largest requested shot count; each
    prompt then uses the first `n` examples of that result.

    Fixes over the previous version:
      * find_n_fuzzy_matches was called with a `target_filename` argument it
        does not accept, and its tuples were used as dicts; the matches are
        now bound to translations via bind_fuzzy_matches_with_translations.
      * Retrieval was skipped unless max(n_shots) > 1, so n_shots such as
        [0, 1] hit an undefined `matches`; the threshold is now > 0.
      * An empty `n_shots` no longer crashes in max(); it returns [].
      * The few-shot instruction was missing the space before "na język".

    Args:
        text: Sentence to translate.
        source_language: Language of `text`.
        target_language: Language to translate into.
        n_shots: Shot counts to run; 0 means a zero-shot prompt.

    Returns:
        One translation per entry of `n_shots`, in the same order.
    """
    def create_shot(match: TranslationExample) -> str:
        return f"{source_language}: {match['text']}\n" + \
               f"{target_language}: {match['translation']}"

    max_shots = max(n_shots, default=0)
    examples = []
    if max_shots > 0:
        matches = find_n_fuzzy_matches(
            text=text,
            source_filename=lang_to_filename[source_language],
            n=max_shots
        )
        examples = bind_fuzzy_matches_with_translations(
            fuzzy_matches=matches,
            target_filename=lang_to_filename[target_language]
        )

    translations = []
    for n in n_shots:
        if n == 0:
            prompt = f"Przetłumacz z języka {source_language}ego na język {target_language}.\n" + \
                    f"{source_language}: {text}\n" + \
                    f"{target_language}:"
        else:
            prompt = f"Przetłumacz zdania z języka {source_language}ego " + \
                    f"na język {target_language}, biorąc pod uwagę " + \
                    f"przykłady tłumaczeń zdań podobnych.\n" + \
                    "\n".join([create_shot(m) for m in examples[:n]]) + \
                    f"\n{source_language}: {text}" + \
                    f"\n{target_language}: "
        translations.append(make_openai_api_call(context, prompt))
    return translations

In [11]:
# Validation-set files, keyed by language.
val_lang_to_filename: Dict[Language, str] = dict(
    angielski='./data/val.en.txt',
    polski='./data/val.pl.txt',
)

def ngram_translations(source_language: Language, target_langauge: Language, ngrams: list[int], target_folder: str):
    """Translate the validation file once per shot count and append the results.

    For every line of the source-language validation file, translate_batch is
    called with `ngrams`, and translation i is appended to
    `<target_folder>/<ngrams[i] as two digits>.txt` (so [0, 2, 5, 10] still
    writes 00.txt, 02.txt, 05.txt and 10.txt).

    Changes over the previous version:
      * Output files are derived from `ngrams` instead of four hard-coded
        names, so any number of shot counts works.
      * All files are opened as UTF-8 rather than the locale default.
      * Whitespace inside a translation (including newlines) is collapsed to
        single spaces so each sentence stays on exactly one output line.

    Note: files are opened in append mode, so re-running into the same folder
    adds lines after the existing ones.

    Args:
        source_language: Language of the validation file to read.
        target_langauge: Language to translate into (parameter name kept
            as-is for existing callers).
        ngrams: Shot counts passed to translate_batch.
        target_folder: Existing directory receiving the output files.
    """
    # Local import: keeps this cell self-contained without touching the
    # notebook's import cell.
    from contextlib import ExitStack

    source_file = val_lang_to_filename[source_language]

    with ExitStack() as stack:
        source = stack.enter_context(open(source_file, "r", encoding="utf-8"))
        outputs = [
            stack.enter_context(
                open(f"{target_folder}/{n:02d}.txt", 'a', encoding="utf-8")
            )
            for n in ngrams
        ]

        for sentence in tqdm(source):
            sentence = sentence.strip()
            translations = translate_batch(sentence, source_language, target_langauge, ngrams)

            for output, translation in zip(outputs, translations):
                # split()/join collapses internal newlines and trims the ends.
                output.write(" ".join(str(translation).split()) + '\n')

In [19]:
# Long-running cell: the recorded run below processed 1000 sentences in ~4 hours.
# NOTE(review): the evaluation cells also read ./data/pl-en/*.txt, but no cell
# producing them is visible here - presumably this call was re-run with the
# languages swapped; confirm.
ngram_translations('angielski', 'polski', [0, 2, 5, 10], './data/en-pl')

1000it [3:59:50, 14.39s/it]


# Evaluation

### BLEU Metric

In [11]:
def evaluate_bleu(sys_filename: str, refs_filename: str):
    """Compute corpus-level BLEU for a hypothesis file against one reference file.

    Both files are read as UTF-8 (the previous version used the locale
    default, unlike the UTF-8 reads of the training data), one segment per
    line.

    Args:
        sys_filename: File with system translations.
        refs_filename: File with reference translations.

    Returns:
        The score object returned by `sacrebleu.BLEU().corpus_score`.
    """
    with open(sys_filename, 'r', encoding='utf-8') as sys_file, \
         open(refs_filename, 'r', encoding='utf-8') as refs_file:
        hypotheses = sys_file.readlines()
        references = refs_file.readlines()

    bleu = sacrebleu.BLEU()
    # corpus_score takes a list of reference streams; there is a single one here.
    return bleu.corpus_score(hypotheses, [references])

In [12]:
# From EN to PL (file stem = number of few-shot examples)
for shots in ('00', '02', '05', '10'):
    print(evaluate_bleu(sys_filename=f'./data/en-pl/{shots}.txt', refs_filename='./data/val.pl.txt'))

BLEU = 23.82 47.6/28.1/19.5/14.0 (BP = 0.970 ratio = 0.970 hyp_len = 19877 ref_len = 20490)
BLEU = 41.14 57.9/43.7/36.7/31.7 (BP = 0.993 ratio = 0.993 hyp_len = 20345 ref_len = 20490)
BLEU = 42.98 58.9/45.3/38.4/33.4 (BP = 1.000 ratio = 1.000 hyp_len = 20488 ref_len = 20490)
BLEU = 43.17 59.4/45.8/38.9/34.0 (BP = 0.991 ratio = 0.991 hyp_len = 20305 ref_len = 20490)


In [13]:
# From PL to EN (file stem = number of few-shot examples)
for shots in ('00', '02', '05', '10'):
    print(evaluate_bleu(sys_filename=f'./data/pl-en/{shots}.txt', refs_filename='./data/val.en.txt'))

BLEU = 29.08 54.1/32.9/23.2/17.3 (BP = 1.000 ratio = 1.025 hyp_len = 22949 ref_len = 22386)
BLEU = 47.06 63.8/49.3/42.1/37.1 (BP = 1.000 ratio = 1.005 hyp_len = 22488 ref_len = 22386)
BLEU = 48.28 64.6/50.5/43.4/38.3 (BP = 1.000 ratio = 1.005 hyp_len = 22503 ref_len = 22386)
BLEU = 49.27 65.1/51.4/44.5/39.6 (BP = 1.000 ratio = 1.001 hyp_len = 22407 ref_len = 22386)


### chrF Metric

In [14]:
def evaluate_chfr(sys_filename: str, refs_filename: str):
    """Compute corpus-level chrF for a hypothesis file against one reference file.

    Both files are read as UTF-8 (the previous version used the locale
    default, unlike the UTF-8 reads of the training data), one segment per
    line.

    Args:
        sys_filename: File with system translations.
        refs_filename: File with reference translations.

    Returns:
        The score object returned by `sacrebleu.CHRF().corpus_score`.
    """
    with open(sys_filename, 'r', encoding='utf-8') as sys_file, \
         open(refs_filename, 'r', encoding='utf-8') as refs_file:
        hypotheses = sys_file.readlines()
        references = refs_file.readlines()

    chrf = sacrebleu.CHRF()
    # corpus_score takes a list of reference streams; there is a single one here.
    return chrf.corpus_score(hypotheses, [references])

In [15]:
# From EN to PL (file stem = number of few-shot examples)
for shots in ('00', '02', '05', '10'):
    print(evaluate_chfr(sys_filename=f'./data/en-pl/{shots}.txt', refs_filename='./data/val.pl.txt'))

chrF2 = 47.50
chrF2 = 58.54
chrF2 = 59.60
chrF2 = 59.79


In [16]:
# From PL to EN (file stem = number of few-shot examples)
for shots in ('00', '02', '05', '10'):
    print(evaluate_chfr(sys_filename=f'./data/pl-en/{shots}.txt', refs_filename='./data/val.en.txt'))

chrF2 = 52.60
chrF2 = 62.75
chrF2 = 63.78
chrF2 = 64.13


### TER Metric

In [17]:
def evaluate_ter(sys_filename: str, refs_filename: str):
    """Compute corpus-level TER for a hypothesis file against one reference file.

    Both files are read as UTF-8 (the previous version used the locale
    default, unlike the UTF-8 reads of the training data), one segment per
    line.

    Args:
        sys_filename: File with system translations.
        refs_filename: File with reference translations.

    Returns:
        The score object returned by `sacrebleu.TER().corpus_score`.
    """
    with open(sys_filename, 'r', encoding='utf-8') as sys_file, \
         open(refs_filename, 'r', encoding='utf-8') as refs_file:
        hypotheses = sys_file.readlines()
        references = refs_file.readlines()

    ter = sacrebleu.TER()
    # corpus_score takes a list of reference streams; there is a single one here.
    return ter.corpus_score(hypotheses, [references])

In [18]:
# From EN to PL (file stem = number of few-shot examples)
for shots in ('00', '02', '05', '10'):
    print(evaluate_ter(sys_filename=f'./data/en-pl/{shots}.txt', refs_filename='./data/val.pl.txt'))

TER = 80.44
TER = 65.48
TER = 64.38
TER = 63.64


In [19]:
# From PL to EN (file stem = number of few-shot examples)
for shots in ('00', '02', '05', '10'):
    print(evaluate_ter(sys_filename=f'./data/pl-en/{shots}.txt', refs_filename='./data/val.en.txt'))

TER = 74.86
TER = 59.69
TER = 57.99
TER = 57.02


### METEOR Metric

In [20]:
meteor = evaluate.load("meteor")

def evaluate_meteor(sys_filename: str, refs_filename: str):
    """Compute METEOR for a hypothesis file against one reference file.

    Both files are read as UTF-8 (the previous version used the locale
    default, unlike the UTF-8 reads of the training data). Each line is
    whitespace-stripped before it is passed to the metric.

    Args:
        sys_filename: File with system translations, one segment per line.
        refs_filename: File with reference translations, one segment per line.

    Returns:
        The value stored under 'meteor' in the result of `meteor.compute`.
    """
    with open(sys_filename, 'r', encoding='utf-8') as sys_file, \
         open(refs_filename, 'r', encoding='utf-8') as refs_file:
        predictions = [line.strip() for line in sys_file]
        references = [line.strip() for line in refs_file]

    results = meteor.compute(predictions=predictions, references=references)
    return results['meteor']

[nltk_data] Downloading package wordnet to /home/piotr/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/piotr/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/piotr/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [21]:
# From EN to PL (file stem = number of few-shot examples)
for shots in ('00', '02', '05', '10'):
    print(evaluate_meteor(sys_filename=f'./data/en-pl/{shots}.txt', refs_filename='./data/val.pl.txt'))

0.35951175197195473
0.49457376397913616
0.5044096983347446
0.5046950829450687


In [22]:
# From PL to EN (file stem = number of few-shot examples)
for shots in ('00', '02', '05', '10'):
    print(evaluate_meteor(sys_filename=f'./data/pl-en/{shots}.txt', refs_filename='./data/val.en.txt'))

0.46228385555508894
0.5632063254041779
0.5735138695453181
0.5763032255524402


### COMET Metric

In [None]:
# Fetch the Unbabel/wmt22-comet-da checkpoint and load it; `model` is the
# global used by evaluate_comet.
model_path = download_model("Unbabel/wmt22-comet-da")
model = load_from_checkpoint(model_path)

In [56]:
def evaluate_comet(src_filename: str, sys_filename: str, refs_filename: str):
    """Compute the mean COMET segment score for a set of translations.

    Changes over the previous version:
      * Files are read as UTF-8 instead of the locale default.
      * Lines are whitespace-stripped, so trailing newlines are not fed to
        the model.
      * Files with different line counts raise ValueError instead of being
        silently truncated by zip().
      * The comprehension no longer reuses the name of the source list.

    Args:
        src_filename: File with source sentences, one per line.
        sys_filename: File with system translations, one per line.
        refs_filename: File with reference translations, one per line.

    Returns:
        The arithmetic mean of `predictions["scores"]` from `model.predict`.

    Raises:
        ValueError: If the three files do not have the same number of lines.
    """
    with open(src_filename, 'r', encoding='utf-8') as src_file, \
         open(sys_filename, 'r', encoding='utf-8') as sys_file, \
         open(refs_filename, 'r', encoding='utf-8') as refs_file:
        sources = [line.strip() for line in src_file]
        hypotheses = [line.strip() for line in sys_file]
        references = [line.strip() for line in refs_file]

    if not (len(sources) == len(hypotheses) == len(references)):
        raise ValueError(
            "Line counts differ: "
            f"{len(sources)} source, {len(hypotheses)} system, "
            f"{len(references)} reference lines"
        )

    data = [
        {"src": source, "mt": hypothesis, "ref": reference}
        for source, hypothesis, reference in zip(sources, hypotheses, references)
    ]

    predictions = model.predict(data, batch_size=8, gpus=0)
    return sum(predictions["scores"]) / len(predictions["scores"])

In [None]:
# From EN to PL (file stem = number of few-shot examples)
for shots in ('00', '02', '05', '10'):
    print(evaluate_comet(src_filename='./data/val.en.txt', refs_filename='./data/val.pl.txt', sys_filename=f'./data/en-pl/{shots}.txt'))

In [None]:
# From PL to EN (file stem = number of few-shot examples)
for shots in ('00', '02', '05', '10'):
    print(evaluate_comet(src_filename='./data/val.pl.txt', refs_filename='./data/val.en.txt', sys_filename=f'./data/pl-en/{shots}.txt'))