In [1]:
import subprocess
from contextlib import ExitStack
from typing import Dict, Literal, TypedDict

import tqdm
from dotenv import load_dotenv
from openai import OpenAI
from rapidfuzz import process, fuzz, utils

In [2]:
# Load variables from a .env file before the client is constructed.
# NOTE(review): presumably this is where the OpenAI API key comes from — confirm.
load_dotenv()

# Shared client used by every request in this notebook; no credentials are passed explicitly.
client = OpenAI()
# Chat model name sent with each completion request.
MODEL = "gpt-4o-mini"


def make_openai_api_call(context: str, prompt: str):
    """Send one system/user message pair to the chat model and return the reply.

    Args:
        context: Text sent as the ``system`` message.
        prompt: Text sent as the ``user`` message.

    Returns:
        The ``content`` of the first choice in the completion response.
    """
    conversation = [
        {"role": "system", "content": context},
        {"role": "user", "content": prompt},
    ]
    completion = client.chat.completions.create(model=MODEL, messages=conversation)
    first_choice = completion.choices[0]
    return first_choice.message.content

In [3]:
class TranslationExample(TypedDict):
    """A fuzzy-matched source sentence paired with its reference translation.

    The keys match the dictionaries built by
    ``bind_fuzzy_matches_with_translations`` and read when the few-shot
    prompts are assembled.
    """
    # Matched sentence from the source-language file, whitespace-stripped.
    text: str
    # Similarity score of the match, rounded to two decimals.
    score: float
    # Line from the target-language file at the same position as ``text``.
    translation: str

def find_n_fuzzy_matches(
    text: str,
    source_filename: str,
    n: int,
) -> list[tuple[str, float, int]]:
    """Find the lines of a file that are most similar to ``text``.

    Args:
        text: Query sentence.
        source_filename: UTF-8 text file whose lines are the candidates.
        n: Maximum number of matches to return.

    Returns:
        The result of ``rapidfuzz.process.extract``; callers unpack each
        entry as ``(matched line, score, index of the line in the file)``.
        Matched lines are returned as read, i.e. with their line ending.
    """
    with open(source_filename, mode='r', encoding='utf-8') as corpus:
        candidates = corpus.readlines()

    # Scoring runs on the lines held in memory after the file handle is closed.
    return process.extract(
        query=text,
        choices=candidates,
        scorer=fuzz.ratio,
        processor=utils.default_process,
        limit=n,
    )

In [4]:
def read_line_n(filename: str, n: int):
    """Return line ``n`` (1-based) of a UTF-8 text file.

    The file is read in Python instead of spawning an external ``sed``
    process, so the lookup is portable, decodes the file as UTF-8 like the
    rest of the notebook, and stops reading as soon as the line is found.

    Args:
        filename: Path of the file to read.
        n: 1-based line number.

    Returns:
        The requested line including its line ending (if it has one), or an
        empty string when ``n`` is smaller than 1 or past the end of the file.

    Raises:
        OSError: If the file cannot be opened (e.g. ``FileNotFoundError``).
    """
    if n < 1:
        return ""

    with open(filename, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            if line_number == n:
                return line

    # The file has fewer than n lines.
    return ""

In [5]:
def bind_fuzzy_matches_with_translations(
    fuzzy_matches: list[tuple[str,float, int]], 
    target_filename: str
    ) -> list[TranslationExample]:
    """Pair every fuzzy match with the same-numbered line of the target file.

    Args:
        fuzzy_matches: ``(matched text, score, index)`` tuples.
        target_filename: File from which the translations are read.

    Returns:
        One dictionary per match with the stripped matched text, the score
        rounded to two decimals, and the stripped translation line.
    """
    return [
        TranslationExample({
            "text": matched.strip(),
            "score": round(similarity, 2),
            # The match index is shifted by one because read_line_n is
            # called with line numbers that start at 1.
            "translation": read_line_n(
                filename=target_filename,
                n=index + 1
            ).strip()
        })
        for matched, similarity, index in fuzzy_matches
    ]

In [6]:
# Language names are the Polish words that get interpolated into the prompts.
Language = Literal["polski", "angielski"]

TEMP_FILENAME = "./data/temp.txt"

# System message sent with every request.
context = (
    "Jesteś pomocnym bilingwalnym tłumaczem specjalizującym się w "
    "tłumaczeniach pomiędzy językiem polskim, a angielskim. "
    "Jako wynik zwracasz samo tłumaczenie."
)

# Training files searched for similar sentences, keyed by language.
lang_to_filename: Dict[Language, str] = dict(
    [
        ("angielski", "./data/train.en.txt"),
        ("polski", "./data/train.pl.txt"),
    ]
)

In [7]:
def translate(
    text: str,
    source_language: Language,
    target_language: Language,
    n_shots: int):
    """Translate ``text`` with a few-shot prompt built from similar sentences.

    Args:
        text: Sentence to translate.
        source_language: Language of ``text``; selects the file that is searched.
        target_language: Language to translate into; selects the file the
            example translations are read from.
        n_shots: Number of fuzzy matches requested as examples.

    Returns:
        Whatever ``make_openai_api_call`` returns for the assembled prompt.
    """
    source_path = lang_to_filename[source_language]
    target_path = lang_to_filename[target_language]

    fuzzy = find_n_fuzzy_matches(text=text, source_filename=source_path, n=n_shots)
    examples = bind_fuzzy_matches_with_translations(
        fuzzy_matches=fuzzy,
        target_filename=target_path,
    )

    # One "source: ... / target: ..." pair per example.
    shots = "\n".join(
        f"{source_language}: {example['text']}\n{target_language}: {example['translation']}"
        for example in examples
    )
    instruction = (
        f"Przetłumacz zdania z języka {source_language}ego "
        f"na język {target_language}, biorąc pod uwagę "
        "przykłady tłumaczeń zdań przybliżonych."
    )
    # The prompt ends with an open target-language line for the model to complete.
    prompt = "\n".join([
        instruction,
        shots,
        f"{source_language}: {text}",
        f"{target_language}: ",
    ])

    return make_openai_api_call(context, prompt)

In [8]:
def translate_batch(text: str, source_language: Language, target_language: Language, n_shots: list[int]):
    """Translate ``text`` once per requested shot count, sharing one fuzzy search.

    The similar sentences are looked up a single time for the largest shot
    count and each few-shot prompt uses the first ``n`` of them.

    Args:
        text: Sentence to translate.
        source_language: Language of ``text``; selects the file that is searched.
        target_language: Language to translate into; selects the file the
            example translations are read from.
        n_shots: Shot counts to evaluate. A count of 0 (or less) produces a
            zero-shot prompt without examples.

    Returns:
        A list with one model reply per entry of ``n_shots``, in the same order.
    """
    # default=0 keeps an empty n_shots list from raising inside max().
    max_shots = max(n_shots, default=0)

    examples = []
    if max_shots > 0:
        fuzzy_matches = find_n_fuzzy_matches(
            text=text,
            source_filename=lang_to_filename[source_language],
            n=max_shots
        )
        # The raw matches carry only text, score and index; the translations
        # have to be read from the target-language file.
        examples = bind_fuzzy_matches_with_translations(
            fuzzy_matches=fuzzy_matches,
            target_filename=lang_to_filename[target_language]
        )

    def create_shot(match: TranslationExample):
        return f"{source_language}: {match['text']}\n" + \
            f"{target_language}: {match['translation']}"

    translations = []
    for n in n_shots:
        if n <= 0:
            prompt = f"Przetłumacz z języka {source_language}ego na język {target_language}.\n" + \
                    f"{source_language}: {text}\n" + \
                    f"{target_language}:"
        else:
            # NOTE(review): examples[:n] assumes the matches arrive best-first — confirm.
            prompt = f"Przetłumacz zdania z języka {source_language}ego " + \
                    f"na język {target_language}, biorąc pod uwagę " + \
                    f"przykłady tłumaczeń zdań podobnych.\n" + \
                    "\n".join([create_shot(m) for m in examples[:n]]) + \
                    f"\n{source_language}: {text}" + \
                    f"\n{target_language}: "
        translations.append(make_openai_api_call(context, prompt))
    return translations

In [9]:
# Validation files that are translated sentence by sentence, keyed by language.
val_lang_to_filename: Dict[Language, str] = dict(
    [
        ("angielski", "./data/val.en.txt"),
        ("polski", "./data/val.pl.txt"),
    ]
)

def ngram_translations(source_language: Language, target_langauge: Language, ngrams: list[int], target_folder: str):
    """Translate every validation sentence for each shot count and save the results.

    One result file per shot count is opened in append mode inside
    ``target_folder`` and named after the count with two digits
    (``00.txt``, ``02.txt``, ``05.txt``, ``10.txt`` for ``[0, 2, 5, 10]``).
    Each input sentence contributes exactly one line to each file.

    Args:
        source_language: Language of the validation file to read.
        target_langauge: Language to translate into.
        ngrams: Shot counts passed to ``translate_batch``; one output file
            is written per entry.
        target_folder: Existing directory that receives the result files.
    """
    source_file = val_lang_to_filename[source_language]

    with ExitStack() as stack:
        # Explicit UTF-8 so the texts are not decoded with the platform default.
        sentences = stack.enter_context(open(source_file, "r", encoding="utf-8"))
        outputs = [
            stack.enter_context(
                open(f"{target_folder}/{int(n):02d}.txt", 'a', encoding="utf-8")
            )
            for n in ngrams
        ]

        # The notebook imports the tqdm module, so the progress bar class is tqdm.tqdm.
        for sentence in tqdm.tqdm(sentences):
            sentence = sentence.strip()
            translations = translate_batch(sentence, source_language, target_langauge, ngrams)

            for output, translation in zip(outputs, translations):
                # Collapse whitespace (including line breaks) and map a missing
                # reply to an empty line so every sentence yields exactly one line.
                output.write(" ".join((translation or "").split()) + '\n')

In [19]:
# Full English -> Polish run over the validation file with 0, 2, 5 and 10 shots.
# The recorded output below shows 1000 sentences taking about four hours.
# Result files are opened in append mode, so re-running this cell adds to
# whatever ./data/en-pl already contains.
ngram_translations('angielski', 'polski', [0, 2, 5, 10], './data/en-pl')

1000it [3:59:50, 14.39s/it]
