# Translation with LLM

## Open datasets

In [11]:
from pathlib import Path
from constants import *

def _read_token_ids(filename):
    """Read one tokenized dataset file located in DATASET_DIR.

    Every line of the file is parsed as one sentence made of
    space-separated integer token ids.

    Args:
        filename: Name of the file inside DATASET_DIR.

    Returns:
        A list with one list of ints per line of the file.
    """
    with open(Path(DATASET_DIR, filename), 'r') as f:
        # The last space-separated field of each line is discarded
        # (presumably the newline left after a trailing space -- TODO confirm
        # against the script that wrote these files).
        return [[int(x) for x in sentence.split(' ')[:-1]] for sentence in f]


# Training split: English source sentences and their French targets
source_train_dataset = _read_token_ids('wmt14_en_train.src')
target_train_dataset = _read_token_ids('wmt14_fr_train.trg')

# Test split: English source sentences and their French targets
source_test_dataset = _read_token_ids('wmt14_en_test.src')
target_test_dataset = _read_token_ids('wmt14_fr_test.trg')


## Translate

### Generate the prompt

In [43]:
from transformers import GPT2Tokenizer

# Load the pretrained "gpt2" tokenizer and override its unk/bos/eos tokens
# with custom marker strings.
# NOTE(review): a later cell hard-codes 50257 / 50258 as the bos / eos ids;
# those values presumably depend on the tokens registered here -- verify with
# tokenizer.bos_token_id / tokenizer.eos_token_id before changing this call.
tokenizer = GPT2Tokenizer.from_pretrained(
        "gpt2",
        unk_token="<|unk|>",
        bos_token="<|bos|>",
        eos_token="<|eos|>", 
        )

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


In [44]:
# Assemble the few-shot prompt: one instruction line followed by
# n_few_shot English/French example pairs taken from the training split.
n_few_shot = 5
# Ids filtered out of every sentence before decoding (presumably the ids of
# the "<|bos|>" / "<|eos|>" tokens of the tokenizer -- TODO confirm).
bos_token = 50257
eos_token = 50258

few_shot_prompt = "Translate the following sentence from English to French \n\n"
for i in range(n_few_shot):
    # Drop the boundary ids, then turn the remaining ids back into text
    source = tokenizer.decode(
        [token for token in source_train_dataset[i] if token not in (bos_token, eos_token)]
    )
    target = tokenizer.decode(
        [token for token in target_train_dataset[i] if token not in (bos_token, eos_token)]
    )
    few_shot_prompt += f"English: {source} \nFrench: {target} \n\n"

In [55]:
# Complete the prompt with the test sentence to translate, leaving the
# final "French: " line open for the model to fill in.
i = 0

source = tokenizer.decode(
    [token for token in source_test_dataset[i] if token not in (bos_token, eos_token)]
)
prompt = f"{few_shot_prompt}English: {source} \nFrench: "

### Query the model

In [56]:
# Load the API keys, one per line. Blank lines are skipped so that they
# cannot end up as empty keys in the rotation done by make_query.
api_keys_file = "keys.txt"
with open(api_keys_file, 'r') as f:
    keys = [line.strip() for line in f if line.strip()]

In [57]:
class OpernAIParams:
    """Request settings that make_query forwards to openai.Completion.create.

    NOTE: the class name contains a typo ("Opern"); it is kept unchanged
    because other cells may refer to it.
    """
    # Name of the completion model to query
    model = "code-davinci-002"
    # Sent to the API as `max_tokens`
    max_generation_tokens = 128
    # Sampling temperature
    temperature = 0
    # Nucleus-sampling threshold
    top_p = 1
    # Sent to the API as `n`
    n = 1
    # Stop sequence: the prompt puts each translation on a single line
    stop = '\n'
    presence_penalty = 0
    # NOTE(review): with temperature=0 the candidates are expected to be
    # near-identical, so best_of > 1 probably only multiplies token usage --
    # confirm against the API documentation and consider lowering it.
    best_of = 10


open_ai_params = OpernAIParams()

In [58]:
import openai
import time

def make_query(prompt, params, max_retries=None, retry_delay=5):
    """Send a completion request, retrying with the next API key on failure.

    The key index is advanced before every attempt (so the first attempt
    uses keys[1] when several keys are loaded) and wraps around the
    module-level `keys` list.

    Args:
        prompt: Text sent as the completion prompt.
        params: Object exposing model, max_generation_tokens, temperature,
            n, top_p, stop, presence_penalty and best_of attributes.
        max_retries: Maximum number of retries after a failed attempt before
            the last exception is re-raised. None (default) retries forever,
            which is the historical behavior.
        retry_delay: Seconds to wait after a failed attempt.

    Returns:
        A (result, elapsed_time) tuple: the API response and the wall time
        in seconds spent in this function, retries and waits included.

    Raises:
        ValueError: If no API key is loaded.
        Exception: The last API error, once max_retries is exceeded.
    """
    # Without this guard the modulo below raises ZeroDivisionError on every
    # iteration, which the retry loop would swallow forever.
    if not keys:
        raise ValueError("No API key available: `keys` is empty")

    result = None
    key_index = 0
    failed_attempts = 0

    # Monotonic clock: the measurement is not affected by system clock changes
    start_time = time.monotonic()
    while result is None:
        key_index = (key_index + 1) % len(keys)
        try:
            result = openai.Completion.create(
                api_key=keys[key_index],
                prompt=prompt,
                model=params.model,
                max_tokens=params.max_generation_tokens,
                temperature=params.temperature,
                n=params.n,
                top_p=params.top_p,
                stop=params.stop,
                presence_penalty=params.presence_penalty,
                best_of=params.best_of
            )
        except Exception as e:
            failed_attempts += 1
            if max_retries is not None and failed_attempts > max_retries:
                raise
            print(e, "Retry with key index: ", key_index)
            time.sleep(retry_delay)

    elapsed_time = time.monotonic() - start_time
    return result, elapsed_time

In [64]:
# Keep the elapsed time under a real name: assigning to `_` would shadow
# IPython's last-output variable.
result, elapsed_time = make_query(prompt, open_ai_params)

In [66]:
# Text of the first completion returned in the response
predicted_target = result['choices'][0]['text']

In [67]:
# Display the text generated for the test sentence
predicted_target

'Spectaculaire saut en "wingsuit" au-dessus de Bogota '