In [3]:
import ast
import json
import os
import random

import pandas as pd
from datasets import load_dataset
from openai import OpenAI
from tqdm import tqdm

In [2]:
# Take the OpenAI key from the OPENAI_API_KEY environment variable so that no
# credential is ever typed into (and saved with) the notebook. An unset
# variable falls back to the empty string.
api_key = os.environ.get("OPENAI_API_KEY", "")
client = OpenAI(api_key=api_key)

In [1]:
# One-off sampling step, left commented out (presumably so that re-running the
# notebook does not redraw the samples). When enabled it draws 140 random rows
# from the Russian train split and 210 from the English one and writes them to
# ru_samples.tsv / en_samples.tsv, the files a later cell reads back.
# NOTE(review): no random seed is set, so re-enabling this yields a different sample.
# dataset_ru = load_dataset("s-nlp/ru_paradetox")
# dataset_eng = load_dataset("s-nlp/paradetox")
# ru_samples = random.sample(list(dataset_ru["train"]), 140)
# en_samples = random.sample(list(dataset_eng["train"]), 210)
# df_ru = pd.DataFrame.from_dict(ru_samples)
# df_en = pd.DataFrame.from_dict(en_samples)
# df_ru.to_csv("ru_samples.tsv", sep="\t")
# df_en.to_csv("en_samples.tsv", sep="\t")

In [3]:
# Load the prompt template; translate_from_eng / translate_from_ru read data["prompt"].
# The context manager closes the file even if json.load raises, and the
# explicit encoding keeps the read independent of the platform default.
with open("translation_prompt.json", encoding="utf-8") as prompt_file:
    data = json.load(prompt_file)

In [32]:
# Languages the sampled pairs are translated into, in the order they are processed.
target_languages = ["Ukrainian", "German", "Spanish", "Chinese", "Amharic", "Arabic", "Hindi"]

In [4]:
# Read back the previously saved samples; the first TSV column is the saved row index.
df_ru = pd.read_csv("ru_samples.tsv", sep="\t", index_col=0)
df_en = pd.read_csv("en_samples.tsv", sep="\t", index_col=0)

# One {column: value} dict per row. orient="records" avoids transposing the
# frame and, unlike a dict keyed by index label, keeps every row even if the
# saved index contains duplicate labels.
ru_samples = df_ru.to_dict(orient="records")
en_samples = df_en.to_dict(orient="records")

In [9]:
def get_completion(prompt, model="gpt-4"):
    """Send ``prompt`` as a single system message and return the reply text.

    Args:
        prompt: Full prompt text; it becomes the content of the only message.
        model: Name of the chat model to query.

    Returns:
        The ``content`` of the first choice in the completion response.
    """
    completion = client.chat.completions.create(
        model=model,
        messages=[{"role": "system", "content": prompt}],
        temperature=0.4,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [23]:
def translate_from_eng(target_language, examples):
    """Translate a batch of English toxic/neutral pairs into ``target_language``.

    The template in ``data["prompt"]`` has its ``SOURCE_LANGUAGE``,
    ``TARGET_LANGUAGE`` and ``EXAMPLES`` placeholders filled in and is sent to
    ``get_completion``. The reply is parsed as a Python literal (the caller
    treats it as a sequence of ``(toxic, neutral)`` pairs).

    Args:
        target_language: Language name substituted into the prompt.
        examples: Iterable of mappings with ``en_toxic_comment`` and
            ``en_neutral_comment`` keys.

    Returns:
        The parsed literal if the reply parses, either directly or after the
        quote repair below. Otherwise the repaired reply text is printed and
        returned as a plain ``str``, so callers should check the type.
    """
    # Everything ast.literal_eval is documented to raise on malformed input.
    parse_errors = (ValueError, TypeError, SyntaxError, MemoryError, RecursionError)

    prompt = data["prompt"]
    prompt = prompt.replace("SOURCE_LANGUAGE", "English")
    prompt = prompt.replace("TARGET_LANGUAGE", target_language)
    examples = "\n".join(
        f"TOXIC: {example['en_toxic_comment']} NON-TOXIC: {example['en_neutral_comment']}"
        for example in examples
    )
    prompt = prompt.replace("EXAMPLES", examples)
    response = get_completion(prompt)

    try:
        return ast.literal_eval(response)
    except parse_errors:
        pass

    # Repair pass: unescaped apostrophes inside single-quoted strings are the
    # usual reason the reply is not a valid literal. Escape existing double
    # quotes first, then turn the tuple delimiters into double quotes.
    response = response.replace('"', '\\"')
    response = response.replace("('", '("')
    response = response.replace("', '", '", "')
    response = response.replace("')", '")')
    # Drop the chatty preambles the model sometimes adds (same handling as
    # translate_from_ru).
    response = response.replace("Here are the translations:\n\n", "")
    response = response.replace("Sure, here are the translations:\n\n", "")

    try:
        # Was `ast.literal(...)`, which does not exist: the resulting
        # AttributeError was swallowed, so the repair could never succeed.
        return ast.literal_eval(response)
    except parse_errors:
        print(response)
        return response

In [26]:
# Translate the English samples: every target language gets its own,
# non-overlapping slice of 3 batches x 10 pairs, consumed in order via counter_en.
counter_en = 0
# Make sure the output directory exists before the first to_csv call.
os.makedirs("translated_data", exist_ok=True)
for target_language in tqdm(target_languages):
    translated_samples = []
    for i in tqdm(range(3)):
        print(f"Translating to {target_language}")
        translation = translate_from_eng(target_language, en_samples[counter_en:counter_en+10])
        counter_en += 10
        if not isinstance(translation, (list, tuple)):
            # translate_from_eng hands back the raw reply text when it cannot
            # be parsed; extending with a str would add single characters and
            # break the sample[1] lookup below, so skip the batch instead.
            print(f"Skipping unparsed batch for {target_language}")
            continue
        translated_samples.extend(translation)

    # Each parsed sample is indexed as (toxic, neutral).
    translated_dict = [{"toxic": sample[0], "neutral": sample[1]} for sample in translated_samples]
    translated_df = pd.DataFrame.from_dict(translated_dict)
    translated_df.to_csv(f"translated_data/en_to_{target_language}.tsv", sep="\t")

  0%|          | 0/7 [00:00<?, ?it/s]

Translating to Ukrainian




Translating to Ukrainian




Translating to Ukrainian


100%|██████████| 3/3 [01:08<00:00, 22.74s/it]
 14%|█▍        | 1/7 [01:08<06:49, 68.27s/it]

Translating to German




Translating to German




Translating to German


100%|██████████| 3/3 [00:39<00:00, 13.20s/it]
 29%|██▊       | 2/7 [01:47<04:17, 51.41s/it]

Translating to Spanish




Translating to Spanish




Translating to Spanish


100%|██████████| 3/3 [00:30<00:00, 10.27s/it]
 43%|████▎     | 3/7 [02:18<02:48, 42.00s/it]

Translating to Chinese




Translating to Chinese




Translating to Chinese


100%|██████████| 3/3 [00:43<00:00, 14.61s/it]
 57%|█████▋    | 4/7 [03:02<02:08, 42.73s/it]

Translating to Amharic




Translating to Amharic




Translating to Amharic


100%|██████████| 3/3 [02:53<00:00, 57.87s/it]
 71%|███████▏  | 5/7 [05:56<02:59, 89.94s/it]

Translating to Arabic




Translating to Arabic




Translating to Arabic


100%|██████████| 3/3 [01:06<00:00, 22.07s/it]
 86%|████████▌ | 6/7 [07:02<01:21, 81.88s/it]

Translating to Hindi




Translating to Hindi




Translating to Hindi


100%|██████████| 3/3 [05:13<00:00, 104.56s/it]
100%|██████████| 7/7 [12:16<00:00, 105.16s/it]


In [34]:
def translate_from_ru(target_language, examples):
    """Translate a batch of Russian toxic/neutral pairs into ``target_language``.

    The template in ``data["prompt"]`` has its ``SOURCE_LANGUAGE``,
    ``TARGET_LANGUAGE`` and ``EXAMPLES`` placeholders filled in and is sent to
    ``get_completion``. The reply is parsed as a Python literal (the caller
    treats it as a sequence of ``(toxic, neutral)`` pairs).

    Args:
        target_language: Language name substituted into the prompt.
        examples: Iterable of mappings with ``ru_toxic_comment`` and
            ``ru_neutral_comment`` keys.

    Returns:
        The parsed literal if the reply parses, either directly or after the
        repair below. Otherwise the repaired reply text is printed and
        returned as a plain ``str``, so callers should check the type.
    """
    # Everything ast.literal_eval is documented to raise on malformed input.
    parse_errors = (ValueError, TypeError, SyntaxError, MemoryError, RecursionError)

    prompt = data["prompt"]
    prompt = prompt.replace("SOURCE_LANGUAGE", "Russian")
    prompt = prompt.replace("TARGET_LANGUAGE", target_language)
    examples = "\n".join(
        f"TOXIC: {example['ru_toxic_comment']} NON-TOXIC: {example['ru_neutral_comment']}"
        for example in examples
    )
    prompt = prompt.replace("EXAMPLES", examples)
    response = get_completion(prompt)

    try:
        return ast.literal_eval(response)
    except parse_errors:
        pass

    # Repair pass: escape existing double quotes first, then turn the tuple
    # delimiters into double quotes so apostrophes inside the text no longer
    # terminate the strings; finally drop the preambles the model sometimes adds.
    response = response.replace('"', '\\"')
    response = response.replace("('", '("')
    response = response.replace("', '", '", "')
    response = response.replace("')", '")')
    response = response.replace("Here are the translations:\n\n", "")
    response = response.replace("Sure, here are the translations:\n\n", "")

    try:
        # Was `ast.literal(...)`, which does not exist: the resulting
        # AttributeError was swallowed, so the repair could never succeed.
        return ast.literal_eval(response)
    except parse_errors:
        print(response)
        return response

In [6]:
# Translate the Russian samples: every target language gets its own,
# non-overlapping slice of 2 batches x 10 pairs, consumed in order via counter_ru.
# Start at 0: 7 languages x 20 pairs use 140 samples, so the previous start
# offset of 20 ran past the end of a 140-row sample file and sent empty
# batches for the last language (assuming ru_samples.tsv holds 140 rows).
counter_ru = 0
# Make sure the output directory exists before the first to_csv call.
os.makedirs("translated_data", exist_ok=True)
for target_language in tqdm(target_languages):
    translated_samples = []
    for i in tqdm(range(2)):
        print(f"Translating to {target_language}")
        translation = translate_from_ru(target_language, ru_samples[counter_ru:counter_ru+10])
        counter_ru += 10
        if not isinstance(translation, (list, tuple)):
            # translate_from_ru hands back the raw reply text when it cannot
            # be parsed; extending with a str would add single characters and
            # break the sample[1] lookup below, so skip the batch instead.
            print(f"Skipping unparsed batch for {target_language}")
            continue
        translated_samples.extend(translation)

    # Each parsed sample is indexed as (toxic, neutral).
    translated_dict = [{"toxic": sample[0], "neutral": sample[1]} for sample in translated_samples]
    translated_df = pd.DataFrame.from_dict(translated_dict)
    # Path fixed from the misspelled "reanslated_data" so the Russian output
    # lands next to the English output.
    translated_df.to_csv(f"translated_data/ru_to_{target_language}.tsv", sep="\t")