# Translate with Web DeepL

We also tried translating the original instruction dataset, which consists of English data, with the DeepL web translator.
We used Colab to run this code.

### Download Chrome Driver

Using Selenium in Colab raises a PermissionError. Therefore, we need to install ChromeDriver before running the Selenium code.

In [None]:
# Python packages used by the cells below
!pip install selenium
!pip install datasets
# Install the system ChromeDriver and copy it to a directory on PATH
!apt-get update
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin

### Define functions

Several functions are defined before starting the translation.
These functions are almost the same as the functions in `deepl_translate_api.py`.

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from datasets import load_dataset
from tqdm import tqdm
import json
import time
import sys

def dataset_load(data_type, data_path):
    """Load an instruction dataset and report its column names.

    Args:
        data_type: ``"hf"`` to load the ``train`` split through
            ``load_dataset``, or ``"json"`` to read a local JSON file.
        data_path: Value passed to ``load_dataset`` for ``"hf"``, or the
            path of the JSON file for ``"json"``.

    Returns:
        A ``(dataset, columns)`` tuple. ``dataset`` is the loaded records
        (a list of rows for ``"hf"``; whatever the file contains for
        ``"json"`` -- presumably a list of objects). ``columns`` is the
        list of keys of the first record, or an empty list when there are
        no records.

    Raises:
        ValueError: If ``data_type`` is neither ``"hf"`` nor ``"json"``.
    """
    if data_type == "hf":
        dataset = list(load_dataset(data_path, split="train"))
    elif data_type == "json":
        # Explicit encoding so non-ASCII text is read the same on every platform.
        with open(data_path, "r", encoding="utf-8") as f:
            dataset = json.load(f)
    else:
        raise ValueError("The '--data_type' should be 'hf' or 'json'!!")

    # An empty dataset has no first record to take the column names from.
    columns = list(dataset[0].keys()) if dataset else []

    return dataset, columns

def type_cls(input_data):
    """Classify a text as containing code, math, or neither.

    Returns ``"code"`` when the text contains a triple-backtick fence.
    Otherwise returns ``"math"`` when splitting on ``$`` yields an odd
    number of pieces and at least five of them (i.e. an even number of
    ``$`` signs, four or more). Returns ``None`` in every other case.
    """
    if "```" in input_data:
        return "code"

    # Splitting on "$" always yields one more piece than there are "$" signs.
    piece_count = input_data.count("$") + 1
    looks_like_math = piece_count >= 5 and piece_count % 2 == 1
    return "math" if looks_like_math else None

# For each content type: [delimiter wrapping the protected spans,
# placeholder sent to the translator in place of each span].
type_list = {"math": ["$", "ABC"], "code": ["```", "BLOCKED_CODE"]}


def data_process(input_data, data_type):
    """Translate text while keeping delimited math/code spans untranslated.

    Every span enclosed by the delimiter for ``data_type`` is swapped for
    a placeholder, the masked text is translated with ``deepl_web``, and
    the placeholders found in the translation are replaced, in order, by
    the original spans.

    Args:
        input_data: Source text containing delimited spans.
        data_type: Key of ``type_list`` (``"math"`` or ``"code"``).

    Returns:
        The translated text with the protected spans put back.
    """
    delimiter, placeholder = type_list[data_type]
    segments = input_data.split(delimiter)

    # Odd-indexed segments sit between a pair of delimiters: protect them.
    blocked_list = []
    for idx in range(1, len(segments), 2):
        blocked_list.append(delimiter + segments[idx] + delimiter)
        segments[idx] = placeholder

    translated = deepl_web(" ".join(segments))

    # Restore by substring rather than by whitespace-separated token, so a
    # placeholder with punctuation or a particle attached (e.g. "ABC.") is
    # still recognised, and the translation's own whitespace is preserved.
    pieces = translated.split(placeholder)
    restored = [pieces[0]]
    for position, piece in enumerate(pieces[1:]):
        if position < len(blocked_list):
            restored.append(blocked_list[position])
        else:
            # More placeholders came back than spans were masked: keep the
            # placeholder text instead of failing on an empty list.
            restored.append(placeholder)
        restored.append(piece)

    # Spans whose placeholder did not survive translation are not re-inserted.
    return "".join(restored)

# Locators for the DeepL web translator page: the source text area, the
# translated paragraph, and the button that clears the source text.
deepl_info = {
    "input_css": "div.relative.flex-1 d-textarea",
    "translation_xpath": '//*[@id="headlessui-tabs-panel-7"]/div/div[1]/section/div/div[2]/div[3]/section/div[1]/d-textarea/div/p',
    "button_css": "#translator-source-clear-button",
}


def deepl_web(input_text, max_retries=3, wait_seconds=3):
    """Translate ``input_text`` through the DeepL page open in ``driver``.

    The text is typed into the source area, the function waits
    ``wait_seconds`` seconds, reads the translated paragraph, and clears
    the source area so the next call starts from an empty input.

    Args:
        input_text: Text to send to the translator.
        max_retries: How many additional attempts to make when the
            translated text cannot be read.
        wait_seconds: Seconds to wait for the translation to appear.

    Returns:
        The text of the translation element.

    Raises:
        Exception: The last error raised while reading the translation,
            once every attempt has failed.
    """
    attempts = max(1, max_retries + 1)
    last_error = None

    for attempt in range(attempts):
        # Get the input area and send the text
        input_area = driver.find_element(By.CSS_SELECTOR, deepl_info["input_css"])
        input_area.send_keys(input_text)

        # Wait for the translation to appear on the web page
        time.sleep(wait_seconds)

        content = None
        try:
            translation_text = driver.find_element(By.XPATH, deepl_info["translation_xpath"])
            content = translation_text.text
        except Exception as e:
            last_error = e

        # Clear (X) button: reset the source area after success and failure alike
        button = driver.find_element(By.CSS_SELECTOR, deepl_info["button_css"])
        button.click()

        if content is not None:
            return content

        if attempt < attempts - 1:
            print("Retrying due to an error.")
            print(last_error)

    raise last_error

### Run translation

All done!
You can run translation with the code below.

In [None]:
# Start a headless Selenium Chrome driver
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome(options=options)

# File that accumulates the translated records
output_path = "data_pipleine/ko-open-wyvern.json"

try:
    deepl_url = 'https://www.deepl.com/ko/translator'
    driver.get(deepl_url)

    dataset, columns = dataset_load("hf", "StudentLLM/Open-Wyvern-74k")

    # Load previously saved records once; start from scratch when the
    # output file does not exist yet.
    try:
        with open(output_path, "r", encoding="utf-8") as f:
            ko_wyvern_dataset = json.load(f)
    except FileNotFoundError:
        ko_wyvern_dataset = []

    for row in tqdm(dataset):
        translated_text = {}
        for column in columns:
            input_data = row[column]

            # The "category" column and empty values are copied untranslated.
            if column == "category" or not input_data:
                translated_text[column] = input_data
                continue

            data_type = type_cls(input_data=input_data)

            if data_type:
                # Text with code/math spans: mask them before translating.
                translated_text[column] = data_process(input_data=input_data, data_type=data_type)
            else:
                translated_text[column] = deepl_web(input_text=input_data)

        ko_wyvern_dataset.append(translated_text)

        # Rewrite the file after every record so finished work is on disk
        # if a later record fails.
        with open(output_path, "w", encoding="utf-8") as json_file:
            json.dump(ko_wyvern_dataset, json_file, ensure_ascii=False, indent=4)

    print("Translation is all done!!")
finally:
    # quit() ends the whole browser session, and runs even when
    # translation stops with an error.
    driver.quit()