In [None]:
import os
import itertools

In [None]:
from selenium import webdriver
from selenium.common.exceptions import *

from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

from urllib import parse

import bs4

---

In [None]:
# Root URL of the translator web UI; translate() joins the language pair and
# the percent-encoded query onto it as a URL fragment.
BASE_URL = 'https://translate.google.cz/'
# Relative directory that load_data() reads "<prefix>-<mode>.txt" files from.
DATA_DIR = 'data/'

In [None]:
def load_data(prefix='src', mode='train', data_dir=None):
    """Read one dataset split into a list of whitespace-stripped lines.

    The file is looked up as ``<data_dir>/<prefix>-<mode>.txt``.

    Args:
        prefix: File-name prefix, e.g. ``'src'``.
        mode: Split name used in the file name, e.g. ``'train'``.
        data_dir: Directory containing the file. ``None`` (the default)
            falls back to the module-level ``DATA_DIR``.

    Returns:
        list[str]: One entry per line of the file, with leading/trailing
        whitespace removed. Blank lines are kept as ``''`` so that line
        positions are preserved.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    base_dir = DATA_DIR if data_dir is None else data_dir
    data_fpath = os.path.join(base_dir, f"{prefix}-{mode}.txt")

    # Decode explicitly instead of relying on the platform default, which is
    # not UTF-8 everywhere (e.g. Windows code pages) and would mangle
    # non-ASCII text. NOTE(review): assumes the data files are UTF-8.
    with open(data_fpath, 'r', encoding='utf-8') as f:
        # Iterating the handle streams lines; no need for readlines().
        return [line.strip() for line in f]

In [None]:
# Default maximum number of characters (newline separators included) that
# iter_chunks() packs into a single chunk.
CHARACTER_LIMIT = 4500

def iter_chunks(data: list, limit: int = CHARACTER_LIMIT):
    """Group consecutive lines into newline-joined chunks of bounded size.

    Lines are never split or reordered, so joining all yielded chunks with
    ``"\\n"`` reproduces ``"\\n".join(data)``.

    Args:
        data: Lines (strings without trailing newlines) to pack.
        limit: Maximum length of a yielded chunk, counting the ``"\\n"``
            separators between lines. Defaults to ``CHARACTER_LIMIT``.

    Yields:
        str: The next chunk. Every chunk has ``len(chunk) <= limit`` except
        when a single line is itself longer than ``limit``; such a line is
        yielded alone rather than being truncated. Nothing is yielded for
        empty ``data``.
    """
    chunk = []
    chunk_len = 0  # length of "\n".join(chunk)

    for line in data:
        # A line joined onto a non-empty chunk also costs one separator.
        added = len(line) + (1 if chunk else 0)

        # Flush *before* appending so the chunk never overshoots the limit.
        if chunk and chunk_len + added > limit:
            yield "\n".join(chunk)

            chunk = []
            chunk_len = 0
            added = len(line)

        chunk.append(line)
        chunk_len += added

    # Only emit the tail if there is one; avoids a spurious empty chunk.
    if chunk:
        yield "\n".join(chunk)

In [None]:
def translate(query: str, from_lang='cs', to_lang='en') -> list:
    """Translate ``query`` through the web UI and return the result lines.

    Navigates the module-level ``driver`` to ``BASE_URL`` with a fragment of
    the form ``#<from_lang>/<to_lang>/<percent-encoded query>`` and reads the
    text of the element with id ``result_box``.

    Args:
        query: Text to translate; may contain several newline-separated lines.
        from_lang: Source language code placed in the URL fragment.
        to_lang: Target language code placed in the URL fragment.

    Returns:
        list: The text of the result element split on ``"\\n"``.

    Raises:
        selenium.common.exceptions.NoSuchElementException: If the result
            element is not found.
    """
    # NOTE(review): there is no explicit wait for the translation to finish
    # rendering; only the driver's implicit wait applies to the lookup below.
    fragment = f"#{from_lang}/{to_lang}/" + parse.quote(query)
    driver.get(parse.urljoin(BASE_URL, fragment))

    # `find_element_by_id` was deprecated in Selenium 4 and later removed;
    # the `By` locator form works on both Selenium 3 and 4.
    result_box = driver.find_element(By.ID, 'result_box')

    return result_box.text.split(sep='\n')

In [None]:
def swap_languages():
    """Click the element with id ``gt-swap`` once it becomes clickable.

    Presumably this is the UI control that swaps the source and target
    languages (inferred from the function name; verify against the page).
    Uses the module-level ``driver``.

    Raises:
        selenium.common.exceptions.TimeoutException: If the element is not
            clickable within 3 seconds.
    """
    # `until` returns the element produced by the expected condition, so
    # click that one directly. This drops the removed `find_element_by_id`
    # call and the gap between waiting and re-locating the element.
    swap_button = WebDriverWait(driver, timeout=3).until(
        EC.element_to_be_clickable((By.ID, 'gt-swap'))
    )
    swap_button.click()

---

In [None]:
# Start a Chrome session. `driver` is the module-level handle that
# translate() and swap_languages() use.
driver = webdriver.Chrome()
# Open the translator landing page once before issuing any queries.
driver.get(BASE_URL)

# Implicit wait: element lookups keep polling for up to 0.25 s before failing.
driver.implicitly_wait(0.25)

---

In [None]:
# Translate every split of the `prefix` dataset from English to Czech and
# write the result to "<DATA_DIR>/<prefix>-<mode>-cz.txt", one line per line.
prefix = 'src'

for mode in ['train', 'test', 'val']:
    data = load_data(prefix, mode)

    # Lazy generator: each chunk is sent to the browser only when the writer
    # below asks for it. Every item is the list of lines for one chunk.
    translations = (
        translate(chunk, from_lang='en', to_lang='cs')
        for chunk in iter_chunks(data)
    )

    out_fpath = os.path.join(DATA_DIR, f'{prefix}-{mode}-cz.txt')

    # Explicit UTF-8 so Czech diacritics are written identically on every
    # platform; plain 'w' because the file is never read back here.
    with open(out_fpath, 'w', encoding='utf-8') as out:
        # Flatten the per-chunk lists and terminate every line, so the last
        # line of one chunk is not fused with the first line of the next.
        for line in itertools.chain.from_iterable(translations):
            out.write(line + '\n')

---