In [None]:
# Import libraries
import os
import json
import time
from transformers import MarianMTModel, MarianTokenizer
import torch
import shutil
from google.colab import drive

In [None]:
!pip install sacremoses

In [None]:
# Mount Google Drive into the Colab file system at /content/drive.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Checkpoint used for Polish -> English translation.
model_name = "Helsinki-NLP/opus-mt-pl-en"

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the tokenizer and the translation model for the checkpoint above.
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

print("device:", device)
# Move the model weights onto the selected device.
model.to(device)

In [None]:
def get_shorter_json_file(subfolder_path):
    """
    Return the name of the JSON file with the shortest file name in a folder.

    Args:
        subfolder_path: Directory to scan (its direct entries only).

    Returns:
        The bare file name (not the full path) of the entry ending in
        ``.json`` whose name is shortest, or ``None`` when the folder has no
        such entry. When several names share the shortest length, the
        alphabetically first one is returned.
    """
    json_files = [name for name in os.listdir(subfolder_path) if name.endswith('.json')]
    # os.listdir() yields entries in arbitrary order, so break length ties by
    # name to make the pick reproducible across runs and file systems.
    return min(json_files, key=lambda name: (len(name), name), default=None)

def translate_text(text):
    """
    Translate a given text from Polish to English.

    Relies on the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        text: The text to translate.

    Returns:
        The translated text as a string. Empty or whitespace-only strings are
        returned unchanged without invoking the model.

    Note:
        The input is tokenized with ``truncation=True``, so input exceeding
        the tokenizer's maximum length is cut off before translation.
    """
    # Nothing to translate: skip tokenization and generation for blank input.
    if isinstance(text, str) and not text.strip():
        return text

    inputs = tokenizer(text, return_tensors="pt", truncation=True).to(device)
    # Inference only, so gradient tracking is disabled.
    with torch.no_grad():
        outputs = model.generate(**inputs, early_stopping=True)
    # Only the first generated sequence is returned, so decode just that one.
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

def translate_folder(folder_path):
    """
    Translate the messages of one JSON file in every subfolder of a folder.

    For each immediate subdirectory of ``folder_path``, the JSON file chosen
    by ``get_shorter_json_file`` is loaded. Every entry of its ``"messages"``
    list that has a ``"content"`` key receives a ``"content_translated"`` key
    holding the result of ``translate_text``. The updated data is written into
    the same subdirectory as ``<subfolder>_translated.json``.

    Entries of ``folder_path`` that are not directories, and subdirectories
    without any JSON file, are skipped.

    Args:
        folder_path: Directory whose subdirectories are processed.

    Returns:
        None. Per-file progress and the total elapsed time are printed.
    """
    start_time = time.time()

    folder_names = os.listdir(folder_path)
    total = len(folder_names)
    # The counter covers every directory entry, including skipped ones, so it
    # matches the "(idx/total)" denominator.
    for idx, subfolder in enumerate(folder_names, start=1):
        subfolder_path = os.path.join(folder_path, subfolder)
        if not os.path.isdir(subfolder_path):
            continue

        # Get the JSON file with the shorter name
        shorter_json_file = get_shorter_json_file(subfolder_path)
        if not shorter_json_file:
            continue

        file_path = os.path.join(subfolder_path, shorter_json_file)
        output_path = os.path.join(subfolder_path, f"{subfolder}_translated.json")

        # Read the input JSON file
        with open(file_path, "r", encoding="utf-8") as file:
            data = json.load(file)

        messages = data.get("messages", [])
        print(f"({idx}/{total}) Processing file: {shorter_json_file} with {len(messages)} messages")

        # Translate messages in place; the original "content" is kept.
        for message in messages:
            if "content" in message:
                message["content_translated"] = translate_text(message["content"])

        # Save the translated messages to a new JSON file
        with open(output_path, "w", encoding="utf-8") as outfile:
            json.dump(data, outfile, indent=4, ensure_ascii=True)

        print(f"Translated file saved as '{subfolder}_translated.json'")

    # Report the total once, after every subfolder has been handled, rather
    # than after each individual subfolder.
    execution_time = time.time() - start_time
    print(f"\nTranslation of all files completed in {execution_time:.2f} seconds.")

In [None]:
def translate_folder_drive(source_folder, destination_folder):
  """
  Copy a folder to local storage, translate it there, and copy the result out.

  The source folder is copied to ``/content/for_translation``, processed with
  ``translate_folder``, and the processed copy is then copied to
  ``destination_folder``.

  Args:
      source_folder: Folder to translate. If it does not exist, a message is
          printed and nothing else happens.
      destination_folder: Where the translated copy is written. It must not
          exist yet.

  Raises:
      FileExistsError: If ``destination_folder`` already exists.
  """
  staging_folder = "/content/for_translation"

  # Check if the source folder exists
  if not os.path.exists(source_folder):
    print(f"Source folder '{source_folder}' not found in your Google Drive.")
    # There is nothing to translate, so stop here instead of continuing
    # with a missing (or stale) working copy.
    return

  # shutil.copytree() refuses to copy into an existing directory; detect that
  # before running the translation rather than after it.
  if os.path.exists(destination_folder):
    raise FileExistsError(f"Destination folder '{destination_folder}' already exists.")

  # Remove a working copy left behind by a previous run so that the copy below
  # succeeds and no stale files are carried over.
  if os.path.exists(staging_folder):
    shutil.rmtree(staging_folder)
  shutil.copytree(source_folder, staging_folder)

  translate_folder(staging_folder)
  shutil.copytree(staging_folder, destination_folder)

In [None]:
# Placeholder values: set these to the folder to translate and the folder the
# translated copy should be written to before running this cell.
source_folder = ''
destination_folder = ''
translate_folder_drive(source_folder, destination_folder)