In [1]:
import os
import re
import html
import time
import random
import json
import requests
import math

from tqdm.auto import tqdm
from bs4 import BeautifulSoup
from googletrans import Translator

In [2]:
def translate_text(texts, dest="en", batch_size=16, delay_per_batch=2, translator=None):
    """Translate a single string or a list of strings, batch by batch.

    Args:
        texts: One string, or a sequence of strings, to translate.
        dest: Target language code forwarded to ``translator.translate``.
        batch_size: Number of texts sent per ``translate`` call; must be >= 1.
        delay_per_batch: Inclusive upper bound, in whole seconds, of the random
            pause taken after every attempt (successful or not).
        translator: Optional object exposing ``translate(batch, dest=...)`` that
            returns one result with a ``.text`` attribute per input text. A new
            ``Translator()`` is created when omitted.

    Returns:
        The translated string when ``texts`` was a string; otherwise a list with
        exactly one HTML-unescaped translation per input, in input order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        RuntimeError: If a batch still fails after all attempts. Previously such
            a batch was dropped silently, which left the returned list shorter
            than (and misaligned with) the input.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    if translator is None:
        translator = Translator()

    is_str = isinstance(texts, str)
    if is_str:
        texts = [texts]

    max_tries = 5
    total_batches = math.ceil(len(texts) / batch_size)

    results = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i : i + batch_size]
        batch_number = i // batch_size + 1  # 1-based, so logs read "3/3" not "2/3"
        last_error = None

        for attempt in range(1, max_tries + 1):
            try:
                translated = list(translator.translate(batch, dest=dest))
                # Guard the one-result-per-input invariant the callers rely on.
                if len(translated) != len(batch):
                    raise ValueError(
                        f"expected {len(batch)} translations, got {len(translated)}"
                    )
            except Exception as e:
                last_error = e
                print(e)
                if attempt < max_tries:
                    print(f"Retrying batch {batch_number}/{total_batches}")
                time.sleep(random.randint(0, delay_per_batch))
            else:
                results.extend(translated)
                # Random pause after a success too, to throttle consecutive calls.
                time.sleep(random.randint(0, delay_per_batch))
                break
        else:
            # Every attempt failed: fail loudly instead of returning a short list.
            raise RuntimeError(
                f"Translation failed for batch {batch_number}/{total_batches} "
                f"after {max_tries} attempts"
            ) from last_error

    results = [html.unescape(r.text) for r in results]
    return results[0] if is_str else results

In [3]:
# Vietnamese category labels paired with their English counterparts.
# The ``(None, None)`` entry maps a missing category to a missing category.
# NOTE(review): 'Obstetrics - Gynecology ' carries a trailing space; it is kept
# as-is here — confirm whether anything downstream relies on the exact string.
_category_pairs = [
    ('Vú - Nhũ', 'Breast - Breast'),
    ('Nhi - Sơ sinh', 'Pediatrics - Newborn'),
    ('Hỗ trợ sinh sản IVF', 'IVF Reproductive Support'),
    ('Ung bướu', 'Oncology'),
    ('Tâm lý', 'Psychological'),
    ('Tiết niệu', 'Urology'),
    ('Cơ xương khớp', 'Musculoskeletal'),
    ('Nội tiết', 'Endocrine'),
    ('Tim mạch', 'Heart vascular'),
    ('Mắt', 'Eye'),
    ('COVID-19', 'COVID-19'),
    ('Da liễu', 'Dermatology'),
    ('Tiêu hóa - Gan mật', 'Gastrointestinal - Hepatobiliary'),
    ('Truyền nhiễm', 'Infectious'),
    ('Chẩn đoán hình ảnh', 'Imaging'),
    ('Sức khỏe tổng quát', 'General health'),
    ('Sản - Phụ khoa', 'Obstetrics - Gynecology '),
    ('Thần kinh', 'Neurological'),
    ('Hô hấp', 'Respiratory'),
    ('Tai mũi họng', 'Ear, nose and throat'),
    (None, None),
    ('Xét nghiệm', 'Testing'),
    ('Nam Học', 'Andrology'),
]

all_categories = [vi_label for vi_label, _ in _category_pairs]
all_en_categories = [en_label for _, en_label in _category_pairs]
assert len(all_categories) == len(all_en_categories)

# Lookup tables in both directions, in the same order as the lists above.
en_category_to_vi_category = {
    en_label: vi_label for en_label, vi_label in zip(all_en_categories, all_categories)
}
vi_category_to_en_category = {
    vi_label: en_label for vi_label, en_label in zip(all_categories, all_en_categories)
}

print(json.dumps(en_category_to_vi_category, indent=4, ensure_ascii=False))

{
    "Breast - Breast": "Vú - Nhũ",
    "Pediatrics - Newborn": "Nhi - Sơ sinh",
    "IVF Reproductive Support": "Hỗ trợ sinh sản IVF",
    "Oncology": "Ung bướu",
    "Psychological": "Tâm lý",
    "Urology": "Tiết niệu",
    "Musculoskeletal": "Cơ xương khớp",
    "Endocrine": "Nội tiết",
    "Heart vascular": "Tim mạch",
    "Eye": "Mắt",
    "COVID-19": "COVID-19",
    "Dermatology": "Da liễu",
    "Gastrointestinal - Hepatobiliary": "Tiêu hóa - Gan mật",
    "Infectious": "Truyền nhiễm",
    "Imaging": "Chẩn đoán hình ảnh",
    "General health": "Sức khỏe tổng quát",
    "Obstetrics - Gynecology ": "Sản - Phụ khoa",
    "Neurological": "Thần kinh",
    "Respiratory": "Hô hấp",
    "Ear, nose and throat": "Tai mũi họng",
    "null": null,
    "Testing": "Xét nghiệm",
    "Andrology": "Nam Học"
}


## Playground

In [4]:
# Dataset locations, expressed relative to the notebook's working directory.
dataset_dir = "../datasets"
kalapa_dataset_dir = os.path.join(dataset_dir, "KALAPA_ByteBattles_2023_MEDICAL_Set1")

# "processed" holds the input documents; "translated" receives the output.
processed_save_dir, translated_save_dir = (
    os.path.join(kalapa_dataset_dir, leaf) for leaf in ("processed", "translated")
)

# Only the output folder is created here; an existing folder is not an error.
os.makedirs(translated_save_dir, exist_ok=True)

In [5]:
def translate_document(document: dict):
    """Translate one document dict and return its translated counterpart.

    Reads ``title``, ``abstract``, ``subsections``, ``category`` and ``link``
    from ``document``. Title and abstract are translated individually; each
    subsection's name and content are translated together in one call.

    Returns a dict with:
        title, abstract: the translated strings.
        category: the entry of ``vi_category_to_en_category`` for the
            document's category.
        link: copied through unchanged.
        subsections: one dict per input subsection holding the translated
            ``subsection_name`` and ``subsection_content`` plus
            ``subsection_string`` (name, newline, content).
        content: title, abstract and every ``subsection_string`` joined by
            blank lines.
    """
    source_title = document["title"]
    source_abstract = document["abstract"]
    source_sections = document["subsections"]

    en_title = translate_text(source_title)
    en_abstract = translate_text(source_abstract)

    en_sections = []
    for section in source_sections:
        # Name and content go out as a single two-item request.
        name_and_content = [section["subsection_name"], section["subsection_content"]]
        en_name, en_body = translate_text(name_and_content)
        en_sections.append(
            {
                "subsection_name": en_name,
                "subsection_content": en_body,
                "subsection_string": en_name + "\n" + en_body,
            }
        )

    # Full text: title, abstract, then each subsection, separated by blank lines.
    pieces = [en_title, en_abstract]
    pieces.extend(entry["subsection_string"] for entry in en_sections)
    full_text = "\n\n".join(pieces)

    return {
        "title": en_title,
        "category": vi_category_to_en_category[document["category"]],
        "link": document["link"],
        "abstract": en_abstract,
        "content": full_text,
        "subsections": en_sections,
    }

In [6]:
# Translate every file in the processed folder and save each result as JSON.
# A failure on one document is logged and its index recorded in
# ``error_indices``; the loop then moves on to the next document.
document_names = sorted(os.listdir(processed_save_dir))
error_indices = []
for i in tqdm(range(0, len(document_names)), desc="Processing documents"):
    document_name = document_names[i]
    try:
        document_path = os.path.join(processed_save_dir, document_name)
        with open(document_path, "r", encoding="utf-8") as f:
            document = json.load(f)
        translated_document = translate_document(document)
        translated_document["name"] = document_name

        translated_save_path = os.path.join(translated_save_dir, document_name + ".json")
        # ensure_ascii=False emits raw non-ASCII characters, so the file must be
        # opened as UTF-8 explicitly; the platform default encoding may not be
        # able to represent them and would not match the UTF-8 reader above.
        with open(translated_save_path, "w", encoding="utf-8") as f:
            json.dump(translated_document, f, indent=4, ensure_ascii=False)
    except Exception as e:
        print(e)
        print(f"Error at index {i} with document name {document_name}")
        error_indices.append(i)

Processing documents:   0%|          | 0/603 [00:00<?, ?it/s]