In [1]:
import os
import re
import html
import time
import random
import json
import requests
import math

from tqdm.auto import tqdm
from bs4 import BeautifulSoup
from googletrans import Translator

In [2]:
dataset_dir = "../datasets"
kalapa_dataset_dir = os.path.join(dataset_dir, "KALAPA_ByteBattles_2023_MEDICAL_Set1")
processed_save_dir = os.path.join(kalapa_dataset_dir, "translated")
subsection_save_dir = os.path.join(kalapa_dataset_dir, "translated_subsections")

os.makedirs(subsection_save_dir, exist_ok=True)

In [3]:
# def name_formatter(text):
#     # format the text so that we can use it as filename by replacing all special characters with _
#     text = text.lower()
#     text = re.sub(r"\s+", "_", text)
#     text = re.sub(r"[^a-zA-Z0-9_]", "", text)
#     return text
from pyvi import ViUtils
ViUtils.remove_accents(u"Trường đại học bách khoa hà nội")

def get_document_name_with_accent(document):
    name = document["name"]
    title = document["title"]
    title = title.replace("(", "").replace(")", "")
    
    name_no_accent = name.replace("-", " ").lower()
    title_no_accent = ViUtils.remove_accents(title).decode('ASCII').lower()
    # print(name_no_accent)
    # print(title_no_accent)
    
    # find start index of title in name
    start_index = title_no_accent.find(name_no_accent)
    if start_index == -1:
        print(f"Cannot find title {title} in name {name}")
        return name_no_accent
    else:
        return title[start_index:start_index+len(name_no_accent)]

def get_subsection_sample_from_document(document):
    subsec_2_save = []

    name = document["name"]
    title = document["title"]
    abstract = document["abstract"]
    category = document["category"]
    subsections = document["subsections"]
    document_name_with_accent = get_document_name_with_accent(document)
    for i, subsection in enumerate(subsections):
        subsection_name = subsection['subsection_name']
        subsection_content = subsection['subsection_content']
        
        subsection_title = document_name_with_accent + ". " + subsection_name
        subsection_data = subsection_title + "\n" + subsection_content
        
        subsec = {
            "document_name": name,
            "document_name_accent": document_name_with_accent,
            "document_title": title,
            "document_category": category,
            "subsection_name": subsection_name,
            "subsection_content": subsection_content,
            
            "subsection_name": f"{name}_{i}_{subsection_name}",
            "subsection_title": subsection_title,
            "subsection_data": subsection_data,
        }
        # print(json.dumps(subsec, indent=4, ensure_ascii=False))
        subsec_2_save.append(subsec)
    
    return subsec_2_save

In [4]:
filenames = os.listdir(processed_save_dir)
filenames = sorted(filenames)
num_subsections = 0

for i, filename in enumerate(filenames):
    processed_filepath = os.path.join(processed_save_dir, filename)
    with open(
        processed_filepath, "r", encoding="utf-8"
    ) as f:
        document = json.load(f)
    
    subsec_2_save = get_subsection_sample_from_document(document)
    for i, subsection in enumerate(subsec_2_save):
        subsection_save_path = os.path.join(subsection_save_dir, f"{subsection['document_name'].split('.')[0]}_{i}.json")
        with open(subsection_save_path, "w", encoding="utf-8") as f:
            json.dump(subsection, f, indent=4, ensure_ascii=False)
    num_subsections += len(subsec_2_save)
print(f"num_subsections: {num_subsections}")

Cannot find title Alzheimer's disease: Causes, symptoms, treatment and prevention in name alzheimer.json
Cannot find title What disease is indigestion? Causes, symptoms and prevention in name an-khong-tieu.json
Cannot find title Anal abscess: Causes, symptoms and prevention in name ap-xe-hau-mon.json
Cannot find title Lung abscess: Causes, symptoms and treatments in name ap-xe-phoi.json
Cannot find title Diabetic feet: Causes, symptoms and diagnosis in name ban-chan-dai-thao-duong.json
Cannot find title Postpartum hemorrhage: Causes, symptoms and prevention in name bang-huyet-sau-sinh.json
Cannot find title Overactive bladder: Causes, symptoms, diagnosis and treatment in name bang-quang-tang-hoat.json
Cannot find title Postpartum dyshidrosis: Causes, signs, treatment and prevention in name be-san-dich.json
Cannot find title Diphtheria: unpredictable complications and effective prevention in name benh-bach-hau.json
Cannot find title Basedow's disease: Causes, causes and diagnosis in nam