In [None]:
import json
import os
import re
from pathlib import Path
from typing import Optional

import pinyin
import requests
from bs4 import BeautifulSoup
from docx import Document
from tqdm import tqdm

### Load Data

In [23]:
def get_files_name(folder_path: str = "../data/raw") -> list:
    """List the ``.docx`` files located directly inside ``folder_path``.

    Args:
        folder_path: Directory to scan (sub-directories are not visited).

    Returns:
        File names (not full paths) ending in ``.docx``, in the order
        ``os.listdir`` yields them.
    """
    docx_names = []
    for entry in os.listdir(folder_path):
        if not entry.endswith(".docx"):
            continue
        # Directories whose name happens to end in ".docx" are skipped.
        if os.path.isfile(os.path.join(folder_path, entry)):
            docx_names.append(entry)
    return docx_names
    
def read_docx(file_path: str) -> str:
    """Return the text of every paragraph of a ``.docx`` file as one string.

    Args:
        file_path: Path of the document to open.

    Returns:
        The paragraph texts joined with a single space.
    """
    paragraphs = Document(file_path).paragraphs
    return " ".join(paragraph.text for paragraph in paragraphs)

def save_progress(vocab, path):
    """Write ``vocab`` to ``path`` as indented UTF-8 JSON and print a confirmation.

    Args:
        vocab: JSON-serialisable vocabulary (dict or list).
        path: Destination file (``str`` or ``Path``). Missing parent
            directories are created.

    Raises:
        TypeError: If ``vocab`` is not JSON-serialisable. The destination file
            is left untouched in that case.
    """
    # Serialise first: a serialisation error is raised before the destination
    # is opened, so previously saved progress is never truncated.
    payload = json.dumps(vocab, indent=4, ensure_ascii=False)
    # A first run may not have the output directory yet; failing here would
    # throw away everything that was just scraped.
    Path(path).parent.mkdir(parents=True, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        f.write(payload)
    print(f"Progress saved to {path}")

### Process the Data

In [None]:
def extract_vocab(
    text: list,
    vocab: dict,
    max_length: int = 12,
    excluded_substrings: tuple = ("2024",),
) -> dict:
    """Add the cleaned tokens of ``text`` to ``vocab`` as empty entries.

    Each token is reduced to its alphanumeric characters (which also drops any
    whitespace), then added with an empty dict as value when it is new,
    non-empty, shorter than ``max_length`` and free of excluded substrings.

    Args:
        text: Iterable of raw tokens (the caller passes a list of strings).
        vocab: Existing vocabulary; it is updated in place and returned.
        max_length: Exclusive upper bound on the cleaned token length.
        excluded_substrings: Tokens containing any of these are skipped.

    Returns:
        The same ``vocab`` object, with the new words added.
    """
    for raw_word in text:
        word = "".join(ch for ch in raw_word if ch.isalnum())
        if word in vocab or not 0 < len(word) < max_length:
            continue
        if any(fragment in word for fragment in excluded_substrings):
            continue
        vocab[word] = {}
    return vocab

def vocab_list(vocab: dict, file_path: str, folder_path: str = "../data/raw") -> dict:
    """Extend ``vocab`` with the words found in one ``.docx`` file.

    Args:
        vocab: Vocabulary dict to extend (passed through to ``extract_vocab``).
        file_path: File name of the document, relative to ``folder_path``.
        folder_path: Directory containing the document.

    Returns:
        The vocabulary returned by ``extract_vocab``.
    """
    text = read_docx(os.path.join(folder_path, file_path))
    # Splitting on a single space can yield empty tokens; extract_vocab drops them.
    return extract_vocab(text.split(" "), vocab)

def build_vocab_list(vocab: Optional[dict] = None, folder_path: str = "data/raw") -> dict:
    """Build the vocabulary from every ``.docx`` file in ``folder_path``.

    Files are processed in sorted name order.

    Args:
        vocab: Existing vocabulary to extend; a new empty dict is used when
            omitted or ``None``.
        folder_path: Directory containing the documents.
            NOTE(review): this default ("data/raw") differs from the
            "../data/raw" default of the other helpers; confirm which working
            directory is intended.

    Returns:
        The vocabulary dict, extended with the words of every document.
    """
    # Without this guard the default None reaches extract_vocab, where
    # ``word not in vocab`` raises TypeError.
    if vocab is None:
        vocab = {}

    for file_name in sorted(get_files_name(folder_path)):
        vocab = vocab_list(vocab=vocab, file_path=file_name, folder_path=folder_path)

    return vocab

### Scraping

In [None]:
def generate_encoded_url(word: str) -> str:
    """Build the dictionary search URL for ``word``.

    Every character is written as the percent-encoded form of its decimal HTML
    entity (``%26%23`` is ``&#`` and ``%3B`` is ``;``) and appended to the
    ``mot=`` query parameter.
    """
    encoded_chars = []
    for char in word:
        encoded_chars.append(f"%26%23{ord(char)}%3B")
    return "https://chine.in/mandarin/dictionnaire/index.php?mot=" + "".join(encoded_chars)


def search_mandarin_dictionary_requests(query: str, timeout: float = 3) -> "bs4.element.Tag":
    """Query the online dictionary for ``query`` and return its results element.

    Args:
        query: Word to look up.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The first parsed element whose ``id`` is ``resultats_dico`` and whose
        class is ``table invert_img``. It is a BeautifulSoup element, not a
        plain string: callers use both ``str()`` and ``.get_text()`` on it.

    Raises:
        requests.exceptions.RequestException: On network failure, timeout or
            HTTP error status.
        IndexError: If the page contains no results element.
    """
    url = generate_encoded_url(query)
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"
        )
    }
    data = {"q": query, "Submit": "1"}

    response = requests.post(url, headers=headers, data=data, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    results = soup.find_all(class_="table invert_img", id="resultats_dico")
    if not results:
        # Same exception type as the former bare ``results[0]``, but with a
        # message that says which word could not be found.
        raise IndexError(f"No dictionary results element found for {query!r}")
    return results[0]

### Google translate

### Class

In [None]:
class TraductionExtractor:
    """Look up one word in the online dictionary and extract its translations.

    ``fetch_results`` fills ``results_raw`` and ``results_processed``; until it
    is called both are empty strings.
    """

    def __init__(self, query: str):
        """Store the word to look up; no network access happens here."""
        self.query = query
        # HTML source of the results element, as a string.
        self.results_raw = ""
        # Text content of the same element, as returned by ``get_text()``.
        self.results_processed = ""

    def fetch_results(self):
        """Download the dictionary results for ``self.query`` and store them."""
        results_raw = search_mandarin_dictionary_requests(self.query)
        self.results_raw = str(results_raw)
        self.results_processed = results_raw.get_text()

    def remove_html_tags(self, text: list) -> list:
        """Return a copy of ``text`` with ``<...>`` tags removed from each string."""
        return [re.sub(r"<.*?>", "", s) for s in text]

    def extract_between_markers(
        self, result: str, start_marker: str, end_marker: str
    ) -> Optional[str]:
        """Return the part of ``result`` located between two markers.

        Args:
            result: Text to search.
            start_marker: Text preceding the wanted span.
            end_marker: Text following the wanted span. When empty, everything
                after ``start_marker`` is returned.

        Returns:
            The extracted span, or ``None`` when a marker is not found.
        """
        start_index = result.find(start_marker)
        if start_index == -1:
            return None

        start_index += len(start_marker)
        if not end_marker:
            return result[start_index:]

        end_index = result.find(end_marker, start_index)
        if end_index == -1:
            return None
        return result[start_index:end_index]

    def get_traduction(self) -> Optional[str]:
        """Return the HTML fragment holding the translations, or ``None``.

        The exact-match section ("Entrées pour <query>") is tried first; when it
        yields nothing, the span between "Traduction" and
        "Editer (projet CFDICT)" is used instead.
        """
        traduction = self.extract_between_markers(
            result=self.results_raw,
            start_marker=f"Entrées pour {self.query}",
            end_marker="Entrées commençant par",
        )

        if not traduction:
            traduction = self.extract_between_markers(
                result=self.results_raw,
                start_marker="Traduction",
                end_marker="Editer (projet CFDICT)",
            )
        return traduction

    def return_traduction(self) -> list:
        """Return the translations as a list of tag-free strings.

        Returns:
            One string per ``<li>`` item when the fragment is an HTML list, a
            single-item list when it is not, and an empty list when no
            translation fragment was found.
        """
        traduction = self.get_traduction()
        if not traduction:
            # Previously None was passed on to remove_html_tags -> TypeError.
            return []

        if "<li>" in traduction:
            # The text after the last "</li>" is trailing markup, not an item.
            fragments = traduction.split("</li>")[:-1]
        else:
            # Wrap the string: iterating it directly would yield single characters.
            fragments = [traduction]
        return self.remove_html_tags(fragments)

    def get_pinyin(self) -> str:
        """Return ``pinyin.get`` for the query, using a space as delimiter."""
        return pinyin.get(s=self.query, delimiter=" ")

    def google_translate(self) -> str:
        """Translate the query from Chinese (zh-CN) to French via Google Translate.

        Raises:
            requests.exceptions.RequestException: On network failure, timeout
                or HTTP error status.
            ValueError: If the response page has no ``result-container`` element.
        """
        url_params = {"sl": "zh-CN", "tl": "fr", "q": self.query, "op": "translate"}
        # A timeout prevents one stalled request from hanging the notebook.
        response = requests.get(
            url="https://translate.google.com/m", params=url_params, timeout=10
        )
        response.raise_for_status()

        soup = BeautifulSoup(response.text, "html.parser")
        element = soup.find("div", {"class": "result-container"})
        if element is None:
            raise ValueError(f"No translation found in the response for {self.query!r}")
        return element.text

In [None]:
processed_path = Path("data/processed/chinese_vocab.json")

# Resume from a previous run when a saved vocabulary exists.
if processed_path.exists():
    with open(processed_path, "r", encoding="utf-8") as f:
        existing_vocab = json.load(f)
else:
    existing_vocab = {}

all_vocab = build_vocab_list(vocab=existing_vocab, folder_path="data/raw")

try:
    for word in tqdm(all_vocab.keys(), desc="Processing vocabulary", unit="word"):
        # Only words that still have an empty entry need to be looked up.
        if all_vocab[word] == {}:
            extractor = TraductionExtractor(word)
            extractor.fetch_results()
            traduction = extractor.return_traduction()
            # Must not be named `pinyin`: that would rebind the imported module
            # at notebook scope and break get_pinyin() from the second word on.
            word_pinyin = extractor.get_pinyin()
            all_vocab[word]["traduction"] = traduction
            all_vocab[word]["pinyin"] = word_pinyin
except (
    ConnectionError,
    # requests' own network errors do not inherit from the builtin ConnectionError.
    requests.exceptions.ConnectionError,
    requests.exceptions.Timeout,
) as e:
    print(f"Connection error occurred: {e}")
    save_progress(all_vocab, processed_path)
    raise ConnectionError("Failed to fetch data. Progress has been saved.") from e
except Exception as e:
    print(f"An unexpected error occurred: {e}")
    save_progress(all_vocab, processed_path)
    raise RuntimeError("An error occurred. Progress has been saved.") from e

save_progress(all_vocab, processed_path)

In [52]:
def open_json_file(file_path: str) -> dict:
    """Read the UTF-8 encoded JSON file at ``file_path`` and return its content."""
    with open(file_path, mode="r", encoding="utf-8") as json_file:
        return json.load(json_file)

# Load the processed vocabulary JSON used by the cells that follow.
processed_vocab_file = "../data/processed/chinese_vocab.json"
chinese_vocab = open_json_file(processed_vocab_file)

In [54]:
# Count the distinct entries of the loaded vocabulary.
# The JSON file is a dict keyed by word until the conversion cell rewrites it
# as a list of {"character": ...} records, so both layouts are accepted here.
if isinstance(chinese_vocab, dict):
    character_list = set(chinese_vocab)
else:
    character_list = {entry["character"] for entry in chinese_vocab}
len(character_list)

495

In [43]:
def google_translate(query: str, timeout: float = 10) -> list:
    """Translate ``query`` from Chinese (zh-CN) to French via Google Translate.

    Args:
        query: Text to translate.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A one-item list holding the translated text, the same shape as the
        scraped ``traduction`` lists.

    Raises:
        requests.exceptions.RequestException: On network failure, timeout or
            HTTP error status.
        ValueError: If the response page has no ``result-container`` element.
    """
    url_params = {"sl": "zh-CN", "tl": "fr", "q": query, "op": "translate"}
    response = requests.get(
        url="https://translate.google.com/m", params=url_params, timeout=timeout
    )
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    element = soup.find("div", {"class": "result-container"})
    if element is None:
        # Clearer than the AttributeError raised by ``None.text``.
        raise ValueError(f"No translation found in the response for {query!r}")
    return [element.text]

In [50]:
# Convert the word-keyed dict into a list of records, filling gaps on the way.
new_vocab = []
for key, value in chinese_vocab.items():
    word = {
        "character": key,
        "traduction": value.get("traduction"),
        "pronunciation": value.get("pinyin"),
    }
    # Fall back to Google Translate whenever no usable translation was scraped:
    # a missing key (None), an empty list, or a list holding one empty string.
    if not word["traduction"] or word["traduction"] == [""]:
        word["traduction"] = google_translate(key)
    if not word["pronunciation"]:
        word["pronunciation"] = pinyin.get(key, delimiter=" ")
    new_vocab.append(word)

In [51]:
# Persist the list-of-records vocabulary.
# NOTE(review): this replaces the word-keyed dict stored in the same file with
# a list; code that reloads this file expecting a dict will need updating.
converted_vocab_path = Path("../data/processed/chinese_vocab.json")
save_progress(new_vocab, converted_vocab_path)

Progress saved to ../data/processed/chinese_vocab.json
