# Libraries

In [63]:
import json
from typing import Any, Union

from lxml import etree

# Functions

## Data Load

In [65]:
def convert_to_list(value: Any) -> list:
    """Return ``value`` as a list, wrapping it when it is not one already.

    Args:
        value: Any object. Lists (including ``list`` subclasses) are returned
            unchanged; every other object is wrapped in a one-element list.

    Returns:
        list: ``value`` itself when it is a list, otherwise ``[value]``.
    """
    # isinstance() also recognises list subclasses, which an exact
    # ``type(value) is list`` comparison would wrap a second time.
    if isinstance(value, list):
        return value
    return [value]

def save_json(vocab, path):
    """
    Write a vocabulary dictionary to disk as indented, UTF-8 encoded JSON.

    Non-ASCII characters are written as-is rather than as escape sequences,
    and a confirmation message is printed once the file has been written.

    Args:
        vocab (dict): The data to serialise.
        path (Path): Destination of the JSON file (overwritten if it exists).
    """
    with open(path, mode="w", encoding="utf-8") as out_file:
        json.dump(vocab, out_file, ensure_ascii=False, indent=4)
    print(f"Progress saved to {path}")

## Data Processing

In [None]:
def get_cards(root: etree.Element, skip_categories=("Cours1", "Cours2", "Cours3")) -> list:
    """
    Flatten the ``<card>`` elements under ``<cards>`` into a list of dicts.

    Each field is read independently, so a missing ``<defn>`` no longer
    discards the category (and vice versa), and a missing ``<headword>``,
    ``<pron>`` or ``<entry>`` yields ``None`` instead of raising.

    Args:
        root (etree.Element): Root of the XML tree; cards are read from its
            ``<cards>`` child.
        skip_categories: Substrings (case-sensitive); a card whose category
            contains any of them is left out. A single string is accepted too.

    Returns:
        list: One dict per kept card with the keys ``character``,
        ``pronunciation``, ``traduction``, ``category``, ``score``,
        ``difficulty``, ``correct``, ``incorrect`` and ``reviewed``. Values
        are the raw XML strings, or ``None`` when the element or attribute
        is absent. An empty list is returned when ``<cards>`` is missing.
    """
    cards_node = root.find('cards')
    if cards_node is None:
        return []

    # A bare string would otherwise be iterated character by character.
    if isinstance(skip_categories, str):
        skip_categories = (skip_categories,)

    score_fields = ('score', 'difficulty', 'correct', 'incorrect', 'reviewed')
    cards = []
    for card in cards_node.findall('card'):
        # Only the first <catassign> child is consulted.
        catassign = card.find('catassign')
        category = catassign.attrib.get('category') if catassign is not None else None
        if category is not None and any(marker in category for marker in skip_categories):
            continue

        entry = card.find('entry')
        score_info = card.find('scoreinfo')

        card_details = {
            'character': _child_text(entry, 'headword'),
            'pronunciation': _child_text(entry, 'pron'),
            'traduction': _child_text(entry, 'defn'),
            'category': category,
        }
        for field in score_fields:
            card_details[field] = score_info.attrib.get(field) if score_info is not None else None

        cards.append(card_details)

    return cards


def _child_text(parent, tag):
    """Return the text of ``parent``'s first ``tag`` child, or None if either is absent."""
    if parent is None:
        return None
    child = parent.find(tag)
    return child.text if child is not None else None

In [68]:
def remove_cards_by_categories(root, keywords: Union[list, str]) -> etree.Element:
    """
    Remove every card whose category contains one of the given keywords.

    The match is a case-insensitive substring test against the ``category``
    attribute of the card's first ``<catassign>`` child. Children of
    ``<cards>`` without a ``<catassign>`` element are always kept. The tree
    is modified in place.

    Args:
        root (etree.Element): Root of the XML tree; cards are looked up under
            its ``<cards>`` child.
        keywords (Union[list, str]): A single keyword, or a list (or any other
            iterable) of keywords to match against the category.

    Returns:
        etree.Element: The same ``root`` object, after the removals. It is
        returned untouched when there is no ``<cards>`` child or no keyword.
    """
    # A bare string is one keyword, not an iterable of characters.
    if isinstance(keywords, str):
        keywords = [keywords]
    # Lower-case the keywords once instead of once per card.
    needles = [keyword.lower() for keyword in keywords]

    cards = root.find('cards')
    if cards is None or not needles:
        return root

    # Iterate over a snapshot: children are removed from `cards` in the loop.
    for card in list(cards):
        catassign = card.find('catassign')
        if catassign is None:
            continue
        # A missing 'category' attribute is treated as an empty category.
        category = catassign.attrib.get('category', '').lower()
        if any(needle in category for needle in needles):
            cards.remove(card)
    return root


# Run

In [72]:
# Parse the raw flashcard export and grab its root element.
tree = etree.parse('../data/raw/flash-2411112247.xml')
root = tree.getroot()

# Drop the cards filed under these categories (the tree is edited in place),
# then flatten the remaining cards into a list of dicts.
remove_categories = ['Cours1 ', 'Cours2 ', 'Cours3 ', 'Question Answer Voca']
card_details = get_cards(remove_cards_by_categories(root, remove_categories))

In [75]:
# Pick the card for 特别 and keep its raw definition text for inspection.
# get_cards() stores the headword under 'character' and the definition under
# 'traduction'; the former 'headword' / 'definition' lookups raised KeyError.
target_word = next((word for word in card_details if word['character'] == "特别"), None)
if target_word is None:
    # Fail with a clear message instead of a NameError on `target_word`.
    raise LookupError("No card found for 特别")
definition = target_word['traduction']

In [78]:
# Collect the first space-delimited token of every definition to see which
# leading labels occur in the data. get_cards() stores the definition under
# 'traduction' (there is no 'definition' key); cards without one are skipped.
word_family = sorted({
    word['traduction'].split(" ")[0]
    for word in card_details
    if word['traduction'] is not None
})
print(word_family)

['(shuí)', '1', 'Daurade', 'T-shirt', 'adjective', 'adverb', 'affix', 'argumentation', 'auxiliary', 'be', 'come', 'idiom', 'man', 'mother’s', 'noun', 'preposition', 'pronoun', 'salmon', 'short', 'surname', 'take', 'to', 'verb']


In [81]:
# Display the raw definition text broken on newline characters.
definition.split("\n")

['adjective special; particular; out of the ordinary 特別的愛好 tèbié de àihào an unusual hobby 他的口音很特別。 Tā de kǒuyin hěn tèbié. He has a peculiar accent. 這棟樓的設計沒有什麼特別的地方。 Zhè dòng lóu de shèjì méiyǒu shénmetèbié de dìfang. The design of this building is nothing out of the ordinary. 這種花每當夜晚就散發出一種特別的香味兒。 Zhèzhǒng huā měidāng yèwǎn jiù sànfā chū yī zhǒng tèbié de xiāngwèir. As evening approaches, this flower gives off a distinctive fragrance. ',
 "adverb 1 especially; particularly 質量特別好 zhìliàng tèbié hǎo of extra fine quality 工作特別努力 Gōngzuò tèbié nǔlì especially hard-working 建設一支特別能戰鬥的隊伍 jiànshè yī zhī tèbié néng zhàndòu de duìwu train a contingent of exceptional fighters 今天特別熱。 Jīntiān tèbié rè. It’s extremely hot today. 這個報告特別有教育意義。 zhège bàogào tèbié yǒu jiàoyù yìyì. The talk was particularly instructive. 2 for a special purpose; specially; deliberately 他臨走時特別去向女主人道謝。 Tā línzǒu shí tèbié qùxiàng nǚzhǔrén dàoxiè. He made a point of thanking his hostess before he left the party. 這些花是我特別為你摘的

In [None]:
import re
from typing import List, Dict

def parse_chinese_definition(definition: str) -> Dict[str, List[Dict[str, Union[str, List[str]]]]]:
    """
    Parses a Chinese word definition into structured categories, definitions, and examples.

    The text is cut at every whole-word occurrence of a known part-of-speech
    label; the text following each label (up to the next label or the end of
    the string) is handed to ``split_definitions``. Text that precedes the
    first label is ignored.

    NOTE(review): labels are matched anywhere in the text, so a label word
    appearing inside a gloss or example (e.g. "noun") also starts a new
    segment. Confirm whether labels should be anchored to line starts.

    Args:
        definition (str): The raw definition text. ``None`` or an empty
            string yields an empty dictionary.

    Returns:
        Dict[str, List[Dict[str, Union[str, List[str]]]]]: A dictionary where
        each key is a part of speech and each value is the list returned by
        ``split_definitions`` for that part of speech. When a label occurs
        more than once, the results are concatenated in order of appearance.
    """
    # Known part-of-speech labels.
    parts_of_speech = [
        'adjective',
        'adverb',
        'affix',
        'auxiliary',
        'idiom',
        'noun',
        'preposition',
        'pronoun',
        'surname',
        'verb'
    ]

    # Cards may carry no definition at all; nothing to parse in that case.
    if not definition:
        return {}

    # Whole-word alternation over the labels.
    pos_pattern = r'\b(' + '|'.join(parts_of_speech) + r')\b'

    # Every occurrence of a label in the text.
    matches = list(re.finditer(pos_pattern, definition))

    # No label found: return an empty dictionary.
    if not matches:
        return {}

    parsed_data = {}

    # Each segment ends where the next label starts; the last one ends at
    # the end of the text.
    segment_ends = [match.start() for match in matches[1:]] + [len(definition)]

    for match, end_idx in zip(matches, segment_ends):
        current_pos = match.group(1)
        # Start after the label itself so it is not part of the segment text.
        text = definition[match.end():end_idx].strip()
        # Extend rather than assign: a repeated label must not overwrite the
        # definitions collected from its earlier occurrence.
        parsed_data.setdefault(current_pos, []).extend(split_definitions(text))

    return parsed_data

def split_definitions(text: str) -> List[Dict[str, Union[str, List[str]]]]:
    """
    Splits the text into individual definitions and their examples.

    The text is cut in front of every standalone number followed by
    whitespace (the sense number), the number is dropped, and each piece is
    then cut after every period followed by whitespace: the first fragment
    becomes the definition, the remaining fragments the examples.

    NOTE(review): only an ASCII "." ends a fragment, so a Chinese sentence
    ending in "。" stays attached to the fragment around it, and any
    standalone number inside an example still starts a new definition.

    Args:
        text (str): The text containing definitions and examples. ``None``
            or an empty string yields an empty list.

    Returns:
        List[Dict[str, Union[str, List[str]]]]: A list of dictionaries with
        'definition' (str) and 'examples' (list of str).
    """
    definitions = []
    if not text:
        return definitions

    # Cut in front of each sense number. The lookbehind requires the number
    # to start a token, so the cut never lands inside a multi-digit number
    # ("10 ") or inside a token such as "A1 ".
    def_splits = re.split(r'(?<!\S)(?=\d+\s)', text)
    for def_text in def_splits:
        def_text = def_text.strip()
        if not def_text:
            continue
        # Drop the leading sense number when there is one.
        number_match = re.match(r'\d+\s', def_text)
        if number_match:
            def_text = def_text[number_match.end():].strip()
        # Separate the definition from its examples.
        example_splits = re.split(r'(?<=\.)\s+', def_text)
        definition = example_splits[0]
        # Keep only non-empty examples, without surrounding whitespace.
        examples = [ex.strip() for ex in example_splits[1:] if ex.strip()]
        definitions.append({
            'definition': definition,
            'examples': examples
        })
    return definitions


{'adjective': [{'definition': 'special; particular; out of the ordinary 特別的愛好 '
                              'tèbié de àihào an unusual hobby 他的口音很特別。 Tā de '
                              'kǒuyin hěn tèbié.',
                'examples': ['He has a peculiar accent.',
                             '這棟樓的設計沒有什麼特別的地方。 Zhè dòng lóu de shèjì méiyǒu '
                             'shénme tèbié de dìfang.',
                             'The design of this building is nothing out of '
                             'the ordinary.',
                             '這種花每當夜晚就散發出一種特別的香味兒。 Zhèzhǒng huā měidāng yèwǎn '
                             'jiù sànfā chū yī zhǒng tèbié de xiāngwèir.',
                             'As evening approaches, this flower gives off a '
                             'distinctive fragrance.']}],
 'adverb': [{'definition': 'especially; particularly 質量特別好 zhìliàng tèbié hǎo '
                           'of extra fine quality 工作特別努力 Gōngzuò tèbié nǔlì '
                       

In [86]:
# Display the parsed result.
# NOTE(review): `parsed_result` is not assigned in any visible cell —
# presumably parse_chinese_definition(definition); confirm before re-running.
parsed_result

{'adjective': [{'definition': 'adjective', 'examples': []}],
 'adverb': [{'definition': 'adverb', 'examples': []}]}

In [None]:
# Persist the extracted cards. The module-level list is `card_details`;
# `cards` exists only as a local variable inside get_cards(), so passing it
# here raised NameError.
save_json(card_details, '../data/processed/chinese_cards.json')