# Run sandbox notebook

In [1]:
%run sandbox.ipynb

Current working directory: /home/darrenyhuang/projects/pleco-anki-server/src/resources
Loading credentials from token.json
Refreshing expired credentials
Failed to refresh credentials: ('invalid_grant: Bad Request', {'error': 'invalid_grant', 'error_description': 'Bad Request'})
Authenticating with Google Drive
Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=624862442234-4iconv0m99pu1luia7lub9c8m7t8of0b.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A45942%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&state=SD38PdBTvlmJgwJMDmIIVGUktkroIy&access_type=offline


In [3]:
main()

Latest flashcard xml last modified time: 2025-01-02 12:17:32 EST-0500
1648 flashcard entries found| 1 error entries found


# Helper Methods

In [4]:
to_drop = [1030, 1118]

def drop(arr, to_drop):
    """Return a copy of ``arr`` with the given positions popped one after another.

    The input sequence itself is left untouched. Positions are applied in the
    order given against the *shrinking* copy, so once an element has been
    removed every later position refers to the already-shortened sequence,
    not to the original one.

    Args:
        arr: Sequence supporting ``copy()`` and index deletion (e.g. a list).
        to_drop: Iterable of positions to remove, applied sequentially.

    Returns:
        The shortened copy.
    """
    remaining = arr.copy()
    for position in to_drop:
        del remaining[position]
    return remaining

In [5]:
import json

# Load the exported flashcard entries (a JSON list of dicts).
# NOTE(review): no explicit encoding is passed here, unlike the other loaders
# in this notebook which all use encoding="utf-8" — confirm this is intended.
with open('flashcard_entries.json', 'r') as file:
    flashcard_entries = json.load(file)

# Discard every entry whose joined 'pinyin' parts differ from its 'anki_pinyin'.
to_del = []
for i, entry in enumerate(flashcard_entries):
    pinyin = ''.join(entry['pinyin'])
    anki_pinyin = entry['anki_pinyin']
    if pinyin != anki_pinyin:
        # print(i, entry['traditional'], pinyin, anki_pinyin)
        to_del.append(i)
# Delete from the back so the remaining indices in to_del stay valid.
for i in reversed(to_del):
    del flashcard_entries[i]
        


In [6]:
from utils.pinyin import *
from utils.html import *

In [7]:


# Build the pinyin lookup structures used by the segment labelling below.
# The three helpers are not defined in this notebook; presumably they come
# from the star imports above (utils.pinyin) — TODO confirm.
file_path = "cedict_ts.u8"  # Path to the CC-CEDICT file
fifth_tone_pinyins = extract_fifth_tone_pinyin(file_path)
# print("Pinyin with the 5th tone:", fifth_tone_pinyins)

toneless_pinyin_set = extract_toneless_pinyin(file_path)
toneless_pinyin_trie = create_trie_from_pinyin(toneless_pinyin_set)


In [8]:
"ma" in fifth_tone_pinyins

True

In [9]:
import regex as re
import regex
import functools
import csv
import os

### Load CC-CEDICT variant cross-references ###
@functools.lru_cache(maxsize=None)  # Infinite cache size
def load_cc_cedict(filename="cedict_ts.u8"):
    """Parse a CC-CEDICT file into a bidirectional variant mapping.

    Every "variant of X|Y[pinyin]" note found in ANY sense of an entry links
    the entry's traditional headword with X (the part before "|") in both
    directions. Comment lines (starting with "#") and blank lines are ignored.

    Args:
        filename (str): Path to the CC-CEDICT text file.

    Returns:
        dict[str, set[str]]: word -> set of words recorded as its variants.
        The result is memoized per ``filename`` and the same dict object is
        handed to every caller, so treat it as read-only.
    """
    variants = {}
    with open(filename, encoding="utf-8") as f:
        for line in f:
            if line.startswith("#") or not line.strip():
                continue
            # Greedy capture: take the whole "/sense 1/sense 2/.../" tail, not
            # just the first sense, so notes in later senses are seen as well.
            match = re.match(r"(\S+) (\S+) \[.*?\] /(.*)/", line)
            if not match:
                continue
            trad, _simp, definitions = match.groups()

            # An entry may carry several "variant of ..." notes; record all of them.
            for variant_match in re.finditer(r"variant of ([\u4E00-\u9FFF\|]+)", definitions):
                var = variant_match.group(1).split("|")[0]
                # Ensure bidirectional mapping
                variants.setdefault(trad, set()).add(var)
                variants.setdefault(var, set()).add(trad)

    return variants

### Load MoeDict variant cross-references ###
@functools.lru_cache(maxsize=None)  # Infinite cache size
def load_moedict(filename="moedict.csv"):
    """Parse a MoeDict CSV export into a bidirectional variant mapping.

    For each row, the headword (column '字詞名') is grouped with every word
    quoted as 也作「…」 in its definition (column '釋義'); each member of the
    group is mapped to all the other members. A headword without any such
    note still gets an entry, mapped to an empty set.

    Args:
        filename (str): Path to the CSV file; it must have a header row
            containing the '字詞名' and '釋義' columns.

    Returns:
        dict[str, set[str]]: word -> set of variant words. The result is
        memoized per ``filename`` and shared between callers, so treat it as
        read-only.
    """
    variants = {}

    # newline='' lets the csv module handle line breaks inside quoted cells itself.
    with open(filename, 'r', encoding='utf-8', newline='') as csvfile:
        reader = csv.DictReader(csvfile)

        for row in reader:
            term = row['字詞名']
            # DictReader fills cells missing from a short row with None.
            definition = row['釋義'] or ""

            # Search for variants indicated by 也作「<VARIANT>」
            matches = re.findall(r'也作「(.*?)」', definition)
            all_words = set([term] + list(matches))

            # Link every member of the group to all the other members.
            for variant in all_words:
                variants.setdefault(variant, set()).update(all_words.difference({variant}))

    return variants

@functools.lru_cache(maxsize=None)  # Infinite cache size
def get_c_variants(folder_path="c"):
    """Collect 也作「…」 variant cross-references from a folder of JSON entry files.

    Every regular file in ``folder_path`` whose name does not start with "@",
    "=" or "xref" is parsed as JSON. The headword is read from key "t" and the
    definition texts from ``data["h"][*]["d"][*]["f"]``; in both, every
    character other than a Han character or 「 / 」 is removed before use.
    For each definition that quotes at least one 也作「…」 word, the headword
    and the quoted words are linked to one another in both directions.

    Args:
        folder_path (str): Path to the folder containing the JSON files.

    Returns:
        dict[str, set[str]]: word -> set of variant words. The result is
        memoized per ``folder_path`` and shared between callers, so treat it
        as read-only.
    """
    skipped_prefixes = ("@", "=", "xref")
    # Compiled once for the whole folder instead of once per file.
    non_chinese_or_bracket = regex.compile(r'[^「」\p{Han}]')
    variants = {}

    for file_name in os.listdir(folder_path):
        # Decide on the name first so skipped files are never opened or parsed.
        if file_name.startswith(skipped_prefixes):
            continue
        file_path = os.path.join(folder_path, file_name)
        if not os.path.isfile(file_path):
            continue

        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)

        word = non_chinese_or_bracket.sub("", data["t"])
        for h_item in data.get('h', []):  # Start from the 'h' key
            for d_item in h_item.get('d', []):  # Look inside the 'd' list
                definition = non_chinese_or_bracket.sub("", d_item["f"])
                # Search for variants indicated by 也作「<VARIANT>」
                matches = re.findall(r'也作「(.*?)」', definition)
                if not matches:
                    continue
                all_words = set([word] + list(matches))
                for variant in all_words:
                    variants.setdefault(variant, set()).update(all_words.difference({variant}))

    return variants

@functools.lru_cache(maxsize=None)  # Infinite cache size
def load_unihan_variants(filename="Unihan_Variants.txt"):
    """Parse a Unihan variants file into a bidirectional character mapping.

    Each data line is tab-separated: ``U+XXXX <TAB> field <TAB> values``. Only
    fields whose name contains "Variant" but neither "Simplified" nor
    "Traditional" are used. The value column may list several code points
    separated by whitespace, each optionally followed by ``<source`` tags;
    every listed code point is linked with the line's character in both
    directions. Comment lines, blank lines and malformed code points are skipped.

    Args:
        filename (str): Path to the Unihan variants text file.

    Returns:
        dict[str, set[str]]: character -> set of variant characters. The
        result is memoized per ``filename`` and shared between callers, so
        treat it as read-only.
    """
    variants = {}
    with open(filename, encoding="utf-8") as f:
        for line in f:
            if line.startswith("#") or not line.strip():
                continue
            parts = line.split("\t")
            if len(parts) < 3:
                continue

            key = parts[1]
            # Ignore the simplified/traditional conversion fields.
            if "Variant" not in key or "Simplified" in key or "Traditional" in key:
                continue

            try:
                char = chr(int(parts[0][2:], 16))  # Convert U+XXXX to character
            except ValueError:
                continue  # Skip bad format entries

            # A field can hold several values: "U+58F9<kLau U+5F0C<kMatthews".
            for token in parts[2].split():
                # Keep only the U+XXXX part before '<' (if present)
                var_code = token.split("<")[0]
                try:
                    var = chr(int(var_code[2:], 16))  # Convert U+XXXX to character
                except ValueError:
                    continue  # Skip bad format entries
                variants.setdefault(char, set()).add(var)
                variants.setdefault(var, set()).add(char)

    return variants


@functools.lru_cache(maxsize=None)  # Infinite cache size
def load_manual_variants(filename="manual_variants.csv"):
    """Read hand-maintained variant groups from a header-less CSV file.

    Each row with at least two cells is one group: the cells are stripped,
    empty ones are discarded, and every remaining word is mapped to all the
    other words of the same row. Rows with fewer than two cells are ignored.

    Args:
        filename (str): Path to the CSV file.

    Returns:
        dict[str, set[str]]: word -> set of the other words in its group(s).
    """
    mapping = {}

    with open(filename, 'r', encoding='utf-8') as csvfile:
        for cells in csv.reader(csvfile):
            if len(cells) < 2:
                continue  # Skip malformed rows

            group = {cell.strip() for cell in cells if cell.strip()}
            for member in group:
                mapping.setdefault(member, set()).update(group - {member})

    return mapping


def get_variants(word):
    """Return the union of the variants recorded for ``word`` in all five sources.

    The sources are consulted in this order: CC-CEDICT, MoeDict, the "c" JSON
    folder, Unihan and the manual CSV. A fresh set is returned, so the cached
    source mappings are never modified.
    """
    sources = (
        load_cc_cedict,
        load_moedict,
        get_c_variants,
        load_unihan_variants,
        load_manual_variants,
    )
    found = set()
    for load_source in sources:
        found = found.union(load_source().get(word, set()))
    return found


In [10]:
# Call every loader once so its lru_cache is populated before first real use.
load_cc_cedict()
load_moedict()
get_c_variants()
load_unihan_variants()
load_manual_variants()
# End on an empty string so the cell displays '' instead of the last mapping.
""

''

# Label segments

In [11]:
import unicodedata

def fullwidth_to_ascii(text):
    """Apply NFKC normalization to each character of ``text`` on its own.

    Full-width letters and digits therefore become their ASCII counterparts.
    Because every character is normalized independently, neighbouring
    characters are never composed with one another.

    Args:
        text (str): Input text.

    Returns:
        str: The concatenation of the individually normalized characters.
    """
    normalized_chars = [unicodedata.normalize('NFKC', ch) for ch in text]
    return ''.join(normalized_chars)

# # Example usage
# fullwidth_text = "ｉＯＳ 10"
# fullwidth_to_ascii(fullwidth_text)

In [12]:
def overlap_length(chinese: str, english: str) -> int:
    """Measure how much of the non-Han tail of ``chinese`` is repeated at the start of ``english``.

    The tail is everything after the last Han character found by the search
    (the whole string when no Han character is found). The result is the
    length of the longest ending of that tail that ``english`` starts with,
    or 0 when there is none.
    """
    last_han = re.search(r'.*[\p{Han}]', chinese)
    tail = chinese[last_han.end():] if last_han else chinese

    # Try the longest candidate first; the first hit is therefore the maximum.
    for size in range(len(tail), 0, -1):
        if english.startswith(tail[-size:]):
            return size
    return 0

In [13]:
# Demo: the Chinese text ends with full-width "ｉＯＳ 10", which the English
# text repeats at its start; those six characters are the reported overlap.
chinese = "升級到 ｉＯＳ 10"
english = "ｉＯＳ 10 Upgrade to iOS 10."
print(overlap_length(chinese, english))  # Output: 6

6


In [14]:
# segment definition

import regex as re
import json

# Load the part of speech keywords from the JSON file
with open("part_of_speech_keywords.json", "r") as file:
    part_of_speech_keywords = json.load(file)


def label_segments(text):
    """Split a definition string into labelled segments.

    The text is scanned with six patterns (part-of-speech keywords, bracketed
    text, parenthesised text, \\uead1…\\uead2 spans, runs containing Han
    characters, and runs containing a tone-marked vowel). Matches are consumed
    left to right; any text between two consumed matches becomes an "english"
    segment. The raw segments are then passed through the post-processing
    helpers called at the end of this function, in that exact order.

    Args:
        text (str): The definition text to segment.

    Returns:
        list[dict]: Segment dicts. Segments built here carry "segment" and
        "label" keys, with label one of "english", "part of speech",
        "chinese" or "pinyin"; the post-processing helpers can additionally
        introduce "item_number" and "example_sentence" entries.
    """
    segments = []
    # Whole-word, case-insensitive match of any keyword loaded from
    # part_of_speech_keywords (a module-level name).
    part_of_speech_pattern = re.compile(
        r"\b("
        + "|".join(re.escape(keyword) for keyword in part_of_speech_keywords)
        + r")\b",
        re.IGNORECASE,
    )
    # A run free of whitespace/brackets/parentheses that contains at least one Han character.
    chinese_pattern = re.compile(r"[^\s\(\)\[\]]*\p{Han}+[^\s\(\)\[\]]*")
    # A whitespace-free run that contains at least one tone-marked vowel.
    pinyin_pattern = re.compile(
        r"\S*[āēīōūǖĀĒĪŌŪǕáéíóúǘÁÉÍÓÚǗǎěǐǒǔǚǍĚǏǑǓǙàèìòùǜÀÈÌÒÙǛ]\S*"
    )
    # Innermost [...] and (...) groups (no nested brackets/parentheses inside).
    english_brackets_pattern = re.compile(r"\[[^\[\]]*\]")
    english_paren_pattern = re.compile(r"\([^\(\)]*\)")
    # Shortest span from \uead1 to the next \uead2.
    pleco_uead_pattern = re.compile(r"\uead1.*?\uead2")

    pos_matches = list(part_of_speech_pattern.finditer(text))
    uead_matches = list(pleco_uead_pattern.finditer(text))
    chinese_matches = list(chinese_pattern.finditer(text))
    pinyin_matches = list(pinyin_pattern.finditer(text))
    english_brackets_matches = list(english_brackets_pattern.finditer(text))
    # NOTE: from here on english_paren_pattern is rebound from the compiled
    # pattern to the list of its matches.
    english_paren_pattern = list(english_paren_pattern.finditer(text))

    # sorted() is stable, so matches that start at the same offset keep the
    # concatenation order below; that order decides which one is consumed.
    all_matches = sorted(
        pos_matches
        + english_brackets_matches
        + english_paren_pattern
        + uead_matches
        + chinese_matches
        + pinyin_matches,
        key=lambda x: x.start(),
    )

    last_end = 0
    for match in all_matches:
        # Skip any match that starts inside text already consumed by an earlier match.
        if match.start() < last_end:
            continue

        # Unmatched text between two consumed matches is treated as English.
        if match.start() > last_end:
            segments.append(
                {"segment": text[last_end : match.start()].strip(), "label": "english"}
            )
        # The label is decided by which match list this match object came from.
        if match in pos_matches:
            segments.append(
                {"segment": match.group().strip(), "label": "part of speech"}
            )
        elif match in english_brackets_matches:
            segments.append({"segment": match.group().strip(), "label": "english"})
        elif match in english_paren_pattern:
            # Parenthesised text is classified by its content: pinyin if the
            # pinyin pattern matches from its start, chinese if (ignoring
            # parentheses and whitespace) it consists only of Han characters,
            # 《, 》 and =, otherwise english.
            target_str = match.group().strip()
            if re.match(pinyin_pattern, target_str):
                segments.append({"segment": target_str, "label": "pinyin"})
            elif re.match(r"^[\p{Han}《》=]+$", re.sub(r"[\(\)\s]", "", target_str)):
                segments.append({"segment": target_str, "label": "chinese"})
            else:
                segments.append({"segment": target_str, "label": "english"})
        elif match in uead_matches:
            segments.append({"segment": match.group().strip(), "label": "english"})
        elif match in chinese_matches:
            segments.append({"segment": match.group().strip(), "label": "chinese"})
        elif match in pinyin_matches:
            segments.append({"segment": match.group().strip(), "label": "pinyin"})
        last_end = match.end()

    # Whatever follows the last consumed match is English as well.
    if last_end < len(text):
        segments.append({"segment": text[last_end:].strip(), "label": "english"})

    # Post-processing pipeline; empty segments are dropped and neighbours with
    # the same label re-merged between the individual steps.
    segments = filter_white_space(segments)
    segments = combine_adjacent_segments(segments)
    segments = process_fifth_tone_pinyin(segments)
    segments = filter_white_space(segments)
    segments = combine_adjacent_segments(segments)
    segments = process_item_numbers(segments)
    segments = filter_white_space(segments)
    segments = combine_pinyin_english_pinyin(segments)
    segments = combine_example_sentences(segments)
    # Final pass: adjacent english/chinese segments are merged and labelled english.
    segments = combine_adjacent_segments(segments, {("english", "chinese"): "english"})

    return segments


def update_example_sentence_english_chinese_overlap(segment):
    """Move text that the Chinese tail repeats at the start of the English into the pinyin.

    Only "example_sentence" segments are touched. When ``overlap_length``
    reports that the English text starts with a repetition of the non-Han
    tail of the Chinese text, that prefix is cut from "english" and appended
    (after a space) to "pinyin". The segment is modified in place.

    Args:
        segment (dict): A segment dict.

    Returns:
        dict: The same ``segment`` object.
    """
    if segment["label"] != "example_sentence":
        return segment

    chinese_text = segment["chinese"]
    english_text = segment["english"]
    shared = overlap_length(chinese_text, english_text)
    if shared > 0:
        segment["english"] = english_text[shared:]
        segment["pinyin"] += " " + english_text[:shared]
    return segment


def combine_example_sentences(segments):
    """Fold chinese/pinyin/english runs into single "example_sentence" dicts.

    Two label sequences are recognised while scanning left to right:

    * chinese, pinyin, english
    * chinese, english, pinyin, english (special case) — the stray english
      piece is appended to the Chinese text if it is "。", prefixed to the
      pinyin if the Chinese text starts with it, and otherwise appended to the
      Chinese text after a space.

    Every combined dict is passed through
    ``update_example_sentence_english_chinese_overlap``. All other segments
    are copied through unchanged.

    Args:
        segments (list[dict]): Labelled segments.

    Returns:
        list[dict]: New list with example sentences combined.
    """
    def labels_at(start, count):
        # Labels of up to `count` segments beginning at `start`.
        return [seg["label"] for seg in segments[start:start + count]]

    merged = []
    pos = 0
    while pos + 2 < len(segments):
        if labels_at(pos, 3) == ["chinese", "pinyin", "english"]:
            chinese_seg, pinyin_seg, english_seg = segments[pos:pos + 3]
            example = {
                "label": "example_sentence",
                "chinese": chinese_seg["segment"],
                "pinyin": pinyin_seg["segment"],
                "english": english_seg["segment"],
            }
            merged.append(update_example_sentence_english_chinese_overlap(example))
            pos += 3
        elif labels_at(pos, 4) == ["chinese", "english", "pinyin", "english"]:
            # special case: a stray english piece sits between Chinese and pinyin
            chinese_seg, stray_seg, pinyin_seg, english_seg = segments[pos:pos + 4]
            example = {
                "label": "example_sentence",
                "chinese": chinese_seg["segment"],
                "pinyin": pinyin_seg["segment"],
                "english": english_seg["segment"],
            }
            stray = stray_seg["segment"]
            if stray == "。":
                example["chinese"] += stray
            elif example["chinese"].startswith(stray):
                example["pinyin"] = stray + " " + example["pinyin"]
            else:
                example["chinese"] += " " + stray
            merged.append(update_example_sentence_english_chinese_overlap(example))
            pos += 4
        else:
            merged.append(segments[pos])
            pos += 1

    # Fewer than three segments remain: nothing left to combine.
    merged.extend(segments[pos:])
    return merged


def process_item_numbers(segments):
    """Split running item numbers (1, 2, 3, …) out of "english" segments.

    A counter tracks the next expected number. Within an english segment, a
    "1" at the very start, after "(-//-) " or after "\\uead2 " (re)starts the
    numbering; otherwise the expected number is looked for as a
    whitespace-delimited token. On a hit the text before the number is emitted
    as an english segment, the number as an "item_number" segment, and the
    remainder of the same segment is examined again for the next number.

    Note: the remainder is written back into the input segment dict, so the
    dicts in ``segments`` are modified in place.

    Args:
        segments (list[dict]): Labelled segments.

    Returns:
        list[dict]: New list with item numbers split out (this can include
        english segments whose text is empty).
    """
    def search(segment_str, num, is_one=False):
        # The number must be followed by whitespace or the end of the string.
        prefix = r"(^|(\(-//-\) )|(\uead2 ))" if is_one else r"(?<=(^|\s))"
        return re.search(prefix + str(num) + r"(?=($|\s))", segment_str)

    output = []
    index = 0
    expected = 1
    while index < len(segments):
        current = segments[index]
        if current["label"] != "english":
            output.append(current)
            index += 1
            continue

        text = current["segment"]
        if search(text, 1, is_one=True):
            expected = 1  # a fresh "1" restarts the numbering
        elif expected == 1 or not search(text, expected):
            output.append(current)
            index += 1
            continue

        cut = search(text, expected).start()
        output.append({"segment": text[:cut].strip(), "label": "english"})
        output.append({"segment": str(expected), "label": "item_number"})
        current["segment"] = text[cut + len(str(expected)):].strip()
        expected += 1
        # index is left unchanged so the remainder is examined again

    return output


def filter_white_space(segments):
    """Return the segments whose "segment" text is not empty or whitespace-only."""
    kept = []
    for seg in segments:
        if seg["segment"].strip():
            kept.append(seg)
    return kept


def combine_adjacent_segments(segments, equivalent_labels=None):
    """Merge neighbouring segments that share a label (or an equivalent pair of labels).

    Two neighbours are joined with a single space when their labels are equal,
    or when the unordered pair of their labels is a key of
    ``equivalent_labels``; in the latter case the merged segment takes the
    label mapped to that pair. A segment labelled "example_sentence" is never
    merged into the segment before it.

    The input dicts are not modified: the result is built from shallow copies.

    Args:
        segments (list[dict]): Labelled segments.
        equivalent_labels (dict | None): Maps a pair of labels (in either
            order) to the label the merged segment should receive.

    Returns:
        list[dict]: New list of (copied) segments after merging.
    """
    # Normalise the pairs so the lookup does not depend on label order.
    normalized = {}
    if equivalent_labels:
        normalized = {
            tuple(sorted(pair)): label for pair, label in equivalent_labels.items()
        }

    combined_segments = []
    for segment in segments:
        previous = combined_segments[-1] if combined_segments else None
        if previous is not None and segment["label"] != "example_sentence":
            pair = tuple(sorted([previous["label"], segment["label"]]))
            if previous["label"] == segment["label"] or pair in normalized:
                previous["segment"] += " " + segment["segment"]
                if pair in normalized:
                    previous["label"] = normalized[pair]
                continue
        # Copy so that later merges never write into the caller's dicts.
        combined_segments.append(dict(segment))
    return combined_segments


def process_fifth_tone_pinyin(segments):
    """Peel leading neutral-tone syllables off english segments that follow Chinese or pinyin.

    Neutral-tone syllables carry no tone mark, so they end up inside the
    "english" text. Whenever a "chinese" or "pinyin" segment is directly
    followed by an "english" segment whose (lower-cased) text starts with a
    syllable from the module-level ``fifth_tone_pinyins`` followed by a
    non-letter or the end of the text, that prefix (including the following
    non-letter character) is emitted as its own "pinyin" segment. This repeats
    until the english text no longer starts with such a syllable.

    Note: the remainder is written back into the english segment dict, so the
    dicts in ``segments`` are modified in place.

    Args:
        segments (list[dict]): Labelled segments; may be empty.

    Returns:
        list[dict]: New list with the extracted pinyin segments inserted.
    """
    # An empty definition produces no segments at all.
    if not segments:
        return []

    new_segments = []
    for current_segment, next_segment in zip(segments, segments[1:]):
        new_segments.append(current_segment)
        if current_segment["label"] not in ("chinese", "pinyin"):
            continue

        # Keep peeling syllables while the english text starts with one.
        while next_segment["label"] == "english":
            text = next_segment["segment"]
            lowered = text.lower()
            mtch = None
            for pinyin in fifth_tone_pinyins:
                mtch = re.match("^" + pinyin + r"($|[^a-zA-Z])", lowered)
                if mtch:
                    break
            if not mtch:
                break

            new_segments.append(
                {"segment": text[: mtch.end()].strip(), "label": "pinyin"}
            )
            next_segment["segment"] = text[mtch.end() :].strip()

    new_segments.append(segments[-1])
    return new_segments


def combine_pinyin_english_pinyin(segments):
    """Merge every pinyin / english / pinyin run into one "pinyin" segment.

    The three texts are joined with single spaces. The merged segment takes
    part in the next comparison, so longer alternating runs collapse into a
    single segment. The input list is not modified.

    Args:
        segments (list[dict]): Labelled segments.

    Returns:
        list[dict]: New list with the runs merged.
    """
    work = segments.copy()
    merged = []
    pos = 0
    while pos + 2 < len(work):
        first, middle, last = work[pos:pos + 3]
        labels = (first["label"], middle["label"], last["label"])
        if labels == ("pinyin", "english", "pinyin"):
            # Park the merged segment in the third slot and resume from there.
            pos += 2
            work[pos] = {
                "segment": " ".join(part["segment"] for part in (first, middle, last)),
                "label": "pinyin",
            }
        else:
            merged.append(first)
            pos += 1
    merged.extend(work[pos:])
    return merged

In [15]:
def update_example_sentence_with_variants(traditional_word, segment):
    """Record which variant of ``traditional_word`` an example sentence uses.

    Only "example_sentence" segments whose Chinese text does NOT contain
    ``traditional_word`` itself are examined. If one of the word's variants
    (as returned by ``get_variants``) occurs in the Chinese text, it is stored
    under ``segment['variant']`` (in place). Variants are looked up only when
    they can affect the result.

    Args:
        traditional_word (str): The traditional Chinese word to look for.
        segment (dict): A single segment dict.

    Returns:
        bool: True if a variant was found and recorded, otherwise False.
    """
    if segment['label'] != 'example_sentence':
        return False

    chinese = segment['chinese']
    if traditional_word in chinese:
        return False

    for variant in get_variants(traditional_word):
        # An empty string is contained in every text; never report it as a hit.
        if variant and variant in chinese:
            segment['variant'] = variant
            return True
    return False

def update_example_sentence_with_separated_words(traditional_word, segment, max_len=6):
    """Detect a two-character word whose characters are split apart in an example sentence.

    The word itself is tried first, followed by its variants from
    ``get_variants``. A candidate is considered only if it has exactly two
    characters, the segment is an "example_sentence", and the candidate does
    not occur contiguously in the Chinese text. The text is then searched for
    the first character followed by at most ``max_len`` arbitrary characters
    and the second character; the matched span is stored under
    ``segment['separated_word']`` (in place).

    Args:
        traditional_word (str): The traditional Chinese word to look for.
        segment (dict): A single segment dict.
        max_len (int): Maximum number of characters allowed between the two.

    Returns:
        bool: True if a separated occurrence was found and recorded.
    """
    candidates = [traditional_word] + list(get_variants(traditional_word))
    for candidate in candidates:
        if len(candidate) != 2:
            continue
        if segment['label'] != 'example_sentence':
            continue
        if candidate in segment['chinese']:
            continue

        head, tail = candidate
        hit = re.search(f"{head}.{{0,{max_len}}}{tail}", segment['chinese'])
        if hit:
            segment['separated_word'] = hit.group()
            return True
    return False

In [12]:
# Spot check: label the definition of the first entry whose headword is '蒙'.
# NOTE(review): if no such entry exists, my_entry stays None and the last
# line fails on my_entry['definition'].
my_entry = None
for entry in flashcard_entries:
    if entry['traditional'] == '蒙':
        my_entry = entry
        break
label_segments(my_entry['definition'])

[{'segment': 'noun', 'label': 'part of speech'},
 {'segment': '1', 'label': 'item_number'},
 {'segment': 'literary', 'label': 'part of speech'},
 {'segment': 'ignorance; illiteracy', 'label': 'english'},
 {'label': 'example_sentence',
  'chinese': '啟蒙',
  'pinyin': 'qǐméng',
  'english': 'initiate learning'},
 {'segment': '2', 'label': 'item_number'},
 {'segment': '(Méng) a', 'label': 'pinyin'},
 {'segment': 'surname', 'label': 'english'},
 {'segment': 'verb', 'label': 'part of speech'},
 {'segment': '1', 'label': 'item_number'},
 {'segment': 'cover; overspread', 'label': 'english'},
 {'label': 'example_sentence',
  'chinese': '蒙住眼睛',
  'pinyin': 'Méngzhù yǎnjing',
  'english': 'blindfold'},
 {'label': 'example_sentence',
  'chinese': '蒙頭睡大覺',
  'pinyin': 'méng tóu shuì dà jiào',
  'english': 'tuck oneself in and sleep like a log'},
 {'label': 'example_sentence',
  'chinese': '蒙上一層灰塵',
  'pinyin': 'méng shàng yī céng huīchén',
  'english': 'be covered with a layer of dust'},
 {'segment

In [16]:
# Diagnostic: print every entry whose labelled segments still contain a
# pinyin / english / pinyin run after label_segments has been applied.
for entry in flashcard_entries:
    segments = label_segments(entry['definition'])
    for i, segment in enumerate(segments):
        # Needs two following segments to form the three-segment window.
        if segment['label'] == 'pinyin' and i < len(segments) - 2:
            next_segment = segments[i + 1]
            next_next_segment = segments[i + 2]
            if next_segment['label'] == 'english' and next_next_segment['label'] == 'pinyin':
                print(segments)
                print(segment, next_segment, next_next_segment)
                print(entry['definition'])
                print()

In [17]:
import re
from collections import defaultdict

    
@functools.lru_cache(maxsize=None)  # Infinite cache size
def parse_cedict_toneless_pinyins(filename="cedict_ts.u8"):
    """Build a character -> toneless pinyin lookup from a CC-CEDICT file.

    For every entry, the traditional headword is paired position by position
    with the whitespace-separated tokens inside the first ``[...]`` on the
    line. Each token has "u:" rewritten to "ü" and everything except ASCII
    letters and "ü" removed (which drops the tone digit), then is lower-cased.
    Tokens that contain no letters at all (e.g. "," or "·") still occupy their
    position, so the characters after them stay aligned, but they add no entry.

    Args:
        filename (str): Path to the CC-CEDICT text file.

    Returns:
        collections.defaultdict[str, set[str]]: character -> set of toneless,
        lower-case syllables. The result is memoized per ``filename`` and
        shared between callers, so treat it as read-only.
    """
    char_to_pinyin = defaultdict(set)

    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            if line.startswith("#"):
                continue  # Skip comment lines

            parts = line.split()
            if len(parts) < 3:
                continue  # Skip malformed lines

            traditional = parts[0]

            pinyin_bracket_match = re.search(r'\[(.*?)\]', line)
            if not pinyin_bracket_match:
                continue

            # Split BEFORE cleaning so punctuation tokens keep their slot.
            syllables = pinyin_bracket_match.group(1).replace("u:", "ü").split()

            for char, syllable in zip(traditional, syllables):
                toneless = re.sub(r"[^A-Za-z ü]", "", syllable)
                if toneless:
                    char_to_pinyin[char].add(toneless.lower())

    return char_to_pinyin

In [76]:
import regex
import regex as re
from pypinyin import pinyin, Style

@functools.lru_cache(maxsize=None)  # Infinite cache size
def load_manual_pinyins(filename="manual_pinyins.csv"):
    """Load manually maintained pinyin readings from a CSV file.

    The first row is treated as a header and skipped. In every following row
    with at least two cells, the first cell is the key and the second cell is
    a pinyin reading added to that key's set; shorter rows are ignored. An
    empty file yields an empty mapping.

    Args:
        filename (str): Path to the CSV file.

    Returns:
        dict[str, set[str]]: first-column value -> set of pinyin readings.
        The result is memoized per ``filename`` (use
        ``load_manual_pinyins.cache_clear()`` after editing the file).
    """
    pinyins = {}

    # newline='' lets the csv module handle line breaks inside quoted cells itself.
    with open(filename, 'r', encoding='utf-8', newline='') as csvfile:
        reader = csv.reader(csvfile)

        next(reader, None)  # Skip the header row; tolerate an empty file
        for row in reader:
            if len(row) < 2:
                continue  # Skip malformed rows

            pinyins.setdefault(row[0], set()).add(row[1])  # add the pinyin to the set of pinyins for this character

    return pinyins


def strip_tone_marks(pinyin_with_tone):
    """
    Replace every tone-marked pinyin vowel with its unmarked base vowel.

    Lower- and upper-case vowels are handled; "ü"/"Ü" keep their diaeresis
    (only the tone mark is dropped). All other characters pass through as-is.

    Args:
        pinyin_with_tone (str): Pinyin with tone marks

    Returns:
        str: Pinyin without tone marks
    """
    # base vowel -> the characters that reduce to it
    base_to_marked = {
        'a': 'āáǎà', 'e': 'ēéěè', 'i': 'īíǐì', 'o': 'ōóǒò', 'u': 'ūúǔù',
        'ü': 'ǖǘǚǜü',
        'A': 'ĀÁǍÀ', 'E': 'ĒÉĚÈ', 'I': 'ĪÍǏÌ', 'O': 'ŌÓǑÒ', 'U': 'ŪÚǓÙ',
        'Ü': 'ǕǗǙǛÜ',
    }
    plain_for = {
        marked: base
        for base, group in base_to_marked.items()
        for marked in group
    }
    return ''.join(plain_for.get(ch, ch) for ch in pinyin_with_tone)

# Full-width Chinese punctuation -> its ASCII counterpart.
convert_punc_dict = {
    "。": ".",
    "！": "!",
    "？": "?",
    "，": ",",
    "；": ";",
    "：": ":",
}


def convert_punc(char):
    """Return the ASCII counterpart of a full-width punctuation mark, or ``char`` itself if it has none."""
    return convert_punc_dict[char] if char in convert_punc_dict else char


In [77]:

def split_chinese_pinyin(example_sentence, trad_word=None, print_debug=False):
    """Align the Chinese and pinyin of an example sentence, retrying without parentheses.

    The first attempt runs ``split_chinese_pinyin_helper`` on the Chinese text
    as it is. If that raises ValueError, a second attempt is made with
    parenthesised parts removed from the Chinese text; an error from the
    second attempt propagates to the caller.
    """
    shared_kwargs = {"trad_word": trad_word, "print_debug": print_debug}
    try:
        return split_chinese_pinyin_helper(example_sentence, rmv_paren=False, **shared_kwargs)
    except ValueError:
        return split_chinese_pinyin_helper(example_sentence, rmv_paren=True, **shared_kwargs)

def split_chinese_pinyin_helper(example_sentence, rmv_paren, trad_word=None, print_debug=False):
    """
    Aligns the Chinese text of an example sentence with its pinyin, piece by piece.

    The Chinese text is walked left to right. For each Han character, the
    candidate readings (pypinyin heteronyms, manual overrides and CC-CEDICT
    toneless readings) are compared, tone-less and case-insensitively, with
    the start of the remaining pinyin. When nothing matches, one pinyin
    character is discarded (and remembered in 'ignored_pinyin') and the same
    Han character is tried again. Pinyin left over after the whole Chinese
    text has been consumed is moved to the front of the English text.

    Args:
        example_sentence (dict): Dictionary with 'chinese', 'pinyin' and
            'english' keys. It is modified in place.
        rmv_paren (bool): If True, parenthesised parts are removed from the
            Chinese text before aligning.
        trad_word (str | None): Only used in the debug print-out.
        print_debug (bool): If True, print the alignment state before raising.

    Returns:
        dict: The same example_sentence, with 'chinese_list', 'pinyin_list'
        and 'ignored_pinyin' added (and possibly 'english'/'pinyin' adjusted).

    Raises:
        ValueError: If the pinyin runs out before a Han character could be matched.
    """
    if rmv_paren:
        chinese_string = re.sub(r"\([^\(\)]*\)", "", example_sentence['chinese'])
    else:
        chinese_string = example_sentence['chinese']
    pinyin_string = example_sentence['pinyin']

    # Remove tones from the input pinyin string for matching.
    # strip_tone_marks maps character by character, so both strings keep the
    # same length and an offset in one is valid in the other.
    toneless_pinyin_string = ''.join(strip_tone_marks(char) for char in pinyin_string)

    chinese_list = []
    pinyin_list = []
    ignored_pinyin = ""

    i = 0
    remaining_pinyin = pinyin_string.strip()
    remaining_toneless_pinyin = toneless_pinyin_string.strip()

    while i < len(chinese_string):
        current_char = chinese_string[i]

        # Check if current character is Chinese using \p{Han} pattern
        if regex.match(r'\p{Han}', current_char):
            # Get all possible pinyins for this character
            possible_pinyins = list(set([p[0] for p in pinyin([current_char], style=Style.TONE, heteronym=True)]).union(load_manual_pinyins().get(current_char, set())).union(parse_cedict_toneless_pinyins().get(current_char, set())))

            # Tone-less versions of the candidates, longest first so that the
            # longest reading wins when one is a prefix of another.
            toneless_pinyins = sorted([strip_tone_marks(p) for p in possible_pinyins], key=len, reverse=True)

            # Try to find a match in the remaining pinyin string
            match_found = False

            for idx, possible_toneless_pinyin in enumerate(toneless_pinyins):
                # Case-insensitive matching with tone-less pinyin
                if remaining_toneless_pinyin.lower().startswith(possible_toneless_pinyin.lower()):
                    # NOTE(review): idx indexes the *sorted* toneless list, not
                    # possible_pinyins, so this may not be the corresponding
                    # reading; the value is not used afterwards.
                    original_pinyin = possible_pinyins[idx]

                    # Also swallow any whitespace that follows the syllable
                    whitespace_match = regex.match(r'^' + regex.escape(possible_toneless_pinyin) + r'(\s*)', remaining_toneless_pinyin, regex.IGNORECASE)
                    if whitespace_match:
                        toneless_pinyin_with_space = whitespace_match.group(0)

                        chinese_list.append(current_char)
                        # Same length in both strings, so slice the toned one.
                        pinyin_list.append(remaining_pinyin[:len(toneless_pinyin_with_space)])

                        # Remove the matched pinyin (with space) from the remaining strings
                        remaining_pinyin = remaining_pinyin[len(toneless_pinyin_with_space):]
                        remaining_toneless_pinyin = remaining_toneless_pinyin[len(toneless_pinyin_with_space):]
                        match_found = True
                        break

            if not match_found:
                # If no match found, try removing one character from the beginning of pinyin and try again
                if remaining_pinyin:
                    ignored_pinyin += remaining_pinyin[0]
                    remaining_pinyin = remaining_pinyin[1:]
                    remaining_toneless_pinyin = remaining_toneless_pinyin[1:]
                    i -= 1  # Stay on the same Chinese character to try again
                else:
                    if print_debug:
                        if trad_word:
                            print("Trad word:", trad_word)
                        print("Chinese list:", chinese_list)
                        print("Pinyin list:", pinyin_list)
                        # NOTE(review): this offset counts only matched pinyin,
                        # not the characters discarded into ignored_pinyin.
                        print("Pinyin remaining:", pinyin_string[len("".join(pinyin_list)):])
                        print("Possible Pinyins:", possible_pinyins)
                    raise ValueError(f"Could not match pinyin for Chinese character '{current_char}' at position {i}")

            # print(remaining_pinyin)
            # print(remaining_toneless_pinyin)
        elif current_char in convert_punc_dict and remaining_toneless_pinyin.startswith(convert_punc_dict.get(current_char)):
            # Full-width punctuation whose ASCII counterpart comes next in the
            # pinyin: consume it together with any whitespace that follows.
            whitespace_match = regex.match(r'^' + regex.escape(convert_punc_dict.get(current_char)) + r'(\s*)', remaining_toneless_pinyin, regex.IGNORECASE)
            if whitespace_match:
                toneless_pinyin_with_space = whitespace_match.group(0)

                chinese_list.append(current_char)
                pinyin_list.append(remaining_pinyin[:len(toneless_pinyin_with_space)])

                # Remove the matched pinyin (with space) from the remaining strings
                remaining_pinyin = remaining_pinyin[len(toneless_pinyin_with_space):]
                remaining_toneless_pinyin = remaining_toneless_pinyin[len(toneless_pinyin_with_space):]
            else:
                raise ValueError(f"Could not match pinyin for Chinese punctuation '{current_char}' at position {i}")
        else:
            # Collect the whole run of non-Han characters starting here
            non_chinese_segment = ""
            while i < len(chinese_string) and not regex.match(r'\p{Han}', chinese_string[i]):
                non_chinese_segment += chinese_string[i]
                i += 1

            # The scan stopped ON the next Han character (unless it ran off the
            # end); step back once so the unconditional i += 1 at the bottom of
            # the loop lands on that character instead of skipping it.
            if i < len(chinese_string):
                i -= 1

            # Add the non-Chinese segment to the lists
            if non_chinese_segment:
                chinese_list.append(non_chinese_segment)

                # Try to match and remove the non-Chinese segment from the pinyin string
                if remaining_pinyin.startswith(non_chinese_segment):
                    pinyin_list.append(non_chinese_segment)
                    remaining_pinyin = remaining_pinyin[len(non_chinese_segment):]
                    remaining_toneless_pinyin = remaining_toneless_pinyin[len(non_chinese_segment):]
                else:
                    # Handle case where non-Chinese characters might not appear in pinyin
                    pinyin_list.append(non_chinese_segment)

        i += 1

    # Leftover pinyin is treated as the beginning of the English text.
    if remaining_pinyin.strip():
        # print(remaining_pinyin.strip(), trad_word, chinese_list, pinyin_list)
        example_sentence['english'] = remaining_pinyin + " " + example_sentence['english']
        example_sentence['pinyin'] = example_sentence['pinyin'][:-len(remaining_pinyin)].strip()
        # NOTE(review): pinyin_list[-1] fails with IndexError if nothing was
        # aligned at all (e.g. an empty Chinese text) — confirm this cannot happen.
        pinyin_list[-1] = pinyin_list[-1].strip()
        # print(example_sentence)

    example_sentence['chinese_list'] = chinese_list
    example_sentence['pinyin_list'] = pinyin_list
    example_sentence['ignored_pinyin'] = ignored_pinyin.strip()

    return example_sentence

In [68]:
load_manual_pinyins.cache_clear()

# Bookmark
1. [x] Sort out problems with "令"
    1. [x] The number 1 in the middle of an English string
    2. [todo: if splitting the Chinese fails, convert everything to English] "song" was somehow labelled as pinyin, creating an example sentence where there isn't supposed to be one
    3. [x] 《叨叨令》 should be counted as Chinese -> the issue is the quotes in between // a single char between examples
1. [ ] Go through split_chinese_pinyin and detect really large leftover pinyins, then debug them

In [75]:
# Debug run: align Chinese and pinyin for every example sentence of the
# entry '故', printing the alignment state when an attempt fails.
for entry in flashcard_entries:
    if entry['traditional'] == '故':
        segments = label_segments(entry['definition'])
        # for seg in segments:
        #     print(seg)
        example_sentences = [segment for segment in segments if segment['label'] == 'example_sentence']
        for ex_s in example_sentences:
            try:
                split_chinese_pinyin(ex_s, entry['traditional'], print_debug=True)
            except Exception as e:
                # Report the failing sentence and stop with this entry.
                print(entry["traditional"], ex_s["chinese"], ex_s["pinyin"])
                print(e)
                break

Trad word: 故
Chinese list: ['親']
Pinyin list: ['qīng']
Pinyin remaining: ù
Possible Pinyins: ['gù', 'gu', 'shi']
Trad word: 故
Chinese list: ['親']
Pinyin list: ['qīng']
Pinyin remaining: ù
Possible Pinyins: ['gù', 'gu', 'shi']
故 親故 qīngù
Could not match pinyin for Chinese character '故' at position 1


In [71]:
import random

# Sweep all flashcard entries: run split_chinese_pinyin on each example
# sentence and remember the headword of every entry for which it raises.
# Only the first failing sentence of an entry is printed; the rest of that
# entry's sentences are skipped.
errors = []
for entry in flashcard_entries:
    segments = label_segments(entry['definition'])
    example_sentences = list(
        filter(lambda seg: seg['label'] == 'example_sentence', segments)
    )
    for ex_s in example_sentences:
        try:
            split_chinese_pinyin(ex_s, entry['traditional'])
        except Exception:
            errors.append(entry['traditional'])
            print(entry["traditional"], ex_s["chinese"], ex_s["pinyin"])
            break

故 親故 qīngù
令 (散曲) 


In [62]:
# Sanity check: the full-width exclamation mark "！" (U+FF01) and the ASCII
# "!" (U+0021) are different code points, so this comparison is False.
"！" == "!"

False

In [None]:
# Scratch note. These were three bare identifiers, which raise NameError as
# soon as the cell is executed (e.g. on Restart & Run All).
# Presumably headwords to revisit — TODO confirm:
#   參考
#   升級
#   故

# Format Entries

In [22]:
def fmt_entry(entry):
    """Build the opening HTML of a flashcard's back side: headword and pinyin.

    Args:
        entry: Mapping read through ``.get`` for the keys ``"traditional"``,
            ``"simplified"`` and ``"pinyin"``; each falls back to ``""`` when
            missing. ``"pinyin"`` is iterated item by item and every item
            must be a string.

    Returns:
        str: An HTML fragment made of the headword line (with the simplified
        form in ``〔…〕`` when it differs from the traditional one), the "PY"
        label, one coloured span per pinyin item (colour taken from
        ``get_pinyin_color``), and finally the closing of that block followed
        by a freshly opened ``<div align="left"><p>``.
    """
    traditional = entry.get("traditional", "")
    simplified = entry.get("simplified", "")
    pinyin = entry.get("pinyin", "")
    # The simplified form is only worth showing when it is actually different.
    simplified_hint = f"〔{simplified}〕" if traditional != simplified else ""

    parts = [
        f'<div align="left"><p><span style="font-size:32px">{traditional}{simplified_hint}</span><br/>\n',
        '<span style="color:#B4B4B4;"><b><span style="font-size:0.80em;">PY </span></b></span>',
    ]

    # pinyin
    # Separators that may prefix a pinyin item. No separator is a prefix of
    # another, so at most one can match at any point.
    separators = ("//", " ", "-", "→")
    for p in pinyin:
        # Peel leading separators off one at a time; each goes into its own
        # bold, uncoloured span ahead of the coloured syllable.
        while p.startswith(separators):
            sep = next(s for s in separators if p.startswith(s))
            parts.append(f'<span style="font-weight:600;">{sep}</span>')
            p = p[len(sep):]
        parts.append(
            f'<span style="color:{get_pinyin_color(p)};"><span style="font-weight:600;">{p}</span></span>'
        )

    # part of speech
    # Close the headword/pinyin block and open the next one; the content that
    # belongs inside it is not generated by this function.
    parts.append('</p>\n</div><div align="left"><p>')

    return "".join(parts)

# Grade Entries

In [24]:

def grade_fmt_entry(flashcard_entries, n_error_char_show=10, to_drop=None):
    """Compare ``fmt_entry`` output with each entry's stored ``formatted_back``.

    The entries are first passed through ``drop`` together with ``to_drop``.
    For every remaining entry the expected HTML is normalised (the ``' ;=""'``
    artefact is replaced by ``';'`` and the string is run through
    ``reorder_nested_spans``) and compared with ``fmt_entry(entry)``.
    Mismatches are printed as they are found and three totals are printed at
    the end.

    Args:
        flashcard_entries: Sequence of entry dicts. Each needs a
            ``'formatted_back'`` string; ``'definition'`` is read when the two
            strings differ only in length.
        n_error_char_show: How many characters to print starting at the
            point where the strings diverge.
        to_drop: Indices forwarded to ``drop``. ``None`` (the default) is
            treated as an empty list.

    Returns:
        None. All results are reported on stdout. The entry numbers printed
        are positions in the list returned by ``drop``, not in the input.
    """
    # Avoid a shared mutable default argument.
    if to_drop is None:
        to_drop = []
    flashcard_entries = drop(flashcard_entries, to_drop)

    correct_count = 0
    length_diff_count = 0
    wrong_count = 0
    for i, entry in enumerate(flashcard_entries):
        expected = entry['formatted_back']
        expected = expected.replace(' ;=""', ";")
        expected = reorder_nested_spans(expected)
        result = fmt_entry(entry)

        if expected == result:
            correct_count += 1
            continue

        # Position of the first differing character inside the shared prefix,
        # or None when one string is a strict prefix of the other.
        first_diff = next(
            (k for k, (exp_ch, got_ch) in enumerate(zip(expected, result)) if exp_ch != got_ch),
            None,
        )
        if first_diff is not None:
            j = first_diff
            print(f"Entry {i} differs at character {j}:")
            print(f"Exp: {repr(expected[j:j+n_error_char_show])}...")
            print(f"Got: {repr(result[j:j+n_error_char_show])}...")
            wrong_count += 1
        else:
            length_diff_count += 1
            j = min(len(expected), len(result))
            if len(expected) > len(result):
                # Result stops early: show what is still missing.
                print(f"{i}: {repr(expected[j:j+n_error_char_show])}...")
            else:
                # Result runs past the expected text: show the surplus rather
                # than an empty slice of `expected`.
                print(f"{i}: unexpected extra output {repr(result[j:j+n_error_char_show])}...")
            print(repr(entry['definition']))

    print(f"Total correct entries: {correct_count}")
    print(f"Total wrong entries: {wrong_count}")
    print(f"Total entries differing only in length: {length_diff_count}")

In [None]:
# Grade fmt_entry against the loaded entries, printing up to 1000 characters
# of context per mismatch and forwarding the notebook-level to_drop list.
grade_fmt_entry(flashcard_entries, 1000, to_drop=to_drop)

0: '<b><span style="font-size:0.80em;"><span style="color:#B4B4B4;">MEASURE WORD</span></span></b><br/>\n[for boats or ships]<br/>\n</p>\n<blockquote style="border-left: 2px solid #0078c3; margin-left: 3px; padding-left: 1em; margin-top: 0px; margin-bottom: 0px;"><p><span style="color:#0078C3;">兩</span><span style="color:#0078C3;"><b>艘</b></span><span style="color:#0078C3;">油船</span><br/>\n<span style="font-weight:600;">Liǎng </span><b>sōu</b><span style="font-weight:600;"> yóuchuán</span><br/>\ntwo tankers<br/>\n</p>\n</blockquote>\n<blockquote style="border-left: 2px solid #0078c3; margin-left: 3px; padding-left: 1em; margin-top: 0px; margin-bottom: 0px;"><p><span style="color:#0078C3;">一</span><span style="color:#0078C3;"><b>艘</b></span><span style="color:#0078C3;">飛船</span><br/>\n<span style="font-weight:600;">Yī </span><b>sōu</b><span style="font-weight:600;"> fēichuán</span><br/>\na spaceship<br/>\n</p>\n</blockquote>\n<blockquote style="border-left: 2px solid #0078c3; margin-lef