# Utils

## Settings

In [7]:
import sys
import time
from typing import Tuple
import requests
from tqdm import tqdm
from datetime import datetime
from joblib import Parallel, delayed
import os
import json
import re
from joblib import Parallel, delayed

from utils.scraper import get_content, extract_ipa_for_language
from utils.file import save
from config import OUTPUT_DIRECTORY, ANKI_CONNECT_URL, DECK_NAME, LANGUAGE, VOCAB_FIELD, N_CORES, N_JOBS_EXTRACT, N_JOBS_UPDATE, DATE_FORMAT

TEST_PARSING = True
TEST_CREATING = True
TEST_UPDATING = True
TEST_RETRY_UPDATING = True

## Connection Functions

In [5]:
def request_anki(action, **params):
    request = {'action': action, 'params': params, 'version': 6}
    response = requests.post(ANKI_CONNECT_URL, json=request).json()
    if response.get('error'):
        raise Exception(response['error'])
    return response['result']

def fetch_all_deck_names():
    try:
        deck_names = request_anki('deckNames')
        print("Deck names:", deck_names)
        return deck_names
    except Exception as e:
        print(f"Error fetching deck names: {e}")
        return []
    
# Parse dates from filenames and find most recent
def parse_date(filename):
    # Extract date string from "anki@YYYYMMDD-HHMMSS.json" or "after_anki@YYYYMMDD-HHMMSS.json"
    pattern = re.compile(r"\w+@(\d{8}-\d{6}).json")
    match = pattern.search(filename)
    if match:
        date_str = match.group(1)
        return datetime.strptime(date_str, "%Y%m%d-%H%M%S")
    else:
        return None

## Util Functions

In [3]:
def get_vocab(note):
    return note['fields'][VOCAB_FIELD]['value'].strip().strip("-")

def get_ipa(note):
    return note['fields']['IPA']['value'].strip()


def load_anki_json(filename: str):
    if not (filename.startswith("anki@") or filename.startswith("after_anki@")) and not filename.endswith(".json"):
        raise ValueError(f"Invalid filename: {filename}")

    original_time = parse_date(filename).strftime(DATE_FORMAT)

    if original_time is None:
        raise ValueError(f"Invalid filename for parsing timestamp: {filename}")

    # Load and return the file contents
    with open(os.path.join(OUTPUT_DIRECTORY, filename), 'r', encoding='utf-8') as f:
        data = json.load(f)
        return data, original_time

def load_most_recent_anki_json():
    """Load the most recent anki json file from the outputs directory
    
    Returns:
        dict: The contents of the most recent anki json file
    """
    # List all anki json files in the output directory
    anki_files = [f for f in os.listdir(OUTPUT_DIRECTORY) if f.startswith("anki@") and f.endswith(".json")]
    
    if not anki_files:
        raise FileNotFoundError("No anki json files found in outputs directory")
    
    most_recent = max(anki_files, key=parse_date)
    print(f"Loading most recent file: {most_recent}")

    return load_anki_json(most_recent)

## Core Functions

In [None]:
def fetch_words_to_update(debug: bool = False, from_to: Tuple[int, int] = (0, 10_000), verbose: bool = False):
    # Fetch all Korean note IDs
    note_ids = request_anki('findNotes', query=f'deck:"{DECK_NAME}"')

    if debug:
        notes_info = request_anki('notesInfo', notes=note_ids[:10])
    else:
        notes_info = request_anki('notesInfo', notes=note_ids[from_to[0]:from_to[1]])
    
    if verbose:
        print(f"notes: {notes_info}")

    # Keep only those with empty IPA field
    notes_info = [note for note in notes_info if get_ipa(note) == ""]
    
    # Return words
    words_ids = {get_vocab(note): (note['noteId'], None) for note in notes_info} # The tuple contains the note ID and the IPA

    return words_ids

def extract_word_ipa__single(word, note_id, ipa):
    """Process a single word and return the results"""
    try:
        web_content = get_content(word, save_response=False)
        if not web_content:
            print(f"Error fetching content for word {word}")
            return word, (note_id, ipa), None

        result = extract_ipa_for_language(web_content, LANGUAGE, word)
        if not result:
            print(f"Error extracting IPA for word {word}")
            return word, (note_id, ipa), None

        return word, (note_id, result), result
    except Exception as e:
        print(f"Error processing word {word}: {e}")
        return word, (note_id, ipa), None

def update_card_ipa__single(word: str, note_id: int, ipa: str, extra_ipa: bool):
    """Update a single note's IPA fields"""
    try:
        # Leave the extra-IPA field empty in case
        if extra_ipa == True:
            updated_note = {
                'id': note_id,
                'fields': {
                    'IPA': ipa,
                    'Extra-IPA': "True",
                }
            }
        elif extra_ipa == False:
            updated_note = {
                'id': note_id,
                'fields': {
                    'IPA': ipa,
                }
            }
        else:
            raise ValueError(f"Invalid value for extra_ipa for {word}: should be either True or False, but is {extra_ipa}")

        # Update the note
        request_anki('updateNoteFields', note=updated_note)
        return word, True, None  # Success
    except Exception as e:
        return word, False, str(e)  # Error

# Main

## Parser Test

In [5]:
# possible words: "책", "저", "libro", "놀다", "오다", "돈", "돌"

if TEST_PARSING:
    word = "-되다"
    content = get_content(word, save_response=True)

    # ipa = extract_ipa_for_language(content, "italian", word)
    # if ipa:
    #     print(ipa)

    ipa = extract_ipa_for_language(content, "korean", word)
    if ipa:
        print(ipa)

Data saved to ./outputs/-되다_response_text_content.html as HTML
Korean section not found: -되다


## Anki

### Fetch And Generate

In [None]:
if TEST_CREATING:
    # Fetch words
    words_ids = fetch_words_to_update()
    print(f"Words to update: {len(words_ids)}")

    # Process words in parallel
    results = Parallel(n_jobs=N_JOBS_EXTRACT)(
        delayed(extract_word_ipa__single)(word, note_id, ipa) 
        for word, (note_id, ipa) in tqdm(words_ids.items())
    )

    # Process results
    skipped_dict = {}
    updated_words = {}

    for word, (note_id, result), success in results:
        if not success:
            skipped_dict[word] = (note_id, result)
        else:
            try:
                ipa, extra_ipa = result
                updated_words[word] = {"note_id": note_id, "ipa": ipa, "extra_ipa": extra_ipa}
            except Exception as e:
                print(f"Error updating word {word}: {e}")
                skipped_dict[word] = (note_id, result)

    # Save the output
    output = {
        'skipped_words': skipped_dict,
        'updated_words': updated_words
    }

    current_time = datetime.now().strftime(DATE_FORMAT)
    save(output, f"anki@{current_time}.json")

    len(updated_words)

Words to update: 165


100%|██████████| 165/165 [00:07<00:00, 21.60it/s]


Data saved to ./outputs/anki@20250212-205800.json as JSON


### Update

In [None]:
if TEST_UPDATING:
    # Usage
    anki_json, original_time = load_most_recent_anki_json()
    updated_words = anki_json.get('updated_words', {})

    # Prepare the arguments for parallel processing
    args = [(word, info['note_id'], info['ipa'], info['extra_ipa']) 
            for word, info in updated_words.items()]

    # Process in parallel with progress bar
    results = Parallel(n_jobs=N_JOBS_UPDATE)(
        delayed(update_card_ipa__single)(word, note_id, ipa, extra_ipa) 
        for word, note_id, ipa, extra_ipa in tqdm(args, desc="Updating IPAs")
    )

    # Process results
    success = []
    errors = []
    for word, status, error in results:
        if status:
            success.append(word)
        else:
            errors.append((word, error))

    # Save the output
    error_words = [word for word, error in errors]
    after_skipped_words = {word: info for word, info in updated_words.items() if word in error_words}
    after_updated_words = {word: info for word, info in updated_words.items() if word not in error_words}

    after_output = {
        "skipped_words": after_skipped_words,
        "updated_words": after_updated_words,
    }

    save(after_output, f"after_anki@{original_time}.json")

In [None]:
if TEST_RETRY_UPDATING:
    # Here we don't parallelize, since I only found errors originating from too many handles at the same time so far
    after_anki_json, _ = load_anki_json(f"after_anki@{original_time}.json")

    skipped_words = after_anki_json.get("skipped_words", {})

    args = [(word, info['note_id'], info['ipa'], info['extra_ipa']) 
            for word, info in skipped_words.items()]
    
    final_skipped_words = {}
    final_updated_words = {}

    for word, note_id, ipa, extra_ipa in tqdm(args, desc="Updating IPAs"):
        try:
            update_card_ipa__single(word, note_id, ipa, extra_ipa)
            final_updated_words[word] = {"note_id": note_id, "ipa": ipa, "extra_ipa": extra_ipa}
        except Exception as e:
            print(f"Error updating word {word}: {e}")
            final_skipped_words[word] = {"note_id": note_id, "ipa": ipa, "extra_ipa": extra_ipa}
            continue

    # Save the output
    final_output = {
        "skipped_words": final_skipped_words,
        "updated_words": final_updated_words,
    }

    save(final_output, f"final_anki@{original_time}.json")

In [9]:
after_anki_json, _ = load_anki_json(f"after_anki@20250217-171308.json")
if len(after_anki_json["skipped_words"]) == 0:
    print("There are still skipped words")
else :
    print("All words have been updated")
# skipped_words = after_anki_json.get("skipped_words", {})

There are still skipped words
