In [1]:
import logging
from pathlib import Path

import pandas as pd
from tqdm.notebook import tqdm

import createJLPTDeck

In [2]:
# Emit every record from DEBUG upwards, prefixed with a timestamp and the level name.
logging.basicConfig(level=logging.DEBUG, format="%(asctime)s %(levelname)s: %(message)s")

# Register tqdm's pandas integration (adds the `progress_*` methods to pandas objects).
tqdm.pandas()

In [11]:
# Project-specific initialisation; see `createJLPTDeck.setup` for what it prepares.
createJLPTDeck.setup()

# Load the dictionary (plus what is presumably its tag mapping) from the zipped JSON release.
logging.info("Loading jmdict from zipped json...")
jmdict, jmdict_tags_mapping = createJLPTDeck.load_jmdict_json_zip(Path("original_data/jmdict-eng-3.6.1.zip"))

# Load the JLPT-by-level word lists from the .csv file(s) in the data folder.
logging.info("Loading JLPT words from csvs...")
df = createJLPTDeck.extract_jlpt_csvs_from_folder(Path("original_data"))

# The cleaning/lookup stage is run by the notebook's own `transform` function in
# the cells below, so it is intentionally not invoked here.
# TODO: the final "prepare for Anki" step (previously sketched here as `load(df)`)
# is not wired up yet.


2025-11-26 18:48:59,767 INFO: Loading jmdict from zipped json...
2025-11-26 18:48:59,768 INFO: Extraction skipped; jmdict json already exists: original_data/jmdict-eng-3.6.1.json
2025-11-26 18:49:01,496 INFO: Loading JLPT words from csvs...


In [12]:
def transform(df: pd.DataFrame, jmdict: pd.DataFrame, jmdict_tags_mapping: dict) -> pd.DataFrame:
    """Clean the JLPT word list and enrich it with dictionary information.

    The caller's frame is never modified; all work happens on a copy.

    Args:
        df: Word list as loaded from the JLPT CSVs. After cleaning it must
            expose a ``jmdict_seq`` column, which is used for the lookup.
        jmdict: Dictionary data handed to ``createJLPTDeck.lookup_dict``.
        jmdict_tags_mapping: Tag mapping handed to
            ``createJLPTDeck.prepare_word_record``.

    Returns:
        The cleaned rows joined column-wise with their lookup results, after
        ``createJLPTDeck.prepare_word_record`` has been applied.
    """
    cleaned = createJLPTDeck.clean(df.copy())

    def lookup_row(record: pd.Series):
        # Each CSV row is resolved in the dictionary through its sequence id.
        return createJLPTDeck.lookup_dict(record["jmdict_seq"], jmdict)

    # One lookup per row; the results are expanded into their own columns.
    looked_up = cleaned.apply(lookup_row, axis=1, result_type="expand")

    # Put the dictionary columns next to the original CSV columns.
    combined = pd.concat([cleaned, looked_up], axis=1)

    # Note: a further `finalise` step is currently disabled.
    return createJLPTDeck.prepare_word_record(combined, jmdict_tags_mapping)


In [13]:
# Run the notebook's `transform` on the loaded word list and display the result.
# NOTE(review): `df` is rebound to its own transformed version, so re-running
# this cell on its own feeds already-transformed data back into `transform`;
# re-run the loading cell first.
df = transform(df, jmdict, jmdict_tags_mapping)
df

2025-11-26 18:49:01,512 DEBUG: Duplicated jmdict_seq rows dropped:
2025-11-26 18:49:01,512 DEBUG: 24      1483185
172     1390020
188     1578150
243     1004500
271     1579470
         ...   
8089    1606950
8108    1025690
8189    1076120
8282    1144860
8290    1146810
Name: jmdict_seq, Length: 532, dtype: int64


Unnamed: 0,jmdict_seq,kana,kanji,waller_definition,jlpt_level,reading_kanji,reading_kana,english_definition,grammar,additional,misc,reduced_additional,usually_kana,reading,formality,expression,tags
0,1198180,あう,会う,to meet,N5,会う,あう,"to meet, to encounter, to see","Godan verb with 'u' ending, intransitive verb","[[to have an accident, to have a bad experience]]",[],"to have an accident, to have a bad experience",False,会[あ]う,[],会う,[]
1,1381380,あお,青,blue,N5,青,あお,"blue, azure","noun, nouns which may take the genitive case p...",[[green]],[],green,False,青[あお],[],青,[]
2,1381390,あおい,青い,blue,N5,青い,あおい,"blue, azure",adjective (keiyoushi),"[[green], [pale (facial color), gray, grey], [...",[],"green, pale (facial color), gray, grey, unripe...",False,青[あお]い,[],青い,[]
3,2013900,あか,赤,red,N5,赤,あか,"red, crimson, scarlet","noun, nouns which may take the genitive case p...",[],[],,False,赤[あか],[],赤,[]
4,1383240,あかい,赤い,red,N5,赤い,あかい,"red, crimson, scarlet, vermilion",adjective (keiyoushi),"[[Red, communist]]",[],communist,False,赤[あか]い,[],赤い,[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8287,1146160,レンタカー,,hire car (lit: rent-a-car),N1,,レンタカー,"rental car, rent-a-car, hire car, hired car",noun,"[[car rental agency, car hire agency, hire car...",[],"car rental agency, car hire agency, hire car a...",False,レンタカー,[],レンタカー,[]
8288,1146230,レントゲン,,X-ray (lit: Roentgen),N1,,レントゲン,X-ray,noun,[[roentgen (unit of ionizing radiation)]],[abbr],roentgen (unit of ionizing radiation),False,レントゲン,[],レントゲン,[]
8289,1146750,ロープ,,rope,N1,,ロープ,rope,noun,[],[],,False,ロープ,[],ロープ,[]
8291,1148010,ロマンチック,,romantic,N1,,ロマンチック,romantic,adjectival nouns or quasi-adjectives (keiyodoshi),[],[],,False,ロマンチック,[],ロマンチック,[]


In [14]:
import urllib

from bs4 import BeautifulSoup

In [15]:
def getAudio(wordKanji: str, wordKana: str, save_path: Path, timeout: float = 10.0) -> bool:
    """Download the Jisho.org pronunciation audio for a word.

    The word is searched on Jisho.org using both its kanji and kana forms, and
    the first ``<audio>`` element is accepted only when it sits inside an
    ``exact_block`` container (i.e. belongs to an exact match).

    Args:
        wordKanji (str): Kanji form of the word; may be empty for kana-only words.
        wordKana (str): Kana form of the word.
        save_path (Path): File the mp3 is written to.
        timeout (float): Seconds to wait on each HTTP request.

    Returns:
        bool: True when the mp3 is present at ``save_path`` afterwards (either
        freshly downloaded or already existing), False otherwise. Network
        failures are reported as False rather than raised.
    """
    # Import the submodules explicitly: a bare ``import urllib`` does not
    # guarantee that ``urllib.parse`` / ``urllib.request`` are loaded.
    import http.client
    import urllib.parse
    import urllib.request

    save_path = Path(save_path)
    if save_path.is_file():
        logging.debug(f"{wordKana}/{wordKanji} already exists")
        return True

    logging.debug(f"Attempting to download {wordKana}/{wordKanji}")

    # Failures that mean "could not fetch", as opposed to programming errors.
    network_errors = (OSError, http.client.HTTPException)

    # Search using both kanji and kana to ensure the first result is the
    # desired one; forms that are missing are simply left out of the query.
    search_terms = [term for term in (wordKanji, wordKana) if isinstance(term, str) and term]
    if not search_terms:
        logging.debug(f"Failed to download {wordKana}/{wordKanji}")
        return False
    search = "https://jisho.org/search/" + urllib.parse.quote(" ".join(search_terms))

    try:
        with urllib.request.urlopen(search, timeout=timeout) as response:
            page = response.read()
    except network_errors as err:
        logging.debug(f"Search request failed for {wordKana}/{wordKanji}: {err}")
        return False

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(page, features="html.parser")
    audiotag = soup.find("audio")

    # Only trust audio that belongs to the exact-match block, and tolerate an
    # <audio> element without a usable <source src=...>.
    source_tag = None
    if audiotag is not None and audiotag.find_parent("div", {"class": "exact_block"}):
        source_tag = audiotag.find("source")  # assume audio would be first, if present
    audio_src = source_tag.get("src") if source_tag is not None else None

    if not audio_src:
        logging.debug(f"Failed to download {wordKana}/{wordKanji}")
        return False

    # The page's src has no scheme prefix; resolving it against the search URL
    # supplies one and also copes with absolute or relative values.
    audio_url = urllib.parse.urljoin(search, audio_src)

    try:
        # Read the whole payload before touching the disk so that a broken
        # transfer cannot leave a truncated mp3 that later looks "already there".
        with urllib.request.urlopen(audio_url, timeout=timeout) as response:
            audio_bytes = response.read()
        save_path.parent.mkdir(parents=True, exist_ok=True)
        save_path.write_bytes(audio_bytes)
    except network_errors as err:
        save_path.unlink(missing_ok=True)
        logging.debug(f"Failed to download {wordKana}/{wordKanji}: {err}")
        return False

    logging.info(f"audio for {wordKana}/{wordKanji} downloaded and saved in {save_path}")
    return True

In [16]:
def get_failed_audio_downloads(
    failed_audio_path: Path = Path("output/jisho_audio", "failed.csv"),
) -> pd.DataFrame:
    """Load the log of words whose Jisho audio download failed.

    The log is a header-less CSV with one ``jmdict_seq,reading_kanji,reading_kana``
    line per failed word.

    Args:
        failed_audio_path: Location of the log file.

    Returns:
        A frame with the columns ``jmdict_seq``, ``reading_kanji`` and
        ``reading_kana``. It is empty (but still has those columns) when the
        log does not exist or contains no data, so it can always be merged on.
    """
    column_names = ["jmdict_seq", "reading_kanji", "reading_kana"]
    # Typed empty result so that merging on these keys works without a log.
    no_failures = pd.DataFrame(
        {
            "jmdict_seq": pd.Series(dtype="int64"),
            "reading_kanji": pd.Series(dtype="object"),
            "reading_kana": pd.Series(dtype="object"),
        }
    )

    failed_audio_path = Path(failed_audio_path)
    if not failed_audio_path.is_file():
        return no_failures

    try:
        return pd.read_csv(failed_audio_path, header=None, names=column_names, encoding="utf-8")
    except pd.errors.EmptyDataError:
        # The file exists but nothing has been logged yet.
        return no_failures

In [17]:
# Load the log of words whose audio download failed on earlier runs, and display it.
failed_df = get_failed_audio_downloads()
failed_df

Unnamed: 0,jmdict_seq,reading_kanji,reading_kana
0,1381380,青,あお
1,1383240,赤い,あかい
2,1584640,明後日,あさって
3,1000320,彼処,あそこ
4,1483185,彼方,あちら
...,...,...,...
619,1221770,気の毒,きのどく
620,1591400,寄付,きふ
621,1609660,決まり,きまり
622,1222640,気味,きみ


In [18]:
# Flag the words whose audio download already failed on an earlier run.
failed_key_columns = ["jmdict_seq", "reading_kanji", "reading_kana"]

if failed_df is None:
    # No failure log is available, so nothing is known to have failed.
    df["failed_jisho_audio"] = False
else:
    # Duplicate log lines would multiply rows in the left merge below.
    failed_keys = failed_df[failed_key_columns].drop_duplicates()
    df_matched = df.merge(failed_keys, on=failed_key_columns, how="left", indicator=True)
    assert len(df_matched) == len(df), "left merge must keep exactly one row per word"
    # `merge` returns a fresh 0..n-1 index while `df` keeps the gaps left by
    # previously dropped rows, so the flag is assigned by position, not by label.
    df["failed_jisho_audio"] = (df_matched["_merge"] == "both").to_numpy()

In [None]:
# Download (or reuse) the Jisho audio for every word and record where it was saved.
audio_dir = Path("output/jisho_audio")
failed_log_path = audio_dir / "failed.csv"
audio_dir.mkdir(parents=True, exist_ok=True)

# One entry per row, in row order: the mp3 path, or None when no audio is available.
# Rows yielded by `iterrows` are copies, so results are collected here and written
# back to `df` in a single assignment after the loop.
jisho_audio_paths = []
for _, row in tqdm(df.iterrows(), total=df.shape[0]):
    saved_audio = None

    failed_flag = row["failed_jisho_audio"]
    if pd.notna(failed_flag) and bool(failed_flag):
        logging.debug(f"{row['reading_kanji']}/{row['reading_kana']} already failed")
    else:
        # Kana-only words have no kanji reading; treat anything that is not text as empty.
        kanji = row["reading_kanji"] if isinstance(row["reading_kanji"], str) else ""
        kana = row["reading_kana"] if isinstance(row["reading_kana"], str) else ""

        # Files are named after the kanji, falling back to the kana for kana-only words.
        # NOTE(review): words sharing the same kanji map to the same file — confirm
        # that this is acceptable.
        audio_path = audio_dir / f"{kanji or kana}.mp3"

        if getAudio(kanji, kana, audio_path):
            saved_audio = audio_path
        else:
            # Remember the failure so later runs can skip this word. The encoding is
            # explicit because the readings are Japanese text.
            with open(failed_log_path, "a", encoding="utf-8") as f:
                f.write(f"{row['jmdict_seq']},{row['reading_kanji']},{row['reading_kana']}\n")

    jisho_audio_paths.append(saved_audio)

df["jisho_audio"] = jisho_audio_paths

  0%|          | 0/7747 [00:00<?, ?it/s]

2025-11-26 18:49:09,859 DEBUG: あう/会う already exists
2025-11-26 18:49:09,860 DEBUG: 青/あお already failed
2025-11-26 18:49:09,860 DEBUG: あおい/青い already exists
2025-11-26 18:49:09,861 DEBUG: あか/赤 already exists
2025-11-26 18:49:09,861 DEBUG: 赤い/あかい already failed
2025-11-26 18:49:09,861 DEBUG: あかるい/明るい already exists
2025-11-26 18:49:09,862 DEBUG: あき/秋 already exists
2025-11-26 18:49:09,863 DEBUG: あく/開く already exists
2025-11-26 18:49:09,864 DEBUG: あける/開ける already exists
2025-11-26 18:49:09,865 DEBUG: あげる/上げる already exists
2025-11-26 18:49:09,866 DEBUG: あさ/朝 already exists
2025-11-26 18:49:09,869 DEBUG: あさごはん/朝ごはん already exists
2025-11-26 18:49:09,871 DEBUG: 明後日/あさって already failed
2025-11-26 18:49:09,871 DEBUG: あし/足 already exists
2025-11-26 18:49:09,873 DEBUG: あした/明日 already exists
2025-11-26 18:49:09,874 DEBUG: 彼処/あそこ already failed
2025-11-26 18:49:09,876 DEBUG: あそぶ/遊ぶ already exists
2025-11-26 18:49:09,876 DEBUG: あたたかい/暖かい already exists
2025-11-26 18:49:09,878 DEBUG: あたま/頭 already 

In [45]:
# Preview the first three rows of the working frame.
df.head(3)

Unnamed: 0,jmdict_seq,kana,kanji,waller_definition,jlpt_level,reading_kanji,reading_kana,english_definition,grammar,additional,misc,reduced_additional,usually_kana,reading,formality,expression,tags
0,1198180,あう,会う,to meet,N5,会う,あう,"to meet, to encounter, to see","Godan verb with 'u' ending, intransitive verb","[[to have an accident, to have a bad experience]]",[],"to have an accident, to have a bad experience",False,会[あ]う,[],会う,[]
1,1381380,あお,青,blue,N5,青,あお,"blue, azure","noun, nouns which may take the genitive case p...",[[green]],[],green,False,青[あお],[],青,[]
2,1381390,あおい,青い,blue,N5,青い,あおい,"blue, azure",adjective (keiyoushi),"[[green], [pale (facial color), gray, grey], [...",[],"green, pale (facial color), gray, grey, unripe...",False,青[あお]い,[],青い,[]
