In [1]:
import ast
import re

import pandas as pd
import numpy as np

In [2]:
json_file = ".cache/jlpt-n5.json"
df_data = pd.read_json(json_file)
df_data

Unnamed: 0,slug,is_common,tags,jlpt,japanese,senses,attribution
0,学校,True,[wanikani7],[jlpt-n5],"[{'word': '学校', 'reading': 'がっこう'}]","[{'english_definitions': ['school'], 'parts_of...","{'jmdict': True, 'jmnedict': True, 'dbpedia': ..."
1,川,True,"[wanikani1, wanikani26]","[jlpt-n3, jlpt-n5]","[{'word': '川', 'reading': 'かわ'}, {'word': '川',...","[{'english_definitions': ['river', 'stream'], ...","{'jmdict': True, 'jmnedict': True, 'dbpedia': ..."
2,手,True,[wanikani2],[jlpt-n5],"[{'word': '手', 'reading': 'て'}]","[{'english_definitions': ['hand', 'arm'], 'par...","{'jmdict': True, 'jmnedict': True, 'dbpedia': ..."
3,戸,True,[wanikani3],[jlpt-n5],"[{'word': '戸', 'reading': 'と'}, {'word': '門', ...",[{'english_definitions': ['door (esp. Japanese...,"{'jmdict': True, 'jmnedict': True, 'dbpedia': ..."
4,眼鏡,True,[wanikani34],"[jlpt-n1, jlpt-n5]","[{'reading': 'メガネ'}, {'word': '眼鏡', 'reading':...","[{'english_definitions': ['glasses', 'eyeglass...","{'jmdict': True, 'jmnedict': True, 'dbpedia': ..."
...,...,...,...,...,...,...,...
652,コップ,True,[],[jlpt-n5],"[{'word': '洋杯', 'reading': 'コップ'}, {'word': '洋...",[{'english_definitions': ['glass (drinking ves...,"{'jmdict': True, 'jmnedict': False, 'dbpedia':..."
653,フォーク,True,[],[jlpt-n5],"[{'reading': 'フォーク'}, {'reading': 'ホーク'}]","[{'english_definitions': ['fork'], 'parts_of_s...","{'jmdict': True, 'jmnedict': False, 'dbpedia':..."
654,ラジカセ,True,[],[jlpt-n5],[{'reading': 'ラジカセ'}],[{'english_definitions': ['radio-cassette play...,"{'jmdict': True, 'jmnedict': False, 'dbpedia':..."
655,マッチ,True,[],[jlpt-n5],[{'reading': 'マッチ'}],"[{'english_definitions': ['match (contest)'], ...","{'jmdict': True, 'jmnedict': False, 'dbpedia':..."


In [3]:
def extract_word_safe(val):
    if isinstance(val, list) and len(val) > 0:
        first = val[0]
        if isinstance(first, dict):
            return first.get('word', None)  # Use get to avoid KeyError
    return None

In [4]:
def filter_english_definitions(senses):
    """
    Coalates the all but the first english_definitions together. Ignores the definition if tagged as 'place' or 'wikipedia definition', as they seem to have worse definitions.
    Remove duplicate entries.
    Limits total entries to not have too much info.
    """
    letter_limit = 100 # How many characters to have
    first_defs = set(defn.lower() for defn in senses[0].get('english_definitions', []))
    
    # Use the rest of the english definitions, without repeating those
    filtered_defs = []
    seen = set()  # To track duplicates (case-insensitive).

    for sense in senses[1:]:
        # Check if parts_of_speech contains neither "Place" nor "Wikipedia definition"
        if any(pos in ["Place", "Wikipedia definition"] for pos in sense.get("parts_of_speech", [])):
            continue
        
        for defn in sense.get("english_definitions", []):
            defn_lower = defn.lower()
            # Add if not in first sense and not already seen
            if defn_lower not in first_defs and defn_lower not in seen:
                filtered_defs.append(defn)
                seen.add(defn_lower)
    # Limit the total letters.
    letter_count = 0
    for i in range(len(filtered_defs)):
        single_def = filtered_defs[i]
        letter_count += len(single_def)
        if letter_count > letter_limit:
            filtered_defs = filtered_defs[:i]
            break
            
    return ", ".join(filtered_defs)

In [5]:
def extractFormality(senses):
	"""
	Extracts the formality tags from a string array

	Args:
					senses: senses section of the jlpt word info
	"""
	tags = senses[0]["tags"]
	# a list of pairs. The first is the entry to accept. The latter is what will be provided into the final formality string
	accept = {
		"Humble (kenjougo) language": "humble/kenjougo",
		"Honorific or respectful (sonkeigo) language": "respectful/sonkeigo",
		"Polite (teineigo) language": "polite/teineigo",
	}
	formalities = []
	for t in tags:
		if t in accept:
			formalities.append(accept[t])
	return " ".join(formalities)


In [6]:
def make_furigana(kanji: str, kana: str) -> str:
	"""Generate a furigana word from associated kanji and kana. Is able to handle words with kana between the kanji.

	E.g. (掃除する, そうじする) becomes 掃除[そうじ]する

	Args:
					kanji (string): Kanji of the word (can include kana as well).
					kana (string): Kana of the word

	Returns:
					string: Kanji word with furigana
	"""
	# No value provided
	if not kana:
		return
    # no kanji provided
	if not kanji:
		return kana
	# what to put the furigana inside
	f_l = "["
	f_r = "]"

	# keep track of extra character spaces that are 'eaten' by kanjis
	tt = 0
	# furigana-kanji lists
	outWord = ""
	lastMatchLoc = 0
	fk = []
	# for each kanji in the word
	if kanji:
		for m in re.finditer("[一-龯々]+", kanji):
			kanjiWordPos = m.span()[0]
			kanaWordPos = kanjiWordPos + tt

			# find the next furigana(s) in the kanji word
			searchLoc = m.span()[1]
			m2 = re.search(r"[ぁ-んァヿ]+", kanji[searchLoc:])
			if m2:
				# find this kana match in the kana word
				searchLoc = searchLoc + tt
				m3 = re.search(m2.group(), kana[searchLoc:])
				# if no matching found, assume something wrong with the input
				if not m3:
					return ""

				# get the kana between these
				s = kana[kanaWordPos : searchLoc + m3.span()[0]]

				# update number of kanas 'eaten' by kanjis
				tt = tt + m3.span()[0]

			else:
				s = kana[kanaWordPos:]

			# the furigana'd kanji string, separated by space
			out = " " + m.group() + f_l + s + f_r
			outWord = outWord + kanji[lastMatchLoc:kanjiWordPos] + out
			fk.append(out)

			# update position of last kanji searched
			lastMatchLoc = m.span()[1]

	# update the out word for tailing kanas
	outWord = outWord + kanji[lastMatchLoc:]
	return outWord.strip()


In [7]:
columns_as_tags = ['usually_kana', 'jlpt', 'formality']

cols = [
        "slug",
        "expression",
        "reading",
        "english_definition",
        "grammar",
        "additional",
        "tags", # the card tags. Should be kept as the last column for Anki
    ]
df = pd.DataFrame(columns=cols)

df["slug"] = df_data["slug"]
df["english_definition"] = df_data["senses"].apply(
    lambda x: ", ".join(x[0]["english_definitions"])
)
df["expression"] = df_data["japanese"].apply(extract_word_safe)
df["japanese_reading"] = df_data["japanese"].apply(
    lambda x: x[0]["reading"]
)
df["grammar"] = df_data["senses"].apply(
    # remove text from () and []
    lambda x: re.sub("[\(\[].*?[\)\]]", "", ", ".join(x[0]["parts_of_speech"]))
)
df["additional"] = df_data["senses"].apply(filter_english_definitions)



df['jlpt'] = df_data["jlpt"].apply(
    lambda x: ', '.join(x)
)
df["usually_kana"] = df_data["senses"].apply(
    lambda x: "usually_kana"
        if ("Usually written using kana alone" in x[0]["tags"]) else ""
)
# formality of the word, append to jlpt tags info
df["formality"] = df_data["senses"].apply(extractFormality)

df["reading"] = df.apply(
    lambda row: row["japanese_reading"] if row["usually_kana"]=="usually_kana" else make_furigana(row["expression"], row["japanese_reading"]),
    axis=1
)
df = df.drop(["japanese_reading"], axis=1)

# make empty entries nan
df["tags"] = df[columns_as_tags].replace(r'^\s*$', np.nan, regex=True).apply(
    lambda row: ', '.join(row.dropna().astype(str)),
    axis=1
)

# Drop any words that are purely english. Words like ＰＥＴ
english_pattern = re.compile(r'^[A-Za-zＡ-Ｚａ-ｚ]+$')
df = df[~df["expression"].str.contains(english_pattern, regex=True, na=False)]

df = df.drop(columns_as_tags, axis=1)
df

Unnamed: 0,slug,expression,reading,english_definition,grammar,additional,tags
0,学校,学校,学校[がっこう],school,Noun,,jlpt-n5
1,川,川,川[かわ],"river, stream",Noun,the ... river,"jlpt-n3, jlpt-n5"
2,手,手,手[て],"hand, arm",Noun,"forepaw, foreleg, handle, worker, help, troubl...",jlpt-n5
3,戸,戸,戸[と],door (esp. Japanese-style),Noun,"shutter, window shutter, entrance (to a home),...",jlpt-n5
4,眼鏡,,メガネ,"glasses, eyeglasses, spectacles",Noun,"judgment, judgement, discrimination, discernme...","usually_kana, jlpt-n1, jlpt-n5"
...,...,...,...,...,...,...,...
651,コート,,コート,coat,Noun,coating,jlpt-n5
652,コップ,洋杯,コップ,"glass (drinking vessel), tumbler",Noun,cups (suit),"usually_kana, jlpt-n5"
653,フォーク,,フォーク,fork,Noun,forkball,jlpt-n5
654,ラジカセ,,ラジカセ,radio-cassette player,Noun,,jlpt-n5


In [8]:
df["expression"] = df[df["expression"].isnull()]["reading"]
df

Unnamed: 0,slug,expression,reading,english_definition,grammar,additional,tags
0,学校,,学校[がっこう],school,Noun,,jlpt-n5
1,川,,川[かわ],"river, stream",Noun,the ... river,"jlpt-n3, jlpt-n5"
2,手,,手[て],"hand, arm",Noun,"forepaw, foreleg, handle, worker, help, troubl...",jlpt-n5
3,戸,,戸[と],door (esp. Japanese-style),Noun,"shutter, window shutter, entrance (to a home),...",jlpt-n5
4,眼鏡,メガネ,メガネ,"glasses, eyeglasses, spectacles",Noun,"judgment, judgement, discrimination, discernme...","usually_kana, jlpt-n1, jlpt-n5"
...,...,...,...,...,...,...,...
651,コート,コート,コート,coat,Noun,coating,jlpt-n5
652,コップ,,コップ,"glass (drinking vessel), tumbler",Noun,cups (suit),"usually_kana, jlpt-n5"
653,フォーク,フォーク,フォーク,fork,Noun,forkball,jlpt-n5
654,ラジカセ,ラジカセ,ラジカセ,radio-cassette player,Noun,,jlpt-n5


In [12]:
df.iloc[655].index

Index(['slug', 'expression', 'reading', 'english_definition', 'grammar',
       'additional', 'tags'],
      dtype='object')

In [10]:
make_furigana(df.iloc[656]["word"], df.iloc[656]["reading"])

IndexError: single positional indexer is out-of-bounds

In [None]:
df_data["senses"].apply(
    lambda x: x[0]["info"]
)