In [165]:
import json
import os
import re

import pandas as pd


In [137]:
with open("original_data/jmdict-eng-3.6.1.json", "r") as f:
    data = json.load(f)

jmdict = pd.DataFrame(data["words"])
jmdict

Unnamed: 0,id,kanji,kana,sense
0,1000000,[],"[{'common': False, 'text': 'ヽ', 'tags': [], 'a...","[{'partOfSpeech': ['unc'], 'appliesToKanji': [..."
1,1000010,[],"[{'common': False, 'text': 'ヾ', 'tags': [], 'a...","[{'partOfSpeech': ['unc'], 'appliesToKanji': [..."
2,1000020,[],"[{'common': False, 'text': 'ゝ', 'tags': [], 'a...","[{'partOfSpeech': ['unc'], 'appliesToKanji': [..."
3,1000030,[],"[{'common': False, 'text': 'ゞ', 'tags': [], 'a...","[{'partOfSpeech': ['unc'], 'appliesToKanji': [..."
4,1000040,"[{'common': False, 'text': '〃', 'tags': []}]","[{'common': False, 'text': 'おなじ', 'tags': [], ...","[{'partOfSpeech': ['n'], 'appliesToKanji': ['*..."
...,...,...,...,...
214164,5747433,"[{'common': False, 'text': '株探', 'tags': []}]","[{'common': False, 'text': 'かぶたん', 'tags': [],...","[{'partOfSpeech': ['n'], 'appliesToKanji': ['*..."
214165,5747436,"[{'common': False, 'text': 'Ｐｉｘ', 'tags': []},...","[{'common': False, 'text': 'ピックス', 'tags': [],...","[{'partOfSpeech': ['n'], 'appliesToKanji': ['*..."
214166,5747446,"[{'common': False, 'text': '日本専門医機構', 'tags': ...","[{'common': False, 'text': 'にほんせんもんいきこう', 'tag...","[{'partOfSpeech': ['n'], 'appliesToKanji': ['*..."
214167,5747447,"[{'common': False, 'text': '日本歯科専門医機構', 'tags'...","[{'common': False, 'text': 'にほんしかせんもんいきこう', 't...","[{'partOfSpeech': ['n'], 'appliesToKanji': ['*..."


In [138]:
jmdict["id"] = jmdict["id"].astype(int)

In [264]:
# What the short tags used in the dict mean
jmdict_tags_mapping = data["tags"]
# custom changes:
jmdict_tags_mapping["n"] = "noun"
jmdict_tags_mapping

{'v5uru': 'Godan verb - Uru old class verb (old form of Eru)',
 'v2g-s': "Nidan verb (lower class) with 'gu' ending (archaic)",
 'dei': 'deity',
 'ship': 'ship name',
 'leg': 'legend',
 'bra': 'Brazilian',
 'music': 'music',
 'quote': 'quotation',
 'pref': 'prefix',
 'ktb': 'Kantou-ben',
 'rK': 'rarely used kanji form',
 'derog': 'derogatory',
 'abbr': 'abbreviation',
 'exp': 'expressions (phrases, clauses, etc.)',
 'astron': 'astronomy',
 'v2g-k': "Nidan verb (upper class) with 'gu' ending (archaic)",
 'aux-v': 'auxiliary verb',
 'ctr': 'counter',
 'surg': 'surgery',
 'baseb': 'baseball',
 'serv': 'service',
 'genet': 'genetics',
 'geogr': 'geography',
 'dent': 'dentistry',
 'v5k-s': 'Godan verb - Iku/Yuku special class',
 'horse': 'horse racing',
 'ornith': 'ornithology',
 'v2w-s': "Nidan verb (lower class) with 'u' ending and 'we' conjugation (archaic)",
 'sK': 'search-only kanji form',
 'rk': 'rarely used kana form',
 'hob': 'Hokkaido-ben',
 'male': 'male term or language',
 'motor

In [141]:
def extract_addition_engl(entry):
    """
    Extract all the additional definitions for the word
    """
    adds = []
    # Every sense after the first
    for i in entry["sense"].iloc[0][1:]:
        accept = True
        for m in i["misc"]:
            # not an archaic usage, nor place name
            if m in ["arch", "place"]:
                accept = False
        # grammar usage is the same.
        # E.g. reject english definitions when the verb is intransitive rather than the primary definition usage
        if i["partOfSpeech"] != entry["sense"].iloc[0][0]["partOfSpeech"]:
            accept = False
        # Combine the texts together in an array
        if accept:
            adds.append([x["text"] for x in i["gloss"]])
    return adds

In [142]:
def lookup_dict(dict_id: int):
    """
    Using the jmdict in json form (https://github.com/scriptin/jmdict-simplified/) imported as a pandas dataframe.
    """
    print(dict_id)
    entry = jmdict[jmdict["id"] == dict_id]
    if len(entry) < 1:
        print(f"Not found {dict_id}")

    # Need .iloc[0][0] structure due to importing nested json into dataframe
    
    kanji = entry["kanji"].iloc[0][0]["text"] if len(entry["kanji"].iloc[0])  > 0 else ""
    kana = entry["kana"].iloc[0][0]["text"] if len(entry["kana"].iloc[0])  > 0 else ""

    additional = extract_addition_engl(entry)
            
    newdict = {
        # "expression": entry.kanji_forms[0],
        "reading_kanji": kanji,
        "reading_kana": kana,
        "english_definition": [x["text"] for x in entry["sense"].iloc[0][0]["gloss"]],
        "grammar": entry["sense"].iloc[0][0]["partOfSpeech"],
        "additional": additional,
        # "formality": entry["sense"].iloc[0][0]["misc"], #pol, hon,
        # "usually_kana": entry["sense"].iloc[0][0]["misc"], #uk
        "misc": entry["sense"].iloc[0][0]["misc"],
    }
    return newdict
# expression,reading,english_definition,grammar,additional,tags,japanese_reading

In [143]:
lookup_dict(2864817)

2864817


{'reading_kanji': '揚げる',
 'reading_kana': 'あげる',
 'english_definition': ['to deep-fry', 'to make deep-fried food'],
 'grammar': ['v1', 'vt'],
 'additional': [['to launch (fireworks, etc.)',
   'to hoist (e.g. a flag)',
   'to (let) fly (e.g. a kite)',
   'to set off'],
  ['to summon (for geishas, etc.)', 'to call in'],
  ['to land (e.g. a boat)', 'to come ashore', 'to move (something) onto land'],
  ['to suck up (water; e.g. a plant)', 'to absorb', 'to draw up']],
 'misc': []}

# Transform

In [144]:
def make_furigana(kanji: str, kana: str) -> str:
	"""Generate a furigana word from associated kanji and kana. Is able to handle words with kana between the kanji.

	E.g. (掃除する, そうじする) becomes 掃除[そうじ]する

	Args:
					kanji (string): Kanji of the word (can include kana as well).
					kana (string): Kana of the word
	Returns:
					string: Kanji word with furigana
	"""
	if not kana:
		assert(False, "No kana reading provided.")
		return
	if not kanji:
		return kana
	# what to put the furigana inside
	f_l = "["
	f_r = "]"

	# keep track of extra character spaces that are 'eaten' by kanjis
	tt = 0
	# furigana-kanji lists
	outWord = ""
	lastMatchLoc = 0
	fk = []
	# for each kanji in the word
	if kanji:
		# Search over kanji
		for m in re.finditer("[一-龯々]+", kanji):
			kanjiWordPos = m.span()[0]
			kanaWordPos = kanjiWordPos + tt

			# find the next furigana(s) in the kanji word
			searchLoc = m.span()[1]

			# Search over hiragana and katakana
			m2 = re.search(r"[ぁ-んァ-ヿ]+", kanji[searchLoc:])
			if m2:
				# find this kana match in the kana word
				searchLoc = searchLoc + tt
				m3 = re.search(m2.group(), kana[searchLoc:])
				# if no matching found, assume something wrong with the input
				if not m3:
					return ""

				# get the kana between these
				s = kana[kanaWordPos : searchLoc + m3.span()[0]]

				# update number of kanas 'eaten' by kanjis
				tt = tt + m3.span()[0]

			else:
				s = kana[kanaWordPos:]

			# the furigana'd kanji string, separated by space
			out = " " + m.group() + f_l + s + f_r
			outWord = outWord + kanji[lastMatchLoc:kanjiWordPos] + out
			fk.append(out)

			# update position of last kanji searched
			lastMatchLoc = m.span()[1]

	# update the out word for tailing kanas
	outWord = outWord + kanji[lastMatchLoc:]
	if outWord == "":
		logging.debug(f"Returning empty furigana-word for {kana}")
	return outWord.strip()


In [145]:
def transform_lookup(dictionary_dict):
    """
    Convert list entries into card-ready sentences/strings
    """

# Pipeline

In [146]:
def load_jlpt_csvs(folder_path):
    dfs = []
    
    for level in ["n5", "n4", "n3", "n2", "n1"]:
        csv_path = os.path.join(folder_path, f"{level}.csv")
        if os.path.exists(csv_path):
            df = pd.read_csv(csv_path)
            df["jlpt_level"] = level.upper()
            dfs.append(df)
    
    if dfs:
        merged_df = pd.concat(dfs, ignore_index=True)
        return merged_df
    else:
        return None


In [147]:
df = load_jlpt_csvs("original_data")
df

Unnamed: 0,jmdict_seq,kana,kanji,waller_definition,jlpt_level
0,1198180.0,あう,会う,to meet,N5
1,1381380.0,あお,青,blue,N5
2,1381390.0,あおい,青い,blue,N5
3,2013900.0,あか,赤,red,N5
4,1383240.0,あかい,赤い,red,N5
...,...,...,...,...,...
8288,1146230.0,レントゲン,,X-ray (lit: Roentgen),N1
8289,1146750.0,ロープ,,rope,N1
8290,1146810.0,ローマじ,ローマ字,"romanization, Roman letters",N1
8291,1148010.0,ロマンチック,,romantic,N1


In [148]:
df = df.dropna(subset=["jmdict_seq"])  # Drop any rows with NaN values
df["jmdict_seq"] = df["jmdict_seq"].astype(int) # convert from floats to ints
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


Unnamed: 0,jmdict_seq,kana,kanji,waller_definition,jlpt_level
0,1198180,あう,会う,to meet,N5
1,1381380,あお,青,blue,N5
2,1381390,あおい,青い,blue,N5
3,2013900,あか,赤,red,N5
4,1383240,あかい,赤い,red,N5
...,...,...,...,...,...
8288,1146230,レントゲン,,X-ray (lit: Roentgen),N1
8289,1146750,ロープ,,rope,N1
8290,1146810,ローマじ,ローマ字,"romanization, Roman letters",N1
8291,1148010,ロマンチック,,romantic,N1


In [149]:
def transform_w(df):
    # Drop duplicates in jmdict_seq, keeping the lowest/easiest level (which comes first in the df)
    dupes = df[df.duplicated(subset="jmdict_seq", keep="first")]
    print("Duplicated rows dropped:")
    print(dupes["jmdict_seq"])
    df = df.drop(dupes.index)


    # fill NAN kanji with blank, since these values should be empty, it's not an error
    df["kanji"] = df["kanji"].fillna("")
    return df

In [150]:
df = transform_w(df)
df

Duplicated rows dropped:
24      1483185
172     1390020
188     1578150
243     1004500
271     1579470
         ...   
8089    1606950
8108    1025690
8189    1076120
8282    1144860
8290    1146810
Name: jmdict_seq, Length: 532, dtype: int64


Unnamed: 0,jmdict_seq,kana,kanji,waller_definition,jlpt_level
0,1198180,あう,会う,to meet,N5
1,1381380,あお,青,blue,N5
2,1381390,あおい,青い,blue,N5
3,2013900,あか,赤,red,N5
4,1383240,あかい,赤い,red,N5
...,...,...,...,...,...
8287,1146160,レンタカー,,hire car (lit: rent-a-car),N1
8288,1146230,レントゲン,,X-ray (lit: Roentgen),N1
8289,1146750,ロープ,,rope,N1
8291,1148010,ロマンチック,,romantic,N1


In [151]:
df_lookup = df.apply(lambda x: lookup_dict(x["jmdict_seq"]), axis=1, result_type="expand")

1198180
1381380
1381390
2013900
1383240
1532350
1332650
1586270
1202450
1352320
1428280
1586330
1584640
1404630
1584660
1000320
1542160
1586420
1582310
1361490
1483185
1343460
1467720
1275320
1269320
1223615
1249900
1307630
1000430
1017760
1547450
1218380
1213400
1584930
1171900
1153520
1390930
1296400
1514320
2847612
2820690
1605820
1583250
1587040
1191730
2845606
1578850
1219960
1219980
1421700
1159980
1157070
1519290
1432680
1160790
1576260
1165970
1188760
1268570
1163400
1268070
1188890
1258330
1288850
1156800
1524590
1587610
1582820
1577980
1546640
1465610
1357600
1587850
1352130
1269410
1475480
1193180
1588120
1378690
1201190
1473950
1481920
1580340
1202270
1173720
1173750
1174420
1001140
1175140
1030630
1178590
1486650
1407460
1588880
1412890
1414220
1002650
1001710
1001820
1223640
1421850
1179330
1329015
1299685
1002320
1236900
2261490
1180470
1421970
1002430
1002100
1002590
1581930
1419990
1420010
1576050
1576060
1414170
1002610
1451750
1001830
1001990
1002330
2261500
2220600


In [152]:
df_lookup

Unnamed: 0,reading_kanji,reading_kana,english_definition,grammar,additional,misc
0,会う,あう,"[to meet, to encounter, to see]","[v5u, vi]","[[to have an accident, to have a bad experience]]",[]
1,青,あお,"[blue, azure]","[n, adj-no]",[[green]],[]
2,青い,あおい,"[blue, azure]",[adj-i],"[[green], [pale (facial color), gray, grey], [...",[]
3,赤,あか,"[red, crimson, scarlet]","[n, adj-no]",[],[]
4,赤い,あかい,"[red, crimson, scarlet, vermilion]",[adj-i],"[[Red, communist]]",[]
...,...,...,...,...,...,...
8287,,レンタカー,"[rental car, rent-a-car, hire car, hired car]",[n],"[[car rental agency, car hire agency, hire car...",[]
8288,,レントゲン,[X-ray],[n],[[roentgen (unit of ionizing radiation)]],[abbr]
8289,,ロープ,[rope],[n],[],[]
8291,,ロマンチック,[romantic],[adj-na],[],[]


In [153]:
df = pd.concat([df, df_lookup], axis=1)
df

Unnamed: 0,jmdict_seq,kana,kanji,waller_definition,jlpt_level,reading_kanji,reading_kana,english_definition,grammar,additional,misc
0,1198180,あう,会う,to meet,N5,会う,あう,"[to meet, to encounter, to see]","[v5u, vi]","[[to have an accident, to have a bad experience]]",[]
1,1381380,あお,青,blue,N5,青,あお,"[blue, azure]","[n, adj-no]",[[green]],[]
2,1381390,あおい,青い,blue,N5,青い,あおい,"[blue, azure]",[adj-i],"[[green], [pale (facial color), gray, grey], [...",[]
3,2013900,あか,赤,red,N5,赤,あか,"[red, crimson, scarlet]","[n, adj-no]",[],[]
4,1383240,あかい,赤い,red,N5,赤い,あかい,"[red, crimson, scarlet, vermilion]",[adj-i],"[[Red, communist]]",[]
...,...,...,...,...,...,...,...,...,...,...,...
8287,1146160,レンタカー,,hire car (lit: rent-a-car),N1,,レンタカー,"[rental car, rent-a-car, hire car, hired car]",[n],"[[car rental agency, car hire agency, hire car...",[]
8288,1146230,レントゲン,,X-ray (lit: Roentgen),N1,,レントゲン,[X-ray],[n],[[roentgen (unit of ionizing radiation)]],[abbr]
8289,1146750,ロープ,,rope,N1,,ロープ,[rope],[n],[],[]
8291,1148010,ロマンチック,,romantic,N1,,ロマンチック,[romantic],[adj-na],[],[]


In [154]:
# Do some data checks
df[df["kana"] != df["reading_kana"]]

Unnamed: 0,jmdict_seq,kana,kanji,waller_definition,jlpt_level,reading_kanji,reading_kana,english_definition,grammar,additional,misc
290,1005900,じゃ,,well then…,N5,,じゃあ,"[then, well, so, well then]",[conj],[],[]
365,1597150,たばこ,,"tobacco,cigarettes",N5,煙草,タバコ,"[tobacco, cigarette, cigar]",[n],[[tobacco plant (Nicotiana tabacum)]],[uk]
575,1584350,まいげつ,毎月,every month,N5,毎月,まいつき,"[every month, each month, monthly]","[adv, n]",[],[]
579,1584360,まいねん,毎年,every year,N5,毎年,まいとし,"[every year, yearly, annually]","[adv, n]",[],[]
611,1202150,みんな,,everyone,N5,皆,みな,"[everyone, everybody, all]","[n, pn, adv]",[],[uk]
...,...,...,...,...,...,...,...,...,...,...,...
8104,1022990,インターフォン,,intercom,N1,,インターホン,[intercom],[n],[],[]
8112,1031600,オーケー,ＯＫ,OK,N1,ＯＫ,オッケー,"[OK, okay]","[int, n, vs, vt, vi]",[],[]
8121,1037790,カテゴリー,,category,N1,,カテゴリ,[category],[n],[],[]
8200,1077960,チャンネル,,a channel,N1,,チャネル,[channel],[n],[],[]


In [155]:
foo = df[df["kanji"] != df["reading_kanji"]]
foo = foo[~foo["misc"].apply(lambda lst: "uk" in lst)]
foo[foo["kana"] != foo["reading_kana"]]

Unnamed: 0,jmdict_seq,kana,kanji,waller_definition,jlpt_level,reading_kanji,reading_kana,english_definition,grammar,additional,misc
1820,2234080,けち,,"stinginess,miser,miserliness",N3,吝嗇,ケチ,"[stinginess, miserliness, penny-pinching, mise...","[n, adj-na]","[[petty, narrow-minded, mean-spirited, small-m...",[]
3653,1576030,さきおととい,,two days before yesterday,N2,一昨昨日,いっさくさくじつ,"[two days before yesterday, three days back (a...","[n, adv]",[],[]
5545,1577800,がんぶつ,贋物,"imitation, counterfeit, forgery, sham",N1,偽物,にせもの,"[spurious article, forgery, counterfeit, imita...",[n],[],[]
6729,1580300,たなごころ,掌,the palm,N1,手のひら,てのひら,[palm (of the hand)],[n],[],[]


In [244]:
def filter_english_definitions(additional_lst, primary_eng_defns) -> str:
    """
    Grabs all the additional english definitions of the word. 
    
    E.g. 川 has a primary definition of "river", and 1 additional meaning as "the *something* river". This function returns "the *something* river".

    Remove duplicate entries.
    Limits total entries to not have too much info.

    Returns:
                    string: comma separated additional english definitions
    """
    letter_limit = 200 # How many letters to limit the return string
    first_defs = set(defn.lower() for defn in primary_eng_defns)
    
    # Use the rest of the english definitions, without repeating those
    filtered_defs = []
    seen = set()  # To track duplicates (case-insensitive).

    for senses in additional_lst:
        for s in senses:
        
        # for defn in sense.get("english_definitions", []):
            defn_lower = s.lower()
            # Add if not in first sense and not already seen
            if defn_lower not in first_defs and defn_lower not in seen:
                filtered_defs.append(s)
                seen.add(defn_lower)
    # Limit the total letters.
    letter_count = 0
    for i in range(len(filtered_defs)):
        single_def = filtered_defs[i]
        letter_count += len(single_def)
        if letter_count > letter_limit:
            filtered_defs = filtered_defs[:i]
            break
            
    return filtered_defs


In [245]:
filter_english_definitions([["old man", "mister"], ["manybar", "goatfish (Parupeneus)"]], ["uncle"])

['old man', 'mister', 'manybar', 'goatfish (Parupeneus)']

In [294]:
def transform(df):
    rdf = df.copy()
    rdf["english_definition"] = rdf["english_definition"].str.join(', ')
    rdf["grammar"] = df["grammar"].apply(lambda lst: [jmdict_tags_mapping[l] for l in lst])
    rdf["grammar"] = rdf["grammar"].str.join(', ')

    rdf["reduced_additional"] = df.apply(lambda x: filter_english_definitions(x["additional"], x["english_definition"]), axis=1)
    rdf["reduced_additional"] = rdf["reduced_additional"].str.join(', ')
    # rdf["reduced_additional"] = fdf["additional"].apply(
    #                                             lambda outer: "; ".join(
    #                                                 ", ".join(inner) for inner in outer
    #                                             )
    # )

    # Is the word usually written in kana?
    rdf["usually_kana"] = df["misc"].apply( lambda lst: True if "uk" in lst else False)

    # Make the furigana reading, but not necessary if the word is usually_kana
    rdf["reading"] = rdf.apply(
    		lambda row: row["reading_kana"] if row["usually_kana"]=="usually_kana" else \
                                make_furigana(row["reading_kanji"], row["reading_kana"]),
                            axis=1)

    # Formality of the word
    formal_tags = ["hon", "pol", "hum"]
    formal_map = {"hon": "honorific/尊敬語",
                  "pol": "polite/丁寧語",
                  "hum": "humble/ 謙譲語"}
    rdf["formality"] = df["misc"].apply(lambda lst: [x for x in lst if x in formal_tags])
    rdf["formality"] = rdf["formality"].apply(lambda lst: [formal_map[l] for l in lst])

    rdf["expression"] = df.apply(lambda x: x["reading_kanji"] if x["reading_kanji"] != "" else x["reading_kana"], axis=1)

    rdf["tags"] = rdf.apply( lambda x: "usually_kana" if x["usually_kana"] else  "", axis = 1)
    rdf["tags"] = rdf.apply( lambda x: x["formality"] + [x["tags"]], axis=1)
    
    rdf = rdf.drop(["kana", "kanji", "waller_definition", "additional", "misc", "reading_kanji", "reading_kana", "usually_kana", "formality"], axis=1)

    return rdf

In [297]:
transform(df)

Unnamed: 0,jmdict_seq,jlpt_level,english_definition,grammar,reduced_additional,reading,expression,tags
0,1198180,N5,"to meet, to encounter, to see","Godan verb with 'u' ending, intransitive verb","to have an accident, to have a bad experience",会[あ]う,会う,[]
1,1381380,N5,"blue, azure","noun, nouns which may take the genitive case p...",green,青[あお],青,[]
2,1381390,N5,"blue, azure",adjective (keiyoushi),"green, pale (facial color), gray, grey, unripe...",青[あお]い,青い,[]
3,2013900,N5,"red, crimson, scarlet","noun, nouns which may take the genitive case p...",,赤[あか],赤,[]
4,1383240,N5,"red, crimson, scarlet, vermilion",adjective (keiyoushi),communist,赤[あか]い,赤い,[]
...,...,...,...,...,...,...,...,...
8287,1146160,N1,"rental car, rent-a-car, hire car, hired car",noun,"car rental agency, car hire agency, hire car a...",レンタカー,レンタカー,[]
8288,1146230,N1,X-ray,noun,roentgen (unit of ionizing radiation),レントゲン,レントゲン,[]
8289,1146750,N1,rope,noun,,ロープ,ロープ,[]
8291,1148010,N1,romantic,adjectival nouns or quasi-adjectives (keiyodoshi),,ロマンチック,ロマンチック,[]
