In [1]:
import os

import pandas as pd
import pandera as pa
from pandera import Column, DataFrameSchema, Check
import requests

In [2]:
def load_jlpt_csvs(folder_path):
    dfs = []
    
    for level in ["n5", "n4", "n3", "n2", "n1"]:
        csv_path = os.path.join(folder_path, f"{level}.csv")
        if os.path.exists(csv_path):
            df = pd.read_csv(csv_path)
            df["jlpt_level"] = level.upper()
            dfs.append(df)
    
    if dfs:
        merged_df = pd.concat(dfs, ignore_index=True)
        return merged_df
    else:
        return None


In [3]:
df = load_jlpt_csvs("original_data")
df

Unnamed: 0,jmdict_seq,kana,kanji,waller_definition,jlpt_level
0,1198180.0,あう,会う,to meet,N5
1,1381380.0,あお,青,blue,N5
2,1381390.0,あおい,青い,blue,N5
3,2013900.0,あか,赤,red,N5
4,1383240.0,あかい,赤い,red,N5
...,...,...,...,...,...
8288,1146230.0,レントゲン,,X-ray (lit: Roentgen),N1
8289,1146750.0,ロープ,,rope,N1
8290,1146810.0,ローマじ,ローマ字,"romanization, Roman letters",N1
8291,1148010.0,ロマンチック,,romantic,N1


In [4]:
df = df.dropna()  # Drop any rows with NaN values
df["jmdict_seq"] = df["jmdict_seq"].astype(int) # convert from floats to ints
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


Unnamed: 0,jmdict_seq,kana,kanji,waller_definition,jlpt_level
0,1198180,あう,会う,to meet,N5
1,1381380,あお,青,blue,N5
2,1381390,あおい,青い,blue,N5
3,2013900,あか,赤,red,N5
4,1383240,あかい,赤い,red,N5
...,...,...,...,...,...
8096,1019210,アルカリ,亜爾加里,alkali,N1
8112,1031600,オーケー,ＯＫ,OK,N1
8157,1064050,ジーパン,Ｇパン,"jeans (lit: jeans pants), dungarees",N1
8193,1076650,ダース,打,dozen,N1


In [5]:
duplicate_counts = df['jmdict_seq'].value_counts()
repeats = duplicate_counts[duplicate_counts > 1]
repeats

1576150    3
1589780    3
1352570    3
1240530    3
1600650    3
          ..
1582390    2
2261500    2
1587040    2
1578850    2
1600910    2
Name: jmdict_seq, Length: 230, dtype: int64

In [6]:
df[df['jmdict_seq'] == 1579840]

Unnamed: 0,jmdict_seq,kana,kanji,waller_definition,jlpt_level
295,1579840,じゅう,十,Ten,N5
296,1579840,とお,十,Ten,N5


In [7]:
def transform_j(df):
    # Drop duplicates in jmdict_seq, keeping the lowest/easiest level (which comes first in the df)
    dupes = df[df.duplicated(subset="jmdict_seq", keep="first")]
    print("Duplicated rows dropped:")
    print(dupes["jmdict_seq"])
    df = df.drop(dupes.index)
    return df

In [8]:
transform_j(df)
df

Duplicated rows dropped:
172     1390020
188     1578150
271     1579470
280     1319210
296     1579840
         ...   
7890    1502690
7911    1584820
7970    1605820
8088    1606880
8089    1606950
Name: jmdict_seq, Length: 237, dtype: int64


Unnamed: 0,jmdict_seq,kana,kanji,waller_definition,jlpt_level
0,1198180,あう,会う,to meet,N5
1,1381380,あお,青,blue,N5
2,1381390,あおい,青い,blue,N5
3,2013900,あか,赤,red,N5
4,1383240,あかい,赤い,red,N5
...,...,...,...,...,...
8096,1019210,アルカリ,亜爾加里,alkali,N1
8112,1031600,オーケー,ＯＫ,OK,N1
8157,1064050,ジーパン,Ｇパン,"jeans (lit: jeans pants), dungarees",N1
8193,1076650,ダース,打,dozen,N1


## My ones from jisho

In [9]:
def load_my_csvs(folder_path):
    dfs = []
    
    for level in ["n5", "n4", "n3", "n2", "n1"]:
        csv_path = os.path.join(folder_path, f"jlpt-{level}-normal.csv")
        if os.path.exists(csv_path):
            df = pd.read_csv(csv_path)
            # df["jlpt_level"] = level.upper()
            dfs.append(df)
    
    if dfs:
        merged_df = pd.concat(dfs, ignore_index=True)
        return merged_df
    else:
        return None


In [10]:
f = load_my_csvs("output")
f

Unnamed: 0,expression,reading,english_definition,grammar,additional,tags,japanese_reading
0,学校,学校[がっこう],school,Noun,,['jlpt-n5'],がっこう
1,川,川[かわ],"river, stream",Noun,the ... river,"['jlpt-n3', 'jlpt-n5']",かわ
2,手,手[て],"hand, arm",Noun,"forepaw, foreleg, handle, worker, help, troubl...",['jlpt-n5'],て
3,戸,戸[と],door (esp. Japanese-style),Noun,"shutter, window shutter, entrance (to a home),...",['jlpt-n5'],と
4,メガネ,メガネ,"glasses, eyeglasses, spectacles",Noun,"judgment, judgement, discrimination, discernme...","['usually_kana', 'jlpt-n1', 'jlpt-n5']",メガネ
...,...,...,...,...,...,...,...
7730,ぼつぼつ,ぼつぼつ,"gradually, little by little, in a breezy manner","Adverb , Adverb taking the 'to' particle","soon, before long, here and there, scattered, ...",['jlpt-n1'],
7731,げっそり,げっそり,"to be disheartened, to be dejected, to be disc...","Suru verb, Adverb taking the 'to' particle","suddenly (losing a lot of weight), looking ema...",['jlpt-n1'],
7732,嫌に,嫌[いや]に,"awfully, terribly",Adverb,,"['usually_kana', 'jlpt-n1']",
7733,ふんだん,ふんだん,"plentiful, abundant, ample, lavish",Na-adjective,,['jlpt-n1'],


In [11]:
f[f["tags"].apply(lambda lst: "jlpt-n5" in lst)]

Unnamed: 0,expression,reading,english_definition,grammar,additional,tags,japanese_reading
0,学校,学校[がっこう],school,Noun,,['jlpt-n5'],がっこう
1,川,川[かわ],"river, stream",Noun,the ... river,"['jlpt-n3', 'jlpt-n5']",かわ
2,手,手[て],"hand, arm",Noun,"forepaw, foreleg, handle, worker, help, troubl...",['jlpt-n5'],て
3,戸,戸[と],door (esp. Japanese-style),Noun,"shutter, window shutter, entrance (to a home),...",['jlpt-n5'],と
4,メガネ,メガネ,"glasses, eyeglasses, spectacles",Noun,"judgment, judgement, discrimination, discernme...","['usually_kana', 'jlpt-n1', 'jlpt-n5']",メガネ
...,...,...,...,...,...,...,...
6704,何れ,何[ど]れ,which (of three or more),Pronoun,"well, now, c'mon","['usually_kana', 'jlpt-n3', 'jlpt-n1', 'jlpt-n5']",
6705,彼方,彼方[あちら],"that way, that direction, over there, yonder",Pronoun,"that (one), that person, there, foreign countr...","['usually_kana', 'jlpt-n1', 'jlpt-n5']",
6706,７日,７ 日[のか],7th day of the month,Noun,seven days,"['jlpt-n1', 'jlpt-n5']",
6707,然うして,然[そ]うして,"and, like that",Conjunction,,"['usually_kana', 'jlpt-n2', 'jlpt-n1', 'jlpt-n5']",


In [12]:
df[df["jlpt_level"]=="N5"]

Unnamed: 0,jmdict_seq,kana,kanji,waller_definition,jlpt_level
0,1198180,あう,会う,to meet,N5
1,1381380,あお,青,blue,N5
2,1381390,あおい,青い,blue,N5
3,2013900,あか,赤,red,N5
4,1383240,あかい,赤い,red,N5
...,...,...,...,...,...
678,1311110,わたし,私,"I,myself",N5
679,1444610,わたす,渡す,to hand over,N5
680,1444680,わたる,渡る,to go across,N5
681,1151260,わるい,悪い,bad,N5


In [13]:
dfw = df[df["jlpt_level"]=="N5"].reset_index(drop=True)
dfj = f[f["tags"].apply(lambda lst: "jlpt-n5" in lst)].reset_index(drop=True)

dfj = dfj.drop_duplicates(subset=["expression", "reading", "english_definition", "grammar"]).reset_index(drop=True)

In [14]:
def dataframe_diff(dfj, dfw, match_rules):
    # Compute matches as a boolean matrix
    matches = pd.DataFrame(False, index=dfj.index, columns=dfw.index)
    
    for rule in match_rules:
        rule_matches = dfj.apply(lambda r1: dfw.apply(lambda r2: rule(r1, r2), axis=1), axis=1)
        matches = matches | rule_matches  # OR together all rules

    # Rows in dfj with no matches in dfw
    dfj_missing = dfj[~matches.any(axis=1)].copy()
    
    # Rows in dfw with no matches in dfj
    dfw_missing = dfw[~matches.any(axis=0)].copy()
    
    return dfj_missing, dfw_missing

In [15]:
rule_express_equals_kanji = lambda rj, rw: rj["expression"] == rw["kanji"]
rule_reading_equals_kana = lambda rj, rw: rj["japanese_reading"] == rw["kana"]

In [16]:
dfj_missing, dfw_missing = dataframe_diff(dfj, dfw, [rule_express_equals_kanji, rule_reading_equals_kana])

In [17]:
dfj_missing

Unnamed: 0,expression,reading,english_definition,grammar,additional,tags,japanese_reading
4,メガネ,メガネ,"glasses, eyeglasses, spectacles",Noun,"judgment, judgement, discrimination, discernme...","['usually_kana', 'jlpt-n1', 'jlpt-n5']",メガネ
5,煙草,煙草[タバコ],"tobacco, cigarette, cigar",Noun,tobacco plant (Nicotiana tabacum),"['usually_kana', 'jlpt-n1', 'jlpt-n5']",タバコ
73,椅子,椅子[いす],"chair, seat, stool, bench",Noun,"post, position, office","['jlpt-n3', 'jlpt-n5']",いす
129,茶碗,茶碗[ちゃわん],"rice bowl, tea cup, teacup",Noun,,"['jlpt-n2', 'jlpt-n5']",ちゃわん
143,可愛い,可愛[かわ]い,"cute, adorable, charming, lovely, pretty",I-adjective,"dear, precious, darling, pet, innocent, childl...","['usually_kana', 'jlpt-n1', 'jlpt-n5']",かわいい
...,...,...,...,...,...,...,...
649,コート,コート,coat,Noun,coating,['jlpt-n5'],コート
650,洋杯,洋杯[コップ],"glass (drinking vessel), tumbler",Noun,cups (suit),"['usually_kana', 'jlpt-n5']",コップ
651,フォーク,フォーク,fork,Noun,forkball,['jlpt-n5'],フォーク
652,ラジカセ,ラジカセ,radio-cassette player,Noun,,['jlpt-n5'],ラジカセ


In [18]:
dfw_missing

Unnamed: 0,jmdict_seq,kana,kanji,waller_definition,jlpt_level
113,1203090,かいだん,階段,Stairs,N5
154,1578150,きゅう,九,Nine,N5
155,1578150,く,九,Nine,N5
347,1253020,とりにく,とり肉,chicken meat,N5
379,1313000,は,歯,tooth,N5
383,1607260,はく,履く,"to wear,to put on trousers",N5
492,1577670,めがね,眼鏡,glasses,N5
507,1012980,やる,遣る,to do,N5


In [19]:
dfj

Unnamed: 0,expression,reading,english_definition,grammar,additional,tags,japanese_reading
0,学校,学校[がっこう],school,Noun,,['jlpt-n5'],がっこう
1,川,川[かわ],"river, stream",Noun,the ... river,"['jlpt-n3', 'jlpt-n5']",かわ
2,手,手[て],"hand, arm",Noun,"forepaw, foreleg, handle, worker, help, troubl...",['jlpt-n5'],て
3,戸,戸[と],door (esp. Japanese-style),Noun,"shutter, window shutter, entrance (to a home),...",['jlpt-n5'],と
4,メガネ,メガネ,"glasses, eyeglasses, spectacles",Noun,"judgment, judgement, discrimination, discernme...","['usually_kana', 'jlpt-n1', 'jlpt-n5']",メガネ
...,...,...,...,...,...,...,...
649,コート,コート,coat,Noun,coating,['jlpt-n5'],コート
650,洋杯,洋杯[コップ],"glass (drinking vessel), tumbler",Noun,cups (suit),"['usually_kana', 'jlpt-n5']",コップ
651,フォーク,フォーク,fork,Noun,forkball,['jlpt-n5'],フォーク
652,ラジカセ,ラジカセ,radio-cassette player,Noun,,['jlpt-n5'],ラジカセ


weiler's resources seem pretty close in comparison. Needs some sorting is all

In [20]:
schema = DataFrameSchema({
    "jmdict_seq": Column(
        int,
        checks=[Check.ge(0)],  # unique and >= 0
        nullable=False
    ),
    "kana": Column(
        str,
        nullable=False
    ),
    "kanji": Column(
        str,
        nullable=False
    ),
    "waller_definition": Column(
        str,
        nullable=False
    )
})


In [21]:
df = df.dropna()  # Drop any rows with NaN values
df["jmdict_seq"] = df["jmdict_seq"].astype(int) # convert from floats to ints


In [77]:
df = schema.validate(df)

df

Unnamed: 0,jmdict_seq,kana,kanji,waller_definition,jlpt_level
0,1198180,あう,会う,to meet,N5
1,1381380,あお,青,blue,N5
2,1381390,あおい,青い,blue,N5
3,2013900,あか,赤,red,N5
4,1383240,あかい,赤い,red,N5
...,...,...,...,...,...
8096,1019210,アルカリ,亜爾加里,alkali,N1
8112,1031600,オーケー,ＯＫ,OK,N1
8157,1064050,ジーパン,Ｇパン,"jeans (lit: jeans pants), dungarees",N1
8193,1076650,ダース,打,dozen,N1


In [23]:
df.compare(validated_df)

In [24]:
def lookup_word(search_term):
    base_url = "https://jisho.org/api/v1/search/words"

    params = {
        "keyword": search_term,
        # "page": page,
    }
    response = requests.get(base_url, params=params)

    print(response.url)

    if response.status_code != 200:
        logging.error(f"Failed to fetch page {page}, status code: {response.status_code}")

    return response

In [25]:
lookup_word("会う").json()

https://jisho.org/api/v1/search/words?keyword=%E4%BC%9A%E3%81%86


{'meta': {'status': 200},
 'data': [{'slug': '会う',
   'is_common': True,
   'tags': ['wanikani5', 'wanikani52'],
   'jlpt': ['jlpt-n2', 'jlpt-n5'],
   'japanese': [{'word': '会う', 'reading': 'あう'},
    {'word': '逢う', 'reading': 'あう'},
    {'word': '遭う', 'reading': 'あう'},
    {'word': '遇う', 'reading': 'あう'}],
   'senses': [{'english_definitions': ['to meet', 'to encounter', 'to see'],
     'parts_of_speech': ["Godan verb with 'u' ending", 'Intransitive verb'],
     'links': [],
     'tags': [],
     'restrictions': [],
     'see_also': [],
     'antonyms': [],
     'source': [],
     'info': ['逢う is often used for close friends, etc. and may be associated with drama or pathos; 遭う may have an undesirable nuance']},
    {'english_definitions': ['to have an accident',
      'to have a bad experience'],
     'parts_of_speech': ["Godan verb with 'u' ending", 'Intransitive verb'],
     'links': [],
     'tags': ['Usually written using kana alone'],
     'restrictions': [],
     'see_also': [],

In [26]:
lookup_word("1198180").json()

https://jisho.org/api/v1/search/words?keyword=1198180


{'meta': {'status': 200}, 'data': []}

In [27]:
def check_jisho_response(w_data, j_r_data):
    first_result = j_r_data[0]
    jisho_kanji = first_result["japanese"][0]["word"] # match?
    jisho_kana = first_result["japanese"][0]["reading"] # match?


# Jamdict

In [28]:
from jamdict import Jamdict
jam = Jamdict()

# use wildcard matching to find anything starts with 食べ and ends with る
result = jam.lookup('食べ%る')

# print all word entries
for entry in result.entries:
     print(entry)

[id#1358280] たべる (食べる) : 1. to eat ((Ichidan verb|transitive verb)) 2. to live on (e.g. a salary)/to live off/to subsist on ((Ichidan verb|transitive verb))
[id#1358300] たべすぎる (食べ過ぎる) : to overeat ((Ichidan verb|transitive verb))
[id#1852290] たべつける (食べ付ける) : to be used to eating ((Ichidan verb|transitive verb))
[id#2145280] たべはじめる (食べ始める) : to start eating ((Ichidan verb))
[id#2449430] たべかける (食べ掛ける) : to start eating ((Ichidan verb))
[id#2671010] たべなれる (食べ慣れる) : to be used to eating/to become used to eating/to be accustomed to eating/to acquire a taste for ((Ichidan verb))
[id#2765050] たべられる (食べられる) : 1. to be able to eat ((Ichidan verb|intransitive verb)) 2. to be edible/to be good to eat ((pre-noun adjectival (rentaishi)))
[id#2795790] たべくらべる (食べ比べる) : to taste and compare several dishes (or foods) of the same type ((Ichidan verb|transitive verb))
[id#2807470] たべあわせる (食べ合わせる) : to eat together (various foods) ((Ichidan verb))
[id#2841209] たべあきる (食べ飽きる) : to get tired of eating/to hav

In [29]:
result.entries[0][0]

to eat ((Ichidan verb|transitive verb))

In [30]:
type(result.entries[0][0])

jamdict.jmdict.Sense

In [117]:
jam.lookup('id#1223021')

Found nothing

In [91]:
def extract_formality(jmdict_entry):
    """
    Extracts the formality tags from a string array
    
    Args:
                    jmdict_entry: jam lookup single entry for a word
    Returns:
                    array: formalities this word/sense is
    """
    acceptable_formalities = ["humble (kenjougo) language", "honorific or respectful (sonkeigo) language", "polite (teineigo) language"]
    # a list of pairs. The first is the entry to accept (case insensitive).
    # The latter is what will be provided into the final formality string
    accept = {
        "humble (kenjougo) language": "humble/謙譲語",
        "honorific or respectful (sonkeigo) language": "respectful/尊敬語",
        "polite (teineigo) language": "polite/丁寧語",
    }
    formalities = []
    for t in jmdict_entry.senses[0].misc:
        # print(t)
        if t.lower() in acceptable_formalities:
            formalities.append(t)
    return formalities


In [114]:
def lookup_dict(dict_id: int):
    entries = jam.lookup(f"id#{dict_id}").entries
    if len(entries) < 1:
        print(dict_id)
        return None
    entry = jam.lookup(f"id#{dict_id}").entries[0]
    newdict = {
        "expression": entry.kanji_forms[0],
        "reading_kanji": entry.kanji_forms[0],
        "reading_kan": entry.kana_forms[0],
        "english_definition": entry.senses[0].gloss,
        "grammar": entry.senses[0].pos,
        "additional": [entry.senses[i].gloss for i in range(1,len(entry.senses))],
        "formality": extract_formality(entry),
        "usually_kana": 'word usually written using kana alone' in entry.senses[0].misc
    }
    return newdict
# expression,reading,english_definition,grammar,additional,tags,japanese_reading

In [95]:
lookup_dict(1002610)

{'expression': お腹,
 'reading_kanji': お腹,
 'reading_kan': おなか,
 'english_definition': [belly, abdomen, stomach],
 'grammar': ['noun (common) (futsuumeishi)'],
 'additional': [],
 'formality': ['polite (teineigo) language'],
 'usually_kana': False}

In [120]:
jam.lookup(f"id#1223020")

[Entries]。#1: きそく (規則) : rule/regulation | [Chars]。規, 則

In [115]:
df_new_cols = df.apply(lambda x: lookup_dict(x["jmdict_seq"]), axis=1, result_type="expand")

1223021
2857380
2859161
2853884
2863051
2864817
2864818
2857436
2850084
2856318
2861111
2857870
2862924
2859682
2859332
2858986
2854424
2857913
2855480
2858064
2859761
2866134
2862467


IndexError: list index out of range

In [None]:
df_new_cols

In [84]:
pd.concat([df, df_new_cols], axis=1)

Unnamed: 0,jmdict_seq,kana,kanji,waller_definition,jlpt_level,0
0,1198180,あう,会う,to meet,N5,
1,1381380,あお,青,blue,N5,
2,1381390,あおい,青い,blue,N5,
3,2013900,あか,赤,red,N5,
4,1383240,あかい,赤い,red,N5,
...,...,...,...,...,...,...
8096,1019210,アルカリ,亜爾加里,alkali,N1,
8112,1031600,オーケー,ＯＫ,OK,N1,
8157,1064050,ジーパン,Ｇパン,"jeans (lit: jeans pants), dungarees",N1,
8193,1076650,ダース,打,dozen,N1,


# jmdict json

In [176]:
import json
import ast

In [150]:
with open("original_data/jmdict-eng-3.6.1.json", "r") as f:
    data = json.load(f)

jmdict = pd.DataFrame(data["words"])
jmdict

Unnamed: 0,id,kanji,kana,sense
0,1000000,[],"[{'common': False, 'text': 'ヽ', 'tags': [], 'a...","[{'partOfSpeech': ['unc'], 'appliesToKanji': [..."
1,1000010,[],"[{'common': False, 'text': 'ヾ', 'tags': [], 'a...","[{'partOfSpeech': ['unc'], 'appliesToKanji': [..."
2,1000020,[],"[{'common': False, 'text': 'ゝ', 'tags': [], 'a...","[{'partOfSpeech': ['unc'], 'appliesToKanji': [..."
3,1000030,[],"[{'common': False, 'text': 'ゞ', 'tags': [], 'a...","[{'partOfSpeech': ['unc'], 'appliesToKanji': [..."
4,1000040,"[{'common': False, 'text': '〃', 'tags': []}]","[{'common': False, 'text': 'おなじ', 'tags': [], ...","[{'partOfSpeech': ['n'], 'appliesToKanji': ['*..."
...,...,...,...,...
214164,5747433,"[{'common': False, 'text': '株探', 'tags': []}]","[{'common': False, 'text': 'かぶたん', 'tags': [],...","[{'partOfSpeech': ['n'], 'appliesToKanji': ['*..."
214165,5747436,"[{'common': False, 'text': 'Ｐｉｘ', 'tags': []},...","[{'common': False, 'text': 'ピックス', 'tags': [],...","[{'partOfSpeech': ['n'], 'appliesToKanji': ['*..."
214166,5747446,"[{'common': False, 'text': '日本専門医機構', 'tags': ...","[{'common': False, 'text': 'にほんせんもんいきこう', 'tag...","[{'partOfSpeech': ['n'], 'appliesToKanji': ['*..."
214167,5747447,"[{'common': False, 'text': '日本歯科専門医機構', 'tags'...","[{'common': False, 'text': 'にほんしかせんもんいきこう', 't...","[{'partOfSpeech': ['n'], 'appliesToKanji': ['*..."


In [218]:
# What the short tags used in the dict mean
jmdict_tags_mapping = data["tags"]
jmdict_tags_mapping

{'v5uru': 'Godan verb - Uru old class verb (old form of Eru)',
 'v2g-s': "Nidan verb (lower class) with 'gu' ending (archaic)",
 'dei': 'deity',
 'ship': 'ship name',
 'leg': 'legend',
 'bra': 'Brazilian',
 'music': 'music',
 'quote': 'quotation',
 'pref': 'prefix',
 'ktb': 'Kantou-ben',
 'rK': 'rarely used kanji form',
 'derog': 'derogatory',
 'abbr': 'abbreviation',
 'exp': 'expressions (phrases, clauses, etc.)',
 'astron': 'astronomy',
 'v2g-k': "Nidan verb (upper class) with 'gu' ending (archaic)",
 'aux-v': 'auxiliary verb',
 'ctr': 'counter',
 'surg': 'surgery',
 'baseb': 'baseball',
 'serv': 'service',
 'genet': 'genetics',
 'geogr': 'geography',
 'dent': 'dentistry',
 'v5k-s': 'Godan verb - Iku/Yuku special class',
 'horse': 'horse racing',
 'ornith': 'ornithology',
 'v2w-s': "Nidan verb (lower class) with 'u' ending and 'we' conjugation (archaic)",
 'sK': 'search-only kanji form',
 'rk': 'rarely used kana form',
 'hob': 'Hokkaido-ben',
 'male': 'male term or language',
 'motor

In [229]:
jmdict[jmdict["id"]==2864817]

Unnamed: 0,id,kanji,kana,sense
203912,2864817,"[{'common': False, 'text': '揚げる', 'tags': []}]","[{'common': False, 'text': 'あげる', 'tags': [], ...","[{'partOfSpeech': ['v1', 'vt'], 'appliesToKanj..."


In [240]:
jmdict[jmdict["id"]==2864817]["sense"].iloc[0]

[{'partOfSpeech': ['v1', 'vt'],
  'appliesToKanji': ['*'],
  'appliesToKana': ['*'],
  'related': [],
  'antonym': [],
  'field': [],
  'dialect': [],
  'misc': [],
  'info': [],
  'languageSource': [],
  'gloss': [{'lang': 'eng',
    'gender': None,
    'type': None,
    'text': 'to deep-fry'},
   {'lang': 'eng',
    'gender': None,
    'type': None,
    'text': 'to make deep-fried food'}]},
 {'partOfSpeech': ['v1', 'vt'],
  'appliesToKanji': ['*'],
  'appliesToKana': ['*'],
  'related': [],
  'antonym': [],
  'field': [],
  'dialect': [],
  'misc': [],
  'info': [],
  'languageSource': [],
  'gloss': [{'lang': 'eng',
    'gender': None,
    'type': None,
    'text': 'to launch (fireworks, etc.)'},
   {'lang': 'eng',
    'gender': None,
    'type': None,
    'text': 'to hoist (e.g. a flag)'},
   {'lang': 'eng',
    'gender': None,
    'type': None,
    'text': 'to (let) fly (e.g. a kite)'},
   {'lang': 'eng', 'gender': None, 'type': None, 'text': 'to set off'}]},
 {'partOfSpeech': ['v

In [254]:
def extract_addition_engl(entry):
    """
    Extract all the additional definitions for the word
    """
    adds = []
    for i in entry["sense"].iloc[0][1:]:
        if:
            # some checks like not oldfashioned,
            pass
        adds.append([x["text"] for x in i["gloss"]])
    return adds

In [255]:
def lookup_dict(dict_id: int):
    """
    Using the jmdict in json form (https://github.com/scriptin/jmdict-simplified/) imported as a pandas dataframe.
    """
    entry = jmdict[jmdict["id"] == dict_id]
    if len(entry) < 1:
        print(f"Not found {dict_id}")

    # Need .iloc[0][0] structure due to importing nested json into dataframe
    
    kanji = entry["kanji"].iloc[0][0]["text"]
    kana = entry["kana"].iloc[0][0]["text"]

    additional = extract_addition_engl(entry)
            
    
    # entries = jam.lookup(f"id#{dict_id}").entries
    # if len(entries) < 1:
    #     print(dict_id)
    #     return None
    # entry = jam.lookup(f"id#{dict_id}").entries[0]
    newdict = {
        # "expression": entry.kanji_forms[0],
        "reading_kanji": kanji,
        "reading_kana": kana,
        "english_definition": [x["text"] for x in entry["sense"].iloc[0][0]["gloss"]],
        "grammar": entry["sense"].iloc[0][0]["partOfSpeech"],
        "additional": additional,
        # "formality": entry["sense"].iloc[0][0]["misc"], #pol, hon,
        # "usually_kana": entry["sense"].iloc[0][0]["misc"], #uk
        "misc": entry["sense"].iloc[0][0]["misc"],
    }
    return newdict
# expression,reading,english_definition,grammar,additional,tags,japanese_reading

In [258]:
lookup_dict(1201260)

{'reading_kanji': '海外',
 'reading_kana': 'かいがい',
 'english_definition': ['foreign', 'abroad', 'overseas'],
 'grammar': ['n', 'adj-no'],
 'additional': [],
 'misc': []}

In [202]:
[x["text"] for x in jmdict[jmdict["id"]==2864817]["sense"].iloc[0][0]["gloss"]]

['to deep-fry', 'to make deep-fried food']

In [236]:
for i in len(jmdict[jmdict["id"] == 2864817]["sense"].size):
    print(i)

TypeError: object of type 'int' has no len()

In [238]:
jmdict[jmdict["id"] == 2864817]["sense"].

(1,)

In [261]:
for i in jmdict[jmdict["id"]==2864817]["sense"].iloc[0][1:]:
    print(i)
    # print([x["text"] for x in i["gloss"]])
    print("")

{'partOfSpeech': ['v1', 'vt'], 'appliesToKanji': ['*'], 'appliesToKana': ['*'], 'related': [], 'antonym': [], 'field': [], 'dialect': [], 'misc': [], 'info': [], 'languageSource': [], 'gloss': [{'lang': 'eng', 'gender': None, 'type': None, 'text': 'to launch (fireworks, etc.)'}, {'lang': 'eng', 'gender': None, 'type': None, 'text': 'to hoist (e.g. a flag)'}, {'lang': 'eng', 'gender': None, 'type': None, 'text': 'to (let) fly (e.g. a kite)'}, {'lang': 'eng', 'gender': None, 'type': None, 'text': 'to set off'}]}

{'partOfSpeech': ['v1', 'vt'], 'appliesToKanji': ['*'], 'appliesToKana': ['*'], 'related': [], 'antonym': [], 'field': [], 'dialect': [], 'misc': [], 'info': [], 'languageSource': [], 'gloss': [{'lang': 'eng', 'gender': None, 'type': None, 'text': 'to summon (for geishas, etc.)'}, {'lang': 'eng', 'gender': None, 'type': None, 'text': 'to call in'}]}

{'partOfSpeech': ['v1', 'vt'], 'appliesToKanji': ['*'], 'appliesToKana': ['*'], 'related': [], 'antonym': [], 'field': [], 'dialec