# Load JMdict file into a Pandas data frame
Download JMdict from https://www.edrdg.org/wiki/index.php/JMdict-EDICT_Dictionary_Project and save it as `./data/JMdict_e_examp.xml`, the path this notebook reads.

In [1]:
import numpy as np
import pandas as pd
from xmltodict import parse

## Load the xml file into a big `dict`

In [4]:
# Read the whole JMdict XML file into memory as a single string.
# The encoding is passed explicitly so the Japanese text is decoded the same
# way on every platform instead of depending on the locale's default encoding
# (JMdict is distributed as UTF-8).
with open('./data/JMdict_e_examp.xml', 'r', encoding='utf-8') as file:
    xml_jmdict = file.read()

In [5]:
# Parse the XML text into nested dicts/lists with xmltodict.
# NOTE(review): disable_entities=False presumably lets the entity references
# JMdict defines in its DTD (e.g. the part-of-speech tags) expand to their full
# text, as seen in the example entry below -- confirm against the xmltodict docs.
dict_jmdict = parse(xml_jmdict, disable_entities=False)

## Example entry

In [6]:
# Display one hand-picked entry to show the nested structure produced by the parser.
# Alternative (disabled): display a randomly chosen entry instead.
# dict_jmdict['JMdict']['entry'][np.random.randint(0, len(dict_jmdict['JMdict']['entry']))]
dict_jmdict['JMdict']['entry'][101035]
# Alternative (disabled): another fixed example entry.
# dict_jmdict['JMdict']['entry'][42592]

OrderedDict([('ent_seq', '2096470'),
             ('r_ele', OrderedDict([('reb', 'そぼろ'), ('re_pri', 'spec1')])),
             ('sense',
              [OrderedDict([('pos', 'noun (common) (futsuumeishi)'),
                            ('field', 'food, cooking'),
                            ('gloss',
                             OrderedDict([('@xml:lang', 'eng'),
                                          ('#text',
                                           'minced meat or fish that is seasoned and fried (usu. served on top of rice)')]))]),
               OrderedDict([('pos',
                             'adjectival nouns or quasi-adjectives (keiyodoshi)'),
                            ('gloss',
                             [OrderedDict([('@xml:lang', 'eng'),
                                           ('#text', 'tattered')]),
                              OrderedDict([('@xml:lang', 'eng'),
                                           ('#text', 'ragged')]),
                              Ordere

## A helper function: collect values from dicts whose key matches a given value

In [7]:
def get_values_matching_keys_all(dict_list, key_to_match, values_to_match, key_to_return):
    """Group values from a list of dicts by the value stored under a matching key.

    For every dict in ``dict_list`` that contains ``key_to_match``, the value
    under that key is compared with each element of ``values_to_match``; on a
    match, the dict's ``key_to_return`` value is appended to that element's list.

    Parameters
    ----------
    dict_list : list of dict, or a single dict
        Items to scan. Anything that is not a list is treated as one item.
    key_to_match : hashable
        Key whose value is compared against ``values_to_match``.
    values_to_match : iterable of hashable
        Values to look for; each becomes a key of the result.
    key_to_return : hashable
        Key whose value is collected from every matching dict.

    Returns
    -------
    dict
        Maps every element of ``values_to_match`` to the list of collected
        values (in input order), or to ``None`` when nothing matched.

    Raises
    ------
    KeyError
        If a matching dict does not contain ``key_to_return``.
    """
    # A lone (non-list) item is wrapped so the loop below handles both shapes.
    if not isinstance(dict_list, list):
        dict_list = [dict_list]

    # Pre-seed the result so every requested value has a key, even when
    # dict_list is empty (previously that case returned an empty dict).
    dict_out = dict.fromkeys(values_to_match)

    for d in dict_list:
        if key_to_match not in d:
            continue

        for v in values_to_match:
            if d[key_to_match] == v:
                if dict_out[v]:
                    dict_out[v].append(d[key_to_return])
                else:
                    dict_out[v] = [d[key_to_return]]

    return dict_out

## Find dimensions to preallocate the destination df

In [8]:
# One destination row per dictionary entry.
N_ROWS = len(dict_jmdict['JMdict']['entry'])

# Largest number of senses carried by any entry; this sets how many
# sense_N / x_ref_N / pos_N / misc_N columns are created.
# A non-list 'sense' is a single sense and counts as 1, so the result is still
# correct when no entry has several senses. `default=0` keeps max() from
# raising on an empty entry list.
MAX_N_SENSES = max((len(k['sense']) if isinstance(k['sense'], list) else 1
                    for k in dict_jmdict['JMdict']['entry']),
                   default=0)

In [9]:
# Column layout of the wide frame: the two list-valued columns first, then one
# numbered column per sense for each attribute, grouped attribute by attribute
# (sense_1..N, x_ref_1..N, pos_1..N, misc_1..N).
cols = ['spelling', 'readings'] + [
    '{}_{}'.format(prefix, n)
    for prefix in ('sense', 'x_ref', 'pos', 'misc')
    for n in range(1, MAX_N_SENSES + 1)
]

## Parse the `dict` into a Pandas data frame

In [10]:
def _as_list(value):
    """Return `value` unchanged if it is already a list, otherwise wrap it in one."""
    return value if isinstance(value, list) else [value]


def _join_if_list(value, sep=', '):
    """Join a list of strings with `sep`; return any non-list value unchanged."""
    return sep.join(value) if isinstance(value, list) else value


# Build one plain dict per entry and create the frame in a single call at the
# end, instead of writing a pre-allocated frame one cell at a time with `.at`.
records = []

for k in dict_jmdict['JMdict']['entry']:
    row = {}

    # Spellings: entries without 'k_ele' get [None] so that later code can
    # always take element 0.
    if 'k_ele' in k:
        row['spelling'] = [d['keb'] for d in _as_list(k['k_ele'])]
    else:
        row['spelling'] = [None]

    # Readings: same [None] placeholder, so a missing 'r_ele' does not leave a
    # NaN that the later `x[0]` lookup would fail on.
    if 'r_ele' in k:
        row['readings'] = [d['reb'] for d in _as_list(k['r_ele'])]
    else:
        row['readings'] = [None]

    # Reset for every entry so a part of speech never leaks in from the
    # previous entry when the first sense does not specify one.
    stored_pos = None

    for j, s in enumerate(_as_list(k['sense']), start=1):
        if 'pos' in s:
            # Part of speech applies to later senses unless a new one is specified
            stored_pos = s['pos']
        if stored_pos is not None:
            row['pos_{}'.format(j)] = _join_if_list(stored_pos, '; ')

        if 'gloss' in s:
            d = get_values_matching_keys_all(s['gloss'], '@xml:lang', ['eng'], '#text')
            # The helper reports "no English gloss" as None; skip the column
            # in that case rather than passing None to str.join.
            eng_glosses = d.get('eng')
            if eng_glosses:
                row['sense_{}'.format(j)] = ', '.join(eng_glosses)

        if 'xref' in s:
            row['x_ref_{}'.format(j)] = _join_if_list(s['xref'])

        if 'misc' in s:
            row['misc_{}'.format(j)] = _join_if_list(s['misc'])

    records.append(row)

# `columns=cols` fixes the column order and fills absent keys with NaN;
# dtype=object matches the all-object frame the cell previously pre-allocated.
pd_jmdict = pd.DataFrame(records, columns=cols, dtype=object)

## Reshape the data frame to long format (one row per sense)

In [11]:
def _first_item(items):
    """Return the first element of a list-valued cell."""
    return items[0]


def _rest_joined(items):
    """Return the elements after the first as one comma-separated string, or None if there are none."""
    if len(items) > 1:
        return ', '.join(items[1:])
    return None


# Split each list-valued column into its main (first) element and a
# comma-separated string of the alternatives.
for new_col, transform, source_col in (
    ('main_spelling', _first_item, 'spelling'),
    ('main_reading', _first_item, 'readings'),
    ('alt_spellings', _rest_joined, 'spelling'),
    ('alt_readings', _rest_joined, 'readings'),
):
    pd_jmdict[new_col] = pd_jmdict[source_col].apply(transform)

In [12]:
# Unpivot the sense_N columns: one row per (entry, sense slot).
sense_value_cols = ['sense_{}'.format(n) for n in range(1, MAX_N_SENSES + 1)]
df_meanings = pd_jmdict.melt(
    id_vars=['main_spelling', 'alt_spellings', 'main_reading', 'alt_readings'],
    value_vars=sense_value_cols,
    var_name='sense_no',
    value_name='meaning',
)

# Drop the slots that hold no meaning and renumber the remaining rows.
df_meanings = df_meanings.dropna(subset=['meaning']).reset_index(drop=True)
# Turn the column label into the sense number, e.g. 'sense_3' -> 3.
df_meanings['sense_no'] = df_meanings['sense_no'].map(lambda label: int(label.split('_')[-1]))

In [13]:
# Unpivot the pos_N columns: one row per (entry, sense slot).
pos_value_cols = ['pos_{}'.format(n) for n in range(1, MAX_N_SENSES + 1)]
df_pos = pd_jmdict.melt(
    id_vars=['main_spelling', 'main_reading'],
    value_vars=pos_value_cols,
    var_name='sense_no',
    value_name='part_of_speech',
)

# Drop the slots without a part of speech and renumber the remaining rows.
df_pos = df_pos.dropna(subset=['part_of_speech']).reset_index(drop=True)
# Turn the column label into the sense number, e.g. 'pos_3' -> 3.
df_pos['sense_no'] = df_pos['sense_no'].map(lambda label: int(label.split('_')[-1]))

In [14]:
# Unpivot the x_ref_N columns: one row per (entry, sense slot).
xref_value_cols = ['x_ref_{}'.format(n) for n in range(1, MAX_N_SENSES + 1)]
df_xref = pd_jmdict.melt(
    id_vars=['main_spelling', 'main_reading'],
    value_vars=xref_value_cols,
    var_name='sense_no',
    value_name='xref',
)

# Drop the slots without a cross-reference and renumber the remaining rows.
df_xref = df_xref.dropna(subset=['xref']).reset_index(drop=True)
# The label has two underscores ('x_ref_3'), so the sense number is the last piece.
df_xref['sense_no'] = df_xref['sense_no'].map(lambda label: int(label.split('_')[-1]))

In [15]:
# Unpivot the misc_N columns: one row per (entry, sense slot).
misc_value_cols = ['misc_{}'.format(n) for n in range(1, MAX_N_SENSES + 1)]
df_misc = pd_jmdict.melt(
    id_vars=['main_spelling', 'main_reading'],
    value_vars=misc_value_cols,
    var_name='sense_no',
    value_name='remarks',
)

# Drop the slots without remarks and renumber the remaining rows.
df_misc = df_misc.dropna(subset=['remarks']).reset_index(drop=True)
# Turn the column label into the sense number, e.g. 'misc_3' -> 3.
df_misc['sense_no'] = df_misc['sense_no'].map(lambda label: int(label.split('_')[-1]))

In [16]:
# Attach part of speech, remarks and cross-references to each meaning row.
# Left joins keep every meaning even when an attribute is missing for a sense.
# NOTE(review): the join key is (main_spelling, main_reading, sense_no); if two
# distinct entries share the same spelling and reading, their rows would be
# matched against each other -- verify, and consider carrying an entry id.
merge_keys = ['main_spelling', 'main_reading', 'sense_no']

df = df_meanings
for attribute_frame in (df_pos, df_misc, df_xref):
    df = df.merge(attribute_frame, on=merge_keys, how='left')

df = df.sort_values(merge_keys)

In [17]:
# Show a small random sample of the result as a sanity check.
# A fixed random_state makes the displayed rows reproducible across
# Restart & Run All; change the seed to see a different sample.
df.sample(frac=0.0001, random_state=42)

Unnamed: 0,main_spelling,alt_spellings,main_reading,alt_readings,sense_no,meaning,part_of_speech,remarks,xref
44928,博引旁証,博引傍証,はくいんぼうしょう,,1,citing copious references,noun (common) (futsuumeishi); noun or particip...,yojijukugo,
205597,,,やあ,,2,Wow! (expression of surprise),interjection (kandoushi),,
53265,老人ホーム,,ろうじんホーム,,1,"retirement home, old people's home, senior cit...",noun (common) (futsuumeishi),,
44976,白眼視,,はくがんし,,1,"looking coldly on, looking upon disapprovingly...",noun (common) (futsuumeishi); noun or particip...,,
87655,共同義務者,,きょうどうぎむしゃ,,1,co-debtor,noun (common) (futsuumeishi),,
120214,グローバル宣言,,グローバルせんげん,,1,global declaration,noun (common) (futsuumeishi),,
13428,,,ローラー,,1,roller,noun (common) (futsuumeishi),,
3593,,,ガスタービン,ガス・タービン,1,gas turbine,noun (common) (futsuumeishi),,
181975,,,チクル,,1,chicle (main ingredient of chewing gum),noun (common) (futsuumeishi),,
87188,道路計画,,どうろけいかく,,1,road plan,noun (common) (futsuumeishi),,


## Export the Pandas data frame to a parquet file

In [18]:
# Write the final long-format frame to disk as a parquet file.
# NOTE(review): the frame's index is whatever the preceding sort left behind
# and is presumably written with the data by default -- confirm that is intended.
df.to_parquet('./data/jmdict.parquet')