# Load KANJIDIC file into a Pandas data frame
Download KANJDIC from https://www.edrdg.org/wiki/index.php/KANJIDIC_Project.

In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

In [2]:
DATA_PATH = '/Users/glillacci/OneDrive - Tesco/Personal/Data/kanjidata'

## Parse xml file

In [3]:
with open(f'{DATA_PATH}/kanjidic2.xml', 'r') as file:
    xml_kanjidic = file.read()

In [4]:
soup = BeautifulSoup(xml_kanjidic, 'xml')

In [5]:
characters = soup.find_all('character')

## Example entry

In [6]:
k = characters[1664]

In [7]:
k

<character>
<literal>巣</literal>
<codepoint>
<cp_value cp_type="ucs">5de3</cp_value>
<cp_value cp_type="jis208">1-33-67</cp_value>
</codepoint>
<radical>
<rad_value rad_type="classical">75</rad_value>
<rad_value rad_type="nelson_c">3</rad_value>
</radical>
<misc>
<grade>4</grade>
<stroke_count>11</stroke_count>
<freq>1588</freq>
<jlpt>1</jlpt>
</misc>
<dic_number>
<dic_ref dr_type="nelson_c">141</dic_ref>
<dic_ref dr_type="nelson_n">2705</dic_ref>
<dic_ref dr_type="halpern_njecd">2295</dic_ref>
<dic_ref dr_type="halpern_kkd">2861</dic_ref>
<dic_ref dr_type="halpern_kkld">1477</dic_ref>
<dic_ref dr_type="halpern_kkld_2ed">1987</dic_ref>
<dic_ref dr_type="heisig">1927</dic_ref>
<dic_ref dr_type="heisig6">2077</dic_ref>
<dic_ref dr_type="gakken">1233</dic_ref>
<dic_ref dr_type="oneill_names">1431</dic_ref>
<dic_ref dr_type="oneill_kk">1491</dic_ref>
<dic_ref dr_type="moro" m_page="0340" m_vol="4">8696P</dic_ref>
<dic_ref dr_type="henshall">1521</dic_ref>
<dic_ref dr_type="sh_kk">1538</dic

## Some useful functions

In [8]:
def content_if_exists(tag):
    """Return content of tag if it exists."""
    if tag:
        return tag.string
    return None

In [9]:
def content_list(tag_list):
    """Return list of contents from list of tags."""
    if len(tag_list):
        return [t.string for t in tag_list]
    return None

## Parse the `dict` into a Pandas data frame

In [10]:
N_ROWS = len(characters)

COLS = ['kanji', 'jis208', 'jis212', 'jis213', 'ucs', 'radical', 'radical_name', 'stroke_count', 'grade',
        'frequency', 'jlpt_level', 'variant_of_jis', 'njecd_no', 'kkd_no', 'kkld_no', 'kkld2_no', 'skip_code',
        'onyomi', 'kunyomi', 'nanori', 'meanings']

In [11]:
pd_kanjidic = pd.DataFrame(index=range(N_ROWS), columns=COLS)


for i, k in enumerate(characters):
    # Entry character
    pd_kanjidic.at[i, 'kanji'] = k.literal.string
    
    # Character code points: JIS208, JIS212, JIS213 and Unicode
    for cp in k.codepoint('cp_value'):
        pd_kanjidic.at[i, cp['cp_type']] = cp.string
    
    # Radical and radical name
    pd_kanjidic.at[i, 'radical'] = k.find(rad_type='classical').contents[0]
    pd_kanjidic.at[i, 'radical_name'] = content_list(k.misc('rad_name'))
    
    # Miscellaneous info
    pd_kanjidic.at[i, 'stroke_count'] = k.misc.stroke_count.string
    pd_kanjidic.at[i, 'grade'] = content_if_exists(k.misc.grade)
    pd_kanjidic.at[i, 'frequency'] = content_if_exists(k.misc.frequency)
    pd_kanjidic.at[i, 'jlpt_level'] = content_if_exists(k.misc.jlpt)
    
    # Dictionary references
    pd_kanjidic.at[i, 'njecd_no'] = content_if_exists(k.find(dr_type='halpern_njecd'))
    pd_kanjidic.at[i, 'kkd_no'] = content_if_exists(k.find(dr_type='halpern_kkd'))
    pd_kanjidic.at[i, 'kkld_no'] = content_if_exists(k.find(dr_type='halpern_kkld'))
    pd_kanjidic.at[i, 'kkld2_no'] = content_if_exists(k.find(dr_type='halpern_kkld_2ed'))
    
    # SKIP code
    pd_kanjidic.at[i, 'skip_code'] = content_if_exists(k.find(qc_type='skip'))
    
    # Readings
    pd_kanjidic.at[i, 'onyomi'] = content_list(k.find_all(r_type='ja_on'))
    pd_kanjidic.at[i, 'kunyomi'] = content_list(k.find_all(r_type='ja_kun'))
    pd_kanjidic.at[i, 'nanori'] = content_list(k.find_all('nanori'))
    
    # Meanings
    pd_kanjidic.at[i, 'meanings'] = content_list(k.find_all('meaning', m_lang=''))


pd_kanjidic = pd_kanjidic.rename({
    'ucs': 'unicode'
}, axis=1)

pd_kanjidic['unicode'] = pd_kanjidic['unicode'].str.upper()
pd_kanjidic['radical'] = pd_kanjidic['radical'].astype(int)

## Post-process and add some extra columns

In [12]:
def post_process_stroke_count(sc_in):
    if type(sc_in) is list:
        return int(sc_in[0])
    return(int(sc_in))

In [13]:
def compute_jis_level(jis_cp):
    if not jis_cp:
        return 0
    if int(jis_cp[2:4]) <= 47:
        return 1
    return 2

In [14]:
def n_distinct_kunyomi(kunyomi_list):
    if kunyomi_list is None:
        return 0
    return len(set([k.split('.')[0].replace('-', '') for k in kunyomi_list]))

In [15]:
pd_kanjidic['stroke_count'] = pd_kanjidic['stroke_count'].apply(post_process_stroke_count)

In [16]:
pd_kanjidic['n_onyomi'] = pd_kanjidic['onyomi'].str.len().fillna(0).astype(int)
pd_kanjidic['n_nanori'] = pd_kanjidic['nanori'].str.len().fillna(0).astype(int)

In [17]:
pd_kanjidic['n_kunyomi'] = pd_kanjidic['kunyomi'].str.len().fillna(0).astype(int)
pd_kanjidic['n_kunyomi_distinct'] = pd_kanjidic['kunyomi'].apply(n_distinct_kunyomi)

In [18]:
pd_kanjidic['jis_level'] = pd_kanjidic['jis208'].fillna(0).apply(compute_jis_level)

In [19]:
pd_kanjidic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13108 entries, 0 to 13107
Data columns (total 26 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   kanji               13108 non-null  object
 1   jis208              6355 non-null   object
 2   jis212              5801 non-null   object
 3   jis213              3695 non-null   object
 4   unicode             13108 non-null  object
 5   radical             13108 non-null  int64 
 6   radical_name        108 non-null    object
 7   stroke_count        13108 non-null  int64 
 8   grade               2998 non-null   object
 9   frequency           0 non-null      object
 10  jlpt_level          2230 non-null   object
 11  variant_of_jis      0 non-null      object
 12  njecd_no            3002 non-null   object
 13  kkd_no              3809 non-null   object
 14  kkld_no             2230 non-null   object
 15  kkld2_no            2904 non-null   object
 16  skip_code           13

## Manual corrections

### Update the grades according to the latest specs
Sources:
- https://www.kanken.or.jp/kanken/outline/data/outline_degree_national_list20200217.pdf.
- https://en.wikipedia.org/wiki/Jinmeiy%C5%8D_kanji
- https://www.kanken.or.jp/kanken/outline/data/outline_degree_national_list20200217.pdf

In [20]:
pd_kanjidic.loc[pd_kanjidic['kanji'].isin(
    ['賀', '群', '城', '徳', '富']
), 'grade'] = '4'

pd_kanjidic.loc[pd_kanjidic['kanji'].isin(
    ['囲', '喜', '紀', '救', '型', '航', '告', '殺', '史', '士', '象', '賞', '貯',
     '停', '堂', '得', '毒', '費', '粉', '脈', '歴']
), 'grade'] = '5'

pd_kanjidic.loc[pd_kanjidic['kanji'].isin(
    ['胃', '恩', '券', '承', '舌', '銭', '退', '腸', '敵', '俵', '預']
), 'grade'] = '6'

pd_kanjidic.loc[pd_kanjidic['kanji'] == '渾', 'grade'] = '9'


Check that each grade has the correct number of characters.

In [21]:
assert sum(pd_kanjidic['grade'] == '1') == 80
assert sum(pd_kanjidic['grade'] == '2') == 160
assert sum(pd_kanjidic['grade'] == '3') == 200
assert sum(pd_kanjidic['grade'] == '4') == 202
assert sum(pd_kanjidic['grade'] == '5') == 193
assert sum(pd_kanjidic['grade'] == '6') == 191
assert sum(pd_kanjidic['grade'] == '8') == (2136 - 1026)
assert sum(pd_kanjidic['grade'] == '9') == 651
assert sum(pd_kanjidic['grade'] == '10') == 212

## Export basic Pandas data frame to parquet file

In [22]:
pd_kanjidic.sample(n=10)

Unnamed: 0,kanji,jis208,jis212,jis213,unicode,radical,radical_name,stroke_count,grade,frequency,...,skip_code,onyomi,kunyomi,nanori,meanings,n_onyomi,n_nanori,n_kunyomi,n_kunyomi_distinct,jis_level
9617,簌,,1-50-67,,7C0C,118,,17,,,...,2-6-11,[ソク],[ふる.い],,[(of flower petals) falling],1,0,1,1,0
3118,几,1-49-60,,,51E0,16,[つくえ],2,,,...,4-2-1,[キ],[きにょう],,"[table, table enclosure, table or windy radica...",1,0,1,1,2
1145,鴫,1-28-18,,,9D2B,196,,16,,,...,1-5-11,,[しぎ],,"[snipe, (kokuji)]",0,0,1,1,1
4944,絋,1-69-6,,,7D4B,120,,11,,,...,1-6-5,[コウ],,,[cotton wadding],1,0,0,0,2
3695,怏,1-55-73,,,600F,61,,8,,,...,1-3-5,"[オウ, ヨウ]",[うら.む],,"[dissatisfaction, grudge]",2,0,1,1,2
6732,匑,,1-20-2,,5311,20,,12,,,...,3-2-10,"[キュウ, ク, キク, コク]",[うやま.う],,,4,0,1,1,0
1706,俗,1-34-15,,,4FD7,9,,9,8.0,,...,1-2-7,[ゾク],,,"[vulgar, customs, manners, worldliness, mundan...",1,0,0,0,1
1762,大,1-34-71,,,5927,37,,3,1.0,,...,4-3-4,"[ダイ, タイ]","[おお-, おお.きい, -おお.いに]","[うふ, お, おう, た, たかし, とも, はじめ, ひろ, ひろし, まさ, まさる,...","[large, big]",2,13,3,1,1
9540,笔,,1-49-84,,7B14,118,,10,,,...,2-6-4,[ヒツ],[ふで],,"[writing brush, write, stroke]",1,0,1,1,0
6434,佟,,1-16-80,1-14-17,4F5F,9,,7,,,...,1-2-5,"[トウ, ズ]",,,[name],2,0,0,0,0


In [23]:
pd_kanjidic.to_parquet(f'{DATA_PATH}/kanjidic.parquet')

## Add more data to Kanjidic

### Add Kanken level to grade

#### Kanjidic grade key
- 1-6: usual Japanese school grades
- 8: other jouyou kanji taught in secondary school
- 9: jinmeiyou kanji not included in jouyou
- 10: jinmeiyou kanji that are variants of kanji included in jouyou
- 11: everything else (used to fill the missing values)

In [24]:
kanken = pd.read_csv(f'{DATA_PATH}/kanji_by_kanken_level.csv')
kanken.sample(n=5)

Unnamed: 0,kanji,kanken_level
1987,搏,1.0
406,刈,4.0
1125,嬖,1.0
4120,繹,1.0
4431,艾,1.0


In [25]:
pd_kanjidic = pd_kanjidic.merge(kanken, on='kanji', how='left')

## Create composite grade
A slightly more elaborate classification of kanji based on type, grade and Kanken levels.

In [26]:
def composite_grade(row):
    if row['grade'] in [1, 2, 3, 4, 5, 6]:
        mapping = {
            1: '１年生',
            2: '２年生',
            3: '３年生',
            4: '４年生',
            5: '５年生',
            6: '６年生'
        }
        return mapping[int(row['grade'])]
    elif row['grade'] == 8:
        mapping = {
            2.0: '常用ー漢検２級',
            2.5: '常用ー漢検準２級',
            3.0: '常用ー漢検３級',
            4.0: '常用ー漢検４級'
        }
        return mapping[row['kanken_level']]
    elif row['grade'] in [9, 10]:
        if row['jlpt_level'] == '1':
            return '人名用ー日本語能力試験１級'
        elif row['grade'] == 9:
            return '人名用ー常用以外'
        elif row['grade'] == 10:
            return '人名用ー常用の異体字'
    return '{grade}({kanken})'.format(grade=row['grade'], kanken=row['kanken_level'])

In [27]:
pd_kanjidic['grade'] = pd_kanjidic['grade'].fillna(11).astype(int)

pd_kanjidic['original_grade'] = pd_kanjidic['grade']
pd_kanjidic['grade'] = pd_kanjidic[['grade', 'kanken_level', 'jlpt_level']].apply(composite_grade, axis=1)

In [28]:
pd_kanjidic['grade'].value_counts()

11(nan)          6648
11(1.0)          3165
人名用ー常用以外          400
常用ー漢検準２級          328
常用ー漢検４級           313
11(1.5)           296
常用ー漢検３級           284
人名用ー日本語能力試験１級     251
人名用ー常用の異体字        212
４年生               202
３年生               200
５年生               193
６年生               191
常用ー漢検２級           185
２年生               160
１年生                80
Name: grade, dtype: int64

### Add Jouyou Kanji readings
We want to detect and mark the readings that are not included in the official list.

In [29]:
jouyou = pd.read_csv(f'{DATA_PATH}/Jouyou_Kanj_kanjidb.csv')

jouyou = (
    jouyou[['Kanji', 'Reading within Joyo']]
          .rename({'Kanji': 'kanji', 'Reading within Joyo': 'jouyou_readings'}, axis=1)
)

jouyou['jouyou_readings'] = jouyou['jouyou_readings'].apply(
    lambda x: x.replace('[', '').replace(']', '').replace('-', '.').split('、'))

jouyou.sample(frac=0.005)

Unnamed: 0,kanji,jouyou_readings
66,雲,"[ウン, くも]"
72,営,"[エイ, いとな.む]"
1726,描,"[ビョウ, えが.く, か.く]"
1414,調,"[チョウ, しら.べる, ととの.う, ととの.える]"
766,四,"[シ, よ, よ.つ, よっ.つ, よん]"
2024,絡,"[ラク, から.む, から.まる, から.める]"
1409,跳,"[チョウ, は.ねる, と.ぶ]"
1431,塚,[つか]
1308,滞,"[タイ, とどこお.る]"
945,書,"[ショ, か.く]"


In [30]:
def process_jouyou_readings(row):
    if type(row['jouyou_readings']) is list:
        if row['onyomi'] is not None:
            onyomi_jouyou = []
            onyomi_non_jouyou = []
            for on in row['onyomi']:
                if on in row['jouyou_readings']:
                    onyomi_jouyou.append(on)
                else:
                    onyomi_non_jouyou.append('[{}]'.format(on))
            row['onyomi'] = onyomi_jouyou + onyomi_non_jouyou
            row['n_onyomi_in_jouyou'] = len(onyomi_jouyou)
        
        if row['kunyomi'] is not None:
            kunyomi_jouyou = []
            kunyomi_non_jouyou = []
            for kun in row['kunyomi']:
                if kun in row['jouyou_readings']:
                    kunyomi_jouyou.append(kun)
                else:
                    kunyomi_non_jouyou.append('[{}]'.format(kun))
            row['kunyomi'] = kunyomi_jouyou + kunyomi_non_jouyou
            row['n_kunyomi_in_jouyou'] = len(kunyomi_jouyou)
    return row

In [31]:
pd_kanjidic = pd_kanjidic.merge(jouyou, on='kanji', how='left')

pd_kanjidic['n_onyomi_in_jouyou'] = 0
pd_kanjidic['n_kunyomi_in_jouyou'] = 0

pd_kanjidic = pd_kanjidic.apply(process_jouyou_readings, axis=1)

### Merge Halpern core meanings and radicals

In [32]:
core = pd.read_csv(f'{DATA_PATH}/njecd_c_meanings_radicals.csv')
core = core[['kanji', 'skip_code', 'radical', 'core_meaning_1', 'core_meaning_2', 'core_meaning_3']]

core = core.rename({
    'skip_code': 'skip_code_corrected',
    'radical': 'radical_halpern'
}, axis=1)

assert sum(core.loc[core['kanji'].duplicated(), 'kanji'].notnull()) == 0

core.sample(frac=0.001)

Unnamed: 0,kanji,skip_code_corrected,radical_halpern,core_meaning_1,core_meaning_2,core_meaning_3
3891,遂,3-3-9,162.1,accomplish,,
711,惱,1-3-9,61.1,suffer,,
827,,1-3-10,170.1,apart,,
3874,逮,3-3-8,162.1,catch a criminal,,


In [33]:
pd_kanjidic = pd_kanjidic.merge(core, on='kanji', how='left')

pd_kanjidic['kkd_no'] = pd_kanjidic['kkd_no'].fillna(0).astype(int)
pd_kanjidic['kkld_no'] = pd_kanjidic['kkld_no'].fillna(0).astype(int)

### Merge radical data from radical table

In [34]:
radicals = pd.read_csv(f'{DATA_PATH}/radical_table.csv')

radicals = radicals[['radical_no', 'stroke_count', 'radical', 'names_hiragana']]
radicals = radicals.rename({
    'radical_no': 'radical_halpern',
    'stroke_count': 'rad_stroke_count',
    'radical': 'radical_element',
    'names_hiragana': 'rad_names'
}, axis=1)

radicals.sample(frac=0.02)

Unnamed: 0,radical_halpern,rad_stroke_count,radical_element,rad_names
32,24.0,2,⼗,じゅう
163,111.0,5,⽮,や
61,46.0,3,⼭,やま
90,64.0,4,⼿,て
236,156.0,7,⾛,はしる
0,1.0,1,⼀,いち
299,196.0,11,⿃,とり・とり「へん・づくり」


In [35]:
pd_kanjidic = pd_kanjidic.merge(radicals, on='radical_halpern', how='left')

pd_kanjidic['non_rad_strokes'] = (pd_kanjidic['stroke_count'] - pd_kanjidic['rad_stroke_count']).fillna(0).astype(int)

## Export Kanjidic with additional data

In [36]:
pd_kanjidic.sample(n=10)

Unnamed: 0,kanji,jis208,jis212,jis213,unicode,radical,radical_name,stroke_count,grade,frequency,...,n_kunyomi_in_jouyou,skip_code_corrected,radical_halpern,core_meaning_1,core_meaning_2,core_meaning_3,rad_stroke_count,radical_element,rad_names,non_rad_strokes
12568,䋝,,,2-84-27,42DD,120,,12,11(nan),,...,0,,,,,,,,,0
6255,鵝,1-83-01,,,9D5D,196,,18,11(1.0),,...,0,,,,,,,,,0
3866,掫,1-57-56,,,63AB,64,,11,11(1.0),,...,0,,,,,,,,,0
200,沖,1-18-13,,,6C96,85,,7,４年生,,...,1,1-3-4,85.1,offing,,,3.0,⺡,さんずい,4
789,呼,1-24-38,,,547C,30,,8,６年生,,...,1,1-3-5,30.1,call,,,3.0,「⼝」,くちへん,5
12637,傣,,,2-01-72,50A3,9,,12,11(nan),,...,0,,,,,,,,,0
10992,迕,,1-65-32,2-89-82,8FD5,162,,7,11(nan),,...,0,,,,,,,,,0
2453,物,1-42-10,,,7269,93,,8,３年生,,...,1,1-4-4,93.1,thing,,,4.0,牜,うしへん,4
460,希,1-20-85,,,5E0C,50,,7,４年生,,...,0,2-2-5,50.0,rare,aspire,,3.0,⼱,はば,4
3966,杲,1-58-62,,,6772,75,,8,11(1.0),,...,0,,,,,,,,,0


In [37]:
pd_kanjidic.to_parquet(f'{DATA_PATH}/kanjidic_with_additional_data.parquet')

In [38]:
pd_kanjidic.loc[(pd_kanjidic['original_grade'] == 11) & (pd_kanjidic['core_meaning_1'].notnull())]

Unnamed: 0,kanji,jis208,jis212,jis213,unicode,radical,radical_name,stroke_count,grade,frequency,...,n_kunyomi_in_jouyou,skip_code_corrected,radical_halpern,core_meaning_1,core_meaning_2,core_meaning_3,rad_stroke_count,radical_element,rad_names,non_rad_strokes
11,穐,1-16-12,,,7A50,115,,16,11(1.0),,...,0,1-5-11,115.1,autumn,,,5.0,「禾」,のぎへん,11
26,飴,1-16-27,,,98F4,184,,13,11(1.5),,...,0,1-9-5,184.3,candy,,,9.0,⻞,しょくへん,4
112,嘘,1-17-19,,,5618,30,,14,11(1.5),,...,0,1-3-11,30.1,lie,,,3.0,「⼝」,くちへん,11
114,欝,1-17-21,,,6B1D,75,,26,11(1.0),,...,0,2-12-13,75.0,gloom,,,4.0,⽊,き,22
140,穎,1-17-47,,,7A4E,115,,16,11(1.5),,...,0,1-7-9,115.0,glume,talented,,5.0,⽲,のぎ,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13074,爫,,,2-80-09,FA49,87,,4,11(nan),,...,0,2-1-3,87.1,radical notsu,,,4.0,⺤,つめ「かんむり・がしら」,0
13094,艹,,,2-85-84,FA5D,140,,4,11(nan),,...,0,1-2-2,140.3,radical kusakanmuri,,,4.0,艹,くさかんむり,0
13097,褐,,,1-91-79,FA60,145,,14,11(nan),,...,0,1-5-9,145.1,brown,,,5.0,⻂,ころもへん,9
13103,辶,,,2-89-73,FA66,162,,3,11(nan),,...,0,2-1-2,162.1,radical shinnyō (or shinnyū),,,3.0,⻌,しんにょう・しんにゅう,0


In [39]:
pd_kanjidic.loc[pd_kanjidic['kanken_level'].notnull(), 'jis_level'].value_counts()

2    3390
1    2963
0     107
Name: jis_level, dtype: int64

In [40]:
3390+2962

6352

In [41]:
pd_kanjidic.loc[(pd_kanjidic['original_grade'] == 9), 'kanken_level'].value_counts()

1.5    584
1.0     67
Name: kanken_level, dtype: int64

In [42]:
pd_kanjidic.loc[pd_kanjidic['original_grade'].isin([9, 10]), 'jis_level'].count()

863