# Create kanji flashcards

In [1]:
import datetime
import hashlib
import numpy as np
import pandas as pd

In [2]:
# Scratch countdown: number of days from today until 2021-12-01, times 20.
# NOTE(review): the meaning of the factor 20 is not stated in this notebook —
# presumably a per-day quota; confirm. The result depends on the run date.
(datetime.date(2021, 12, 1) - datetime.date.today()).days * 20

2860

## Select flashcard set to produce based on grade

In [3]:
# Set number -> grade label. The label of the selected set is compared against
# the `grade` column of the kanji table to pick the flashcards to export.
GRADE_SETTINGS = dict([
    (1, '１年生'),
    (2, '２年生'),
    (3, '３年生'),
    (4, '４年生'),
    (5, '５年生'),
    (6, '６年生'),
    (8.1, '常用ー漢検４級'),
    (8.2, '常用ー漢検３級'),
    (8.3, '常用ー漢検準２級'),
    (8.4, '常用ー漢検２級'),
    (9.1, '人名用ー日本語能力試験１級'),
])

# Change this value to produce a different flashcard set; it must be one of
# the keys of GRADE_SETTINGS.
SET_NUMBER = 8.4

# Grade label of the selected set.
GRADE = GRADE_SETTINGS[SET_NUMBER]

## Load Kanjidic with additional data

In [4]:
# Load the enriched Kanjidic table, keep only the columns used by this
# notebook, and expose the corrected SKIP code under the shorter name.
kanji = (
    pd.read_parquet('./data/kanjidic_with_additional_data.parquet')
    .loc[:, [
        'kanji', 'stroke_count', 'grade', 'jlpt_level', 'onyomi', 'kunyomi',
        'nanori', 'n_onyomi', 'n_onyomi_in_jouyou', 'n_kunyomi', 'n_kunyomi_in_jouyou',
        'n_kunyomi_distinct', 'n_nanori', 'skip_code_corrected', 'kkld_no', 'kkd_no',
        'core_meaning_1', 'core_meaning_2', 'core_meaning_3',
        'radical_element', 'rad_names', 'non_rad_strokes', 'jis_level', 'unicode',
    ]]
    .rename(columns={'skip_code_corrected': 'skip_code'})
    # Missing values in these columns become 0 so they can be stored as ints.
    .fillna({'jlpt_level': 0, 'kkld_no': 0, 'kkd_no': 0})
    .astype({'jlpt_level': int, 'kkld_no': int, 'kkd_no': int})
)

# Peek at a random 0.1% of the rows (unseeded, so the rows differ per run).
kanji.sample(frac=0.001)

Unnamed: 0,kanji,stroke_count,grade,jlpt_level,onyomi,kunyomi,nanori,n_onyomi,n_onyomi_in_jouyou,n_kunyomi,...,kkld_no,kkd_no,core_meaning_1,core_meaning_2,core_meaning_3,radical_element,rad_names,non_rad_strokes,jis_level,unicode
11010,逩,11,11(nan),0,"[ホン, カム]",[はし.る],,2,0,1,...,0,0,,,,,,0,0,9029
4285,泪,8,11(1.0),0,"[ルイ, レイ]",[なみだ],,2,0,1,...,0,403,tear,,,⺡,さんずい,5,2,6CEA
7650,恱,9,11(nan),0,"[エツ, エチ]","[よろこ.ぶ, よろこ.ばす]","[のぶ, よし]",2,0,2,...,0,0,,,,,,0,0,6071
11034,邌,18,11(nan),0,"[レイ, ライ, チ, ジ]","[ね.る, おもむろ, おそい]",,4,0,3,...,0,0,,,,,,0,0,908C
5850,鈞,12,11(1.0),0,[キン],[ひと.しい],,1,0,1,...,0,0,,,,,,0,2,921E
6475,俉,9,11(nan),0,[ゴ],"[むか.える, あ.う]",,1,0,2,...,0,0,,,,,,0,0,4FC9
1757,隊,12,４年生,1,[タイ],,,1,1,0,...,452,762,party,,,⻖,こざとへん,9,1,968A
11855,魿,16,11(nan),0,"[レイ, リョウ, リン]","[う.ねる, うろこ]",,3,0,2,...,0,0,,,,,,0,0,9B7F
12415,𨫍,18,11(nan),0,,,,0,0,0,...,0,0,,,,,,0,0,28ACD
11152,釓,9,11(nan),0,"[ヒュウ, グ, キュウ, ワン]",[いしゆみ],,4,0,1,...,0,0,,,,,,0,0,91D3


## Create the desired set of flashcards

In [5]:
# Keep only the kanji whose grade label equals the selected set's label.
# The copy makes `subset` independent of `kanji` for the column edits below.
subset = kanji[kanji['grade'].eq(GRADE)].copy()

## Expand the readings

In [6]:
def expand_readings(x):
    """Collapse a collection of readings into a single display string.

    Readings that contain ``'['`` have their first and last characters
    stripped and are gathered into one trailing bracketed group; all other
    readings are listed first, in their original order.

    Parameters
    ----------
    x : sequence of str, str, or None
        Readings to format. A ``str`` is treated as an already formatted
        value and is returned unchanged, which makes the function safe to
        apply more than once to the same column.

    Returns
    -------
    str or None
        ``None`` when ``x`` is ``None`` or empty; otherwise a string such as
        ``'あ.てる, [-あて, -づつ]'``.
    """
    if x is None:
        return None

    # A string means the value was already expanded (e.g. the apply cell was
    # re-run). Iterating over it would split it into single characters and
    # mangle the result, so hand it back as-is (empty string -> None, as before).
    if isinstance(x, str):
        return x if x else None

    # len() rather than truthiness: x may be a NumPy array.
    if len(x) == 0:
        return None

    plain = []
    bracketed = []
    for reading in x:
        if '[' in reading:
            # Drop the surrounding brackets; they are re-added around the group.
            bracketed.append(reading[1:-1])
        else:
            plain.append(reading)

    if plain and bracketed:
        return f"{', '.join(plain)}, [{', '.join(bracketed)}]"
    if plain:
        return ', '.join(plain)
    return f"[{', '.join(bracketed)}]"

In [7]:
# Replace each reading collection with its formatted display string.
for reading_column in ('onyomi', 'kunyomi', 'nanori'):
    subset[reading_column] = subset[reading_column].apply(expand_readings)

In [8]:
# Inspect the selected kanji after the readings were expanded.
subset

Unnamed: 0,kanji,stroke_count,grade,jlpt_level,onyomi,kunyomi,nanori,n_onyomi,n_onyomi_in_jouyou,n_kunyomi,...,kkld_no,kkd_no,core_meaning_1,core_meaning_2,core_meaning_3,radical_element,rad_names,non_rad_strokes,jis_level,unicode
6,挨,10,常用ー漢検２級,0,アイ,[ひら.く],,1,1,1,...,0,510,push,,,⺘,てへん,7,1,6328
23,宛,8,常用ー漢検２級,0,[エン],"あ.てる, [-あて, -づつ, あたか.も]",,1,0,4,...,0,2762,address,,,⼧,うかんむり,5,1,5B9B
38,闇,17,常用ー漢検２級,0,"[アン, オン]","やみ, [くら.い]",,2,0,2,...,0,4134,dark,,,⾨,もん（がまえ）・かどがまえ,9,1,95C7
55,椅,12,常用ー漢検２級,0,イ,,,1,1,0,...,0,1239,chair,,,「木」,きへん,8,1,6905
57,畏,9,常用ー漢検２級,0,イ,"おそ.れる, [かしこま.る, かしこ, かしこ.し]",,1,1,4,...,0,3170,be overawed,,,⽥,た,4,1,754F
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5699,踪,15,常用ー漢検２級,0,"ソウ, [ショウ]",[あと],,2,1,1,...,0,1996,footprints,,,⻊,あしへん,8,2,8E2A
5759,辣,14,常用ー漢検２級,0,ラツ,[から.い],,1,1,1,...,0,1965,pungent,severe,,⾟,しん・からい,7,2,8FA3
5878,錮,16,常用ー漢検２級,0,コ,[ふさ.ぐ],,1,1,1,...,0,2182,imprison,,,「金」,かねへん,8,2,932E
7126,塡,13,常用ー漢検２級,0,"テン, [チン]","[はま.る, うず.める, は.める, ふさ.ぐ]",,2,1,4,...,0,777,fill,,,「土」,「つち・ど」へん,10,0,5861


## Add unique id

In [9]:
def create_kanji_id(kanji):
    """Return the SHA-1 hex digest of the UTF-8 encoding of ``kanji``."""
    digest = hashlib.sha1()
    digest.update(kanji.encode('utf-8'))
    return digest.hexdigest()

In [10]:
# Derive the id of each card from its kanji character.
subset['id'] = subset['kanji'].map(create_kanji_id)

## Export final results

In [11]:
# Put `id` first and keep the export columns in their final order;
# `jis_level` and `unicode` are not part of the export.
subset = subset.loc[:, [
    'id', 'kanji', 'stroke_count', 'grade', 'jlpt_level', 'onyomi', 'kunyomi',
    'nanori', 'n_onyomi', 'n_onyomi_in_jouyou', 'n_kunyomi', 'n_kunyomi_in_jouyou',
    'n_kunyomi_distinct', 'n_nanori', 'skip_code', 'kkld_no', 'kkd_no',
    'core_meaning_1', 'core_meaning_2', 'core_meaning_3',
    'radical_element', 'rad_names', 'non_rad_strokes',
]]

In [12]:
# Every card must have a distinct id.
assert subset['id'].is_unique

In [13]:
# Number of flashcards in the selected set.
len(subset)

185

In [14]:
# Spot-check ten random cards (no random_state is set, so rows differ per run).
subset.sample(n=10)

Unnamed: 0,id,kanji,stroke_count,grade,jlpt_level,onyomi,kunyomi,nanori,n_onyomi,n_onyomi_in_jouyou,...,n_nanori,skip_code,kkld_no,kkd_no,core_meaning_1,core_meaning_2,core_meaning_3,radical_element,rad_names,non_rad_strokes
1207,c270bd684832c0e9844ee597558fc372720b2c6a,腫,13,常用ー漢検２級,0,"シュ, [ショウ]","は.れる, は.らす, [は.れ, はれもの]",,2,1,...,0,1-4-9,0,1310,swelling,,,⺝,にくづき,9
2489,468972b61557e2eb4dcc98429482cc5e48eeb6fc,蔑,14,常用ー漢検２級,0,ベツ,"さげす.む, [ないがしろ, なみ.する, くらい]",,1,1,...,0,2-3-11,0,2969,despise,,,⺾,くさかんむり,11
782,39a177844226c13af1a307f423fd8ef654eab7e4,舷,11,常用ー漢検２級,0,ゲン,"[ふなばた, ふなべり]",,1,1,...,0,1-6-5,0,1698,gunwale,,,「⾈」,ふねへん,5
2348,89254e5b9f84d28a130ce33e11e3a089333e05e1,眉,9,常用ー漢検２級,1,"ビ, ミ",まゆ,,2,2,...,0,3-4-5,2050,3991,eyebrow,,,⽬,め,4
341,5e3ee7e4c87ee38df8b086b9eb094e85535138bb,顎,18,常用ー漢検２級,0,ガク,"あご, [あぎと]",,1,1,...,0,1-9-9,0,2276,jaw,,,⾴,おおがい・いちのかい,9
2789,8effdb4aa54198fff6cf94108cd2dd763092a6a1,沃,7,常用ー漢検２級,0,"ヨク, [ヨウ, オク]",[そそ.ぐ],,3,1,...,0,1-3-4,0,315,fertile,,,⺡,さんずい,4
2705,206518af031f85c7c21775b902d660252bc0c569,冶,7,常用ー漢検２級,1,ヤ,[い.る],じ,1,1,...,1,1-2-5,58,89,work metals,,,⼎,にすい,5
703,8aa50bc9ef758b5953e187e6072f5d437064a637,稽,15,常用ー漢検２級,0,ケイ,"[かんが.える, とど.める]",,1,1,...,0,1-5-10,0,1573,think,,,「禾」,のぎへん,10
2292,7820629b6acc9b6a9746ed74bd64c50f9cccd30b,氾,5,常用ー漢検２級,0,ハン,[ひろ.がる],,1,1,...,0,1-3-2,0,226,spread about,,,⺡,さんずい,2
1035,fe07b4546e7baedf59ed01a3b2dfb8a85231787a,拶,9,常用ー漢検２級,0,サツ,[せま.る],,1,1,...,0,1-3-6,0,451,press,,,⺘,てへん,6


In [15]:
# Write the cards without the index and without a header row; the file name
# carries the grade label of the selected set.
subset.to_csv(
    path_or_buf='./data/kanji_flashcards_{}.csv'.format(GRADE),
    header=False,
    index=False,
)

# Show the path that the export cell wrote to.
'./data/kanji_flashcards_{}.csv'.format(GRADE)

'./data/kanji_flashcards_常用ー漢検２級.csv'