# Create kanji flashcards

In [1]:
import datetime
import hashlib
import os

import numpy as np
import pandas as pd

In [2]:
# Scratch calculation: whole days from today until 2022-07-01, multiplied by 20.
# The value is only displayed; it is not assigned to any name.
# NOTE(review): the result depends on the date the cell is run, so the stored
# output is not reproducible — presumably a pace estimate; confirm or remove.
(datetime.date(2022, 7, 1) - datetime.date.today()).days * 20

6380

In [3]:
# Directory that holds the kanji data files; the exported CSV is written here too.
# Set the KANJI_DATA_PATH environment variable to point the notebook at another
# location; the original local folder remains the fallback.
DATA_PATH = os.environ.get(
    'KANJI_DATA_PATH',
    '/Users/glillacci/OneDrive - Tesco/Personal/Data/kanjidata',
)

## Select flashcard set to produce based on grade

In [4]:
# Maps a numeric set selector to the label that is compared against the `grade`
# column of the kanji table when the subset is built; the same label also ends
# up in the exported CSV file name.
# Keys 1-6 carry the "N年生" labels; the fractional keys group the remaining
# sets (8.x labels start with 常用, 9.1 with 人名用).
GRADE_SETTINGS = {
    1: '１年生',
    2: '２年生',
    3: '３年生',
    4: '４年生',
    5: '５年生',
    6: '６年生',
    8.1: '常用ー漢検４級',
    8.2: '常用ー漢検３級',
    8.3: '常用ー漢検準２級',
    8.4: '常用ー漢検２級',
    9.1: '人名用ー日本語能力試験１級'
}

# Selector of the flashcard set to build. It must be one of the GRADE_SETTINGS
# keys, otherwise the lookup below raises KeyError.
SET_NUMBER = 9.1

# Grade label of the selected set.
GRADE = GRADE_SETTINGS[SET_NUMBER]

## Load Kanjidic with additional data

In [5]:
# Load the Kanjidic table, keep only the columns needed downstream, expose
# `skip_code_corrected` under the shorter name `skip_code`, and replace missing
# values in the three numeric reference columns with 0 before casting to int.
kanji = (
    pd.read_parquet(f'{DATA_PATH}/kanjidic_with_additional_data.parquet')
    .loc[:, [
        'kanji', 'stroke_count', 'grade', 'jlpt_level',
        'onyomi', 'kunyomi', 'nanori',
        'n_onyomi', 'n_onyomi_in_jouyou',
        'n_kunyomi', 'n_kunyomi_in_jouyou', 'n_kunyomi_distinct',
        'n_nanori',
        'skip_code_corrected', 'kkld_no', 'kkd_no',
        'core_meaning_1', 'core_meaning_2', 'core_meaning_3',
        'radical_element', 'rad_names', 'non_rad_strokes',
        'jis_level', 'unicode',
    ]]
    .rename(columns={'skip_code_corrected': 'skip_code'})
    .assign(
        jlpt_level=lambda frame: frame['jlpt_level'].fillna(0).astype(int),
        kkld_no=lambda frame: frame['kkld_no'].fillna(0).astype(int),
        kkd_no=lambda frame: frame['kkd_no'].fillna(0).astype(int),
    )
)

# Show a random handful of rows (no seed is set, so the rows differ per run).
kanji.sample(n=10)

Unnamed: 0,kanji,stroke_count,grade,jlpt_level,onyomi,kunyomi,nanori,n_onyomi,n_onyomi_in_jouyou,n_kunyomi,...,kkld_no,kkd_no,core_meaning_1,core_meaning_2,core_meaning_3,radical_element,rad_names,non_rad_strokes,jis_level,unicode
12284,𤎼,16,11(nan),0,,,,0,0,0,...,0,0,,,,,,0,0,243BC
9247,眯,11,11(nan),0,"[ベイ, マイ, ビ, ミ]",[くる.む],,4,0,1,...,0,0,,,,,,0,0,772F
2473,幣,15,常用ー漢検準２級,1,[ヘイ],[[ぬさ]],[しで],1,1,1,...,1844,3582,currency,,,⼱,はば,12,1,5E63
10243,蕐,15,11(nan),0,"[カ, ケ]",[はな],,2,0,1,...,0,0,,,,,,0,0,8550
1969,艇,13,常用ー漢検準２級,1,[テイ],,,1,1,0,...,923,1727,boat,,,「⾈」,ふねへん,7,1,8247
7490,帔,8,11(nan),0,[ヒ],[ふくろ],,1,0,1,...,0,0,,,,,,0,0,5E14
5494,襁,16,11(1.0),0,[キョウ],[むつき],,1,0,1,...,0,0,,,,,,0,2,8941
3632,廬,19,11(1.0),0,"[ロ, リョ]","[いお, いおり, いえ]",,2,0,3,...,0,0,,,,,,0,2,5EEC
9147,瘀,13,11(nan),0,"[ヨ, オ]",[やまい],,2,0,1,...,0,0,,,,,,0,0,7600
8266,椸,13,11(nan),0,[イ],[ころもかけ],,1,0,1,...,0,0,,,,,,0,0,6938


In [6]:
# Display the column names of the loaded table.
kanji.columns

Index(['kanji', 'stroke_count', 'grade', 'jlpt_level', 'onyomi', 'kunyomi',
       'nanori', 'n_onyomi', 'n_onyomi_in_jouyou', 'n_kunyomi',
       'n_kunyomi_in_jouyou', 'n_kunyomi_distinct', 'n_nanori', 'skip_code',
       'kkld_no', 'kkd_no', 'core_meaning_1', 'core_meaning_2',
       'core_meaning_3', 'radical_element', 'rad_names', 'non_rad_strokes',
       'jis_level', 'unicode'],
      dtype='object')

## Create the desired set of flashcards

In [7]:
# Restrict the table to the rows whose grade label equals GRADE; .copy() gives
# an independent frame so later column edits leave `kanji` untouched.
subset = kanji[kanji['grade'] == GRADE].copy()

## Expand the readings

In [8]:
def expand_readings(x):
    """Format the readings of one kanji as a single comma-separated string.

    Readings that contain ``[`` are treated as bracketed (non-jouyou) entries:
    their first and last characters are stripped and they are gathered, after
    the unbracketed readings, inside one trailing ``[...]`` group.

    Parameters
    ----------
    x : sequence of str, str, or None
        The readings to format. A non-empty string is taken to be an
        already-formatted result and is returned unchanged, so applying the
        function a second time to a converted column does not corrupt it.

    Returns
    -------
    str or None
        For example ``'ヘイ, [ぬさ, しで]'``. ``None`` when `x` is ``None`` or
        empty.
    """
    if x is None:
        return None

    # Iterating a str would split it into single characters; pass it through
    # instead (an empty string still maps to None, like any empty input).
    if isinstance(x, str):
        return x or None

    # len() rather than truthiness: the input may be a multi-element array.
    if len(x) == 0:
        return None

    jouyou = []
    non_jouyou = []
    for reading in x:
        if '[' in reading:
            # Drop the surrounding brackets; they are re-added once for the group.
            non_jouyou.append(reading[1:-1])
        else:
            jouyou.append(reading)

    parts = []
    if jouyou:
        parts.append(', '.join(jouyou))
    if non_jouyou:
        parts.append(f"[{', '.join(non_jouyou)}]")

    return ', '.join(parts)

In [9]:
# Collapse each list of readings into its display string, one column at a time.
# The columns are overwritten in place, so run this once per fresh `subset`.
for reading_column in ('onyomi', 'kunyomi', 'nanori'):
    subset[reading_column] = subset[reading_column].apply(expand_readings)

In [10]:
# Display the subset to inspect the converted reading columns.
subset

Unnamed: 0,kanji,stroke_count,grade,jlpt_level,onyomi,kunyomi,nanori,n_onyomi,n_onyomi_in_jouyou,n_kunyomi,...,kkld_no,kkd_no,core_meaning_1,core_meaning_2,core_meaning_3,radical_element,rad_names,non_rad_strokes,jis_level,unicode
3,阿,8,人名用ー日本語能力試験１級,1,"ア, オ","おもね.る, くま","ほとり, あず, あわ, おか, きた, な",2,0,2,...,256,408,phonetic [a],,,⻖,こざとへん,5,1,963F
9,葵,12,人名用ー日本語能力試験１級,1,キ,あおい,"まもる, け",1,0,1,...,1493,2906,mallow,,,⺾,くさかんむり,9,1,8475
10,茜,9,人名用ー日本語能力試験１級,1,セン,あかね,,1,0,1,...,1448,2811,madder,,,⺾,くさかんむり,6,1,831C
14,渥,12,人名用ー日本語能力試験１級,1,アク,"あつ.い, うるお.う",あつし,1,0,2,...,435,731,gracious,,,⺡,さんずい,9,1,6E25
15,旭,6,人名用ー日本語能力試験１級,1,キョク,あさひ,"あきら, あき, てる, ひ",1,0,1,...,1890,3697,rising sun,,,⽇,ひ・にち,2,1,65ED
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6055,頌,13,人名用ー日本語能力試験１級,1,"ショウ, ジュ, ヨウ","かたち, たた.える, ほめ.る","つぐ, のぶ",3,0,3,...,715,1319,eulogize,,,⾴,おおがい・いちのかい,4,2,980C
6071,颯,14,人名用ー日本語能力試験１級,1,"サツ, ソウ",さっ.と,,2,0,1,...,825,1563,sound of gusting wind,,,⾵,かぜ,5,2,98AF
6307,黎,15,人名用ー日本語能力試験１級,1,"レイ, リ",くろ.い,れ,2,0,1,...,1799,3487,black,,,⿉,きび,3,2,9ECE
6353,凜,15,人名用ー日本語能力試験１級,1,リン,きびし.い,,1,0,1,...,126,195,severely cold,,,⼎,にすい,13,2,51DC


## Add unique id

In [11]:
def create_kanji_id(kanji):
    """Return the SHA-1 hex digest of `kanji` encoded as UTF-8.

    The same input string always yields the same 40-character hexadecimal id.
    """
    digest = hashlib.sha1()
    digest.update(kanji.encode('utf-8'))
    return digest.hexdigest()

In [12]:
# Derive each card's id from its kanji character.
subset['id'] = subset['kanji'].map(create_kanji_id)

## Export final results

In [13]:
# Final column layout for the export: `id` comes first, and `jis_level` and
# `unicode` are left out.
subset = subset.loc[:, [
    'id', 'kanji', 'stroke_count', 'grade', 'jlpt_level',
    'onyomi', 'kunyomi', 'nanori',
    'n_onyomi', 'n_onyomi_in_jouyou',
    'n_kunyomi', 'n_kunyomi_in_jouyou', 'n_kunyomi_distinct',
    'n_nanori',
    'skip_code', 'kkld_no', 'kkd_no',
    'core_meaning_1', 'core_meaning_2', 'core_meaning_3',
    'radical_element', 'rad_names', 'non_rad_strokes',
]]

In [14]:
# Every card needs a distinct id; stop before exporting if two rows collide.
assert subset['id'].is_unique, 'duplicate flashcard ids found in subset'

In [15]:
# Number of rows (flashcards) in the selected set.
len(subset)

251

In [17]:
# Spot-check a random sample of the final cards (no seed, so rows vary per run).
subset.sample(n=10)

Unnamed: 0,id,kanji,stroke_count,grade,jlpt_level,onyomi,kunyomi,nanori,n_onyomi,n_onyomi_in_jouyou,...,n_nanori,skip_code,kkld_no,kkd_no,core_meaning_1,core_meaning_2,core_meaning_3,radical_element,rad_names,non_rad_strokes
1284,55e93b8aa7254074ad8b73476e720c8bfc849a3a,淳,11,人名用ー日本語能力試験１級,1,"ジュン, シュン",あつ.い,"あつ, あつし, きよ, きよし, まこと, すなお",2,0,...,6,1-3-8,378,626,purehearted,,,⺡,さんずい,8
180,071ddf58a8aa38cfcdd22c4360ca2b4f93c8dc02,於,8,人名用ー日本語能力試験１級,1,"オ, ヨ","おい.て, お.ける, ああ, より",,2,0,...,0,1-4-4,571,1059,at,,,「方」,「ほう・かた」へん,4
868,cb2747af180c20566c7b0e28cf16afe6df212c30,昂,8,人名用ー日本語能力試験１級,1,"コウ, ゴウ","あ.がる, たか.い, たか.ぶる","あき, あきら, たか, たかし, のぼる",2,0,...,5,2-4-4,1562,3065,high,,,⽇,ひ・にち,4
83,d18f0d32937e6118c1ed92b734fdda07edad31c8,允,4,人名用ー日本語能力試験１級,1,イン,"じょう, まこと.に, ゆるす","まこと, のぶ, まさ, みつ, すけ, よし, ちか, とも",1,0,...,8,2-2-2,1252,2476,give consent,,,⼉,にんにょう・ひとあし,2
1871,3e2d3914a7d30479050e2da9d8bf0bb3070024d1,猪,11,人名用ー日本語能力試験１級,1,チョ,"い, いのしし",いの,1,0,...,1,1-3-8,392,652,wild boar,,,⺨,けものへん,8
2331,38497fe8b31490014540f48cc1478ec944d9c3f6,緋,14,人名用ー日本語能力試験１級,1,ヒ,"あけ, あか",,1,0,...,0,1-6-8,925,1732,scarlet,,,「糸」,いとへん,8
2236,6fd777e1bf155d4b9e8b4325ee4ead57c9c3a986,萩,12,人名用ー日本語能力試験１級,1,シュウ,はぎ,は,1,0,...,1,2-3-9,1495,2908,hagi,,,⺾,くさかんむり,9
1186,dc2b0c9c59ec6f268ade294e9700e16063c82155,勺,3,人名用ー日本語能力試験１級,1,シャク,,,1,0,...,0,3-2-1,1863,3650,shaku,,,⼓,「つつみ・く・ほう」がまえ,1
2282,18ac806e759d109dc6b1ae3f33107424c8ad2a08,隼,10,人名用ー日本語能力試験１級,1,"シュン, ジュン",はやぶさ,はや,2,0,...,1,2-8-2,1766,3427,falcon,,,⾫,ふるとり,2
979,a9174a11a2935f6ecd8152a6c4e72b644e60f4d9,哉,9,人名用ー日本語能力試験１級,1,サイ,"かな, や","か, すけ, とし, ちか, はじめ",1,0,...,5,3-6-3,2071,4084,exclamatory particle,,,⼝,くち,6


In [None]:
# Build the destination once so the file written and the path echoed below
# cannot diverge.
output_path = f'{DATA_PATH}/kanji_flashcards_{GRADE}.csv'

# Write the cards without the index and without a header row.
subset.to_csv(output_path, index=False, header=False)

output_path