# Create kanji flashcards

In [1]:
import datetime
import hashlib
import os

import numpy as np
import pandas as pd

In [2]:
# Scratch calculation: number of days from today until 2021-12-01, multiplied by 20.
# The value depends on the date the cell is run, so the stored output is not reproducible.
# NOTE(review): the meaning of the factor 20 is not stated anywhere in this notebook — confirm.
(datetime.date(2021, 12, 1) - datetime.date.today()).days * 20

2720

In [3]:
# Folder that holds the Kanjidic parquet input and receives the flashcard CSV.
# Set the KANJI_DATA_PATH environment variable to run the notebook on another
# machine; when it is unset, the original location is used unchanged.
DATA_PATH = os.environ.get(
    'KANJI_DATA_PATH',
    '/Users/gabriele/OneDrive - Tesco/Personal/Data/kanjidata',
)

## Select flashcard set to produce based on grade

In [4]:
# Available flashcard sets: set number -> label. The label of the chosen set is
# compared against the `grade` column of the kanji table further down.
GRADE_SETTINGS = {
    1: '１年生',
    2: '２年生',
    3: '３年生',
    4: '４年生',
    5: '５年生',
    6: '６年生',
    8.1: '常用ー漢検４級',
    8.2: '常用ー漢検３級',
    8.3: '常用ー漢検準２級',
    8.4: '常用ー漢検２級',
    9.1: '人名用ー日本語能力試験１級'
}

# Set to build on this run; must be one of the keys of GRADE_SETTINGS.
SET_NUMBER = 8.4

# Label of the selected set (the lookup raises KeyError for an unknown set number).
GRADE = GRADE_SETTINGS[SET_NUMBER]

## Load Kanjidic with additional data

In [5]:
# Read the enriched Kanjidic table, keep the columns of interest, expose the
# corrected SKIP code under the plain name `skip_code`, and integer-type the
# level / dictionary-number columns (missing values become 0).
kanji = (
    pd.read_parquet(f'{DATA_PATH}/kanjidic_with_additional_data.parquet')
    .loc[:, [
        'kanji', 'stroke_count', 'grade', 'jlpt_level', 'onyomi', 'kunyomi',
        'nanori', 'n_onyomi', 'n_onyomi_in_jouyou', 'n_kunyomi', 'n_kunyomi_in_jouyou',
        'n_kunyomi_distinct', 'n_nanori', 'skip_code_corrected', 'kkld_no', 'kkd_no',
        'core_meaning_1', 'core_meaning_2', 'core_meaning_3',
        'radical_element', 'rad_names', 'non_rad_strokes', 'jis_level', 'unicode',
    ]]
    .rename(columns={'skip_code_corrected': 'skip_code'})
    .assign(
        jlpt_level=lambda frame: frame['jlpt_level'].fillna(0).astype(int),
        kkld_no=lambda frame: frame['kkld_no'].fillna(0).astype(int),
        kkd_no=lambda frame: frame['kkd_no'].fillna(0).astype(int),
    )
)

# Peek at a random 0.1% of the rows (unseeded, so the rows differ between runs).
kanji.sample(frac=0.001)

Unnamed: 0,kanji,stroke_count,grade,jlpt_level,onyomi,kunyomi,nanori,n_onyomi,n_onyomi_in_jouyou,n_kunyomi,...,kkld_no,kkd_no,core_meaning_1,core_meaning_2,core_meaning_3,radical_element,rad_names,non_rad_strokes,jis_level,unicode
8015,斦,8,11(nan),0,"[ギン, ゴン, シツ, シチ]",[あき.らか],,4,0,1,...,0,0,,,,,,0,0,65A6
6376,乣,4,11(nan),0,[キュウ],,,1,0,0,...,0,0,,,,,,0,0,4E63
12244,𣏓,7,11(nan),0,,,,0,0,0,...,0,0,,,,,,0,0,233D3
10202,蓀,13,11(nan),0,[ソン],,,1,0,0,...,0,0,,,,,,0,0,84C0
11640,顥,21,11(nan),0,[コウ],[しろ.い],,1,0,1,...,0,0,,,,,,0,0,9865
1953,定,8,３年生,2,"[テイ, ジョウ]","[さだ.める, さだ.まる, さだ.か]",[さた],2,2,3,...,1420,2770,fix,,,⼧,うかんむり,5,1,5B9A
10001,臽,8,11(nan),0,"[カン, ゲン, コン]",[おとしあな],,3,0,1,...,0,0,,,,,,0,0,81FD
8378,櫖,19,11(nan),0,"[リョ, ロ]",[ふざ],,2,0,1,...,0,0,,,,,,0,0,6AD6
6785,叀,8,11(nan),0,[セン],"[つつし.む, か.ける]",,1,0,2,...,0,0,,,,,,0,0,53C0
4300,浚,10,11(1.0),0,[シュン],"[さら.える, さら.う]",,1,0,2,...,0,0,,,,,,0,2,6D5A


## Create the desired set of flashcards

In [6]:
# Rows whose grade label equals the selected set; copied so that the column
# assignments in later cells do not write into `kanji`.
subset = kanji[kanji['grade'].eq(GRADE)].copy()

## Expand the readings

In [7]:
def expand_readings(x):
    """Join a collection of readings into one display string.

    Readings that contain ``[`` are treated as bracketed entries: their first
    and last characters are dropped and they are gathered, in order, into a
    single trailing ``[...]`` group. All remaining readings come first, as-is.
    Everything is separated by ``', '``.

    Returns ``None`` when ``x`` is ``None`` or has no elements.
    """
    if x is None or not len(x):
        return None

    plain = [entry for entry in x if '[' not in entry]
    # Strip the surrounding brackets; the group gets one shared pair back below.
    bracketed = [entry[1:-1] for entry in x if '[' in entry]

    pieces = []
    if plain:
        pieces.append(', '.join(plain))
    if bracketed:
        pieces.append(f"[{', '.join(bracketed)}]")

    return ', '.join(pieces)

In [8]:
# Replace each reading collection with its flattened display string.
for reading_column in ('onyomi', 'kunyomi', 'nanori'):
    subset[reading_column] = subset[reading_column].apply(expand_readings)

In [9]:
# Display the selected kanji now that the reading columns hold flattened strings.
subset

Unnamed: 0,kanji,stroke_count,grade,jlpt_level,onyomi,kunyomi,nanori,n_onyomi,n_onyomi_in_jouyou,n_kunyomi,...,kkld_no,kkd_no,core_meaning_1,core_meaning_2,core_meaning_3,radical_element,rad_names,non_rad_strokes,jis_level,unicode
6,挨,10,常用ー漢検２級,0,アイ,[ひら.く],,1,1,1,...,0,510,push,,,⺘,てへん,7,1,6328
23,宛,8,常用ー漢検２級,0,[エン],"あ.てる, [-あて, -づつ, あたか.も]",,1,0,4,...,0,2762,address,,,⼧,うかんむり,5,1,5B9B
38,闇,17,常用ー漢検２級,0,"[アン, オン]","やみ, [くら.い]",,2,0,2,...,0,4134,dark,,,⾨,もん（がまえ）・かどがまえ,9,1,95C7
55,椅,12,常用ー漢検２級,0,イ,,,1,1,0,...,0,1239,chair,,,「木」,きへん,8,1,6905
57,畏,9,常用ー漢検２級,0,イ,"おそ.れる, [かしこま.る, かしこ, かしこ.し]",,1,1,4,...,0,3170,be overawed,,,⽥,た,4,1,754F
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5699,踪,15,常用ー漢検２級,0,"ソウ, [ショウ]",[あと],,2,1,1,...,0,1996,footprints,,,⻊,あしへん,8,2,8E2A
5759,辣,14,常用ー漢検２級,0,ラツ,[から.い],,1,1,1,...,0,1965,pungent,severe,,⾟,しん・からい,7,2,8FA3
5878,錮,16,常用ー漢検２級,0,コ,[ふさ.ぐ],,1,1,1,...,0,2182,imprison,,,「金」,かねへん,8,2,932E
7126,塡,13,常用ー漢検２級,0,"テン, [チン]","[はま.る, うず.める, は.める, ふさ.ぐ]",,2,1,4,...,0,777,fill,,,「土」,「つち・ど」へん,10,0,5861


## Add unique id

In [10]:
def create_kanji_id(kanji):
    return hashlib.sha1(kanji.encode('utf-8')).hexdigest()

In [11]:
# One id per row, derived from the kanji character itself.
subset['id'] = subset['kanji'].map(create_kanji_id)

## Export final results

In [12]:
# Final column layout of the flashcards: id first, then the kanji attributes.
# `jis_level` and `unicode` are left out of the export.
subset = subset.loc[:, [
    'id', 'kanji', 'stroke_count', 'grade', 'jlpt_level', 'onyomi', 'kunyomi',
    'nanori', 'n_onyomi', 'n_onyomi_in_jouyou', 'n_kunyomi', 'n_kunyomi_in_jouyou',
    'n_kunyomi_distinct', 'n_nanori', 'skip_code', 'kkld_no', 'kkd_no',
    'core_meaning_1', 'core_meaning_2', 'core_meaning_3',
    'radical_element', 'rad_names', 'non_rad_strokes',
]]

In [13]:
# Every flashcard must have its own id.
assert subset['id'].is_unique

In [14]:
# Number of flashcards in the selected set.
subset.shape[0]

185

In [15]:
# Spot-check ten randomly chosen flashcards before exporting.
# No random_state is passed, so a different selection is shown on every run.
subset.sample(n=10)

Unnamed: 0,id,kanji,stroke_count,grade,jlpt_level,onyomi,kunyomi,nanori,n_onyomi,n_onyomi_in_jouyou,...,n_nanori,skip_code,kkld_no,kkd_no,core_meaning_1,core_meaning_2,core_meaning_3,radical_element,rad_names,non_rad_strokes
2023,7a84fc225b6f4e77325e49d2f122927fce5dab91,賭,15,常用ー漢検２級,0,ト,"か.ける, [かけ]",,1,1,...,0,1-7-9,0,2021,wager,,,「貝」,かいへん,8
847,f2717ed4bf371959408d5fd0235d02dd989b9fca,喉,12,常用ー漢検２級,0,コウ,のど,,1,1,...,0,1-3-9,0,669,throat,,,「⼝」,くちへん,9
1629,084876f53d6ef629baa151c8a3176c8dddc5f307,狙,8,常用ー漢検２級,0,"ソ, [ショ]","ねら.う, [ねら.い]",,2,1,...,0,1-3-5,0,406,aim,,,⺨,けものへん,5
1035,fe07b4546e7baedf59ed01a3b2dfb8a85231787a,拶,9,常用ー漢検２級,0,サツ,[せま.る],,1,1,...,0,1-3-6,0,451,press,,,⺘,てへん,6
1971,e3d09e6db771fe73d8b43b81b50bda3af9718fa8,諦,16,常用ー漢検２級,0,"テイ, [タイ]","あきら.める, [つまびらか, まこと]",,2,1,...,0,1-7-9,0,2010,give up,,,「言」,ごんべん,9
873,4a2f2d900c50ad38dffc28860968eace086d2e37,梗,11,常用ー漢検２級,0,"コウ, [キョウ]","[ふさぐ, やまにれ, おおむね]",,2,1,...,0,1-4-7,0,1204,stop up,,,「木」,きへん,7
613,d4cb222f86c963ea2623b3d85f1fc11673c450f2,巾,3,常用ー漢検２級,0,"キン, [フク]","[おお.い, ちきり, きれ]",,2,1,...,0,4-3-3,0,4217,cloth,,,⼱,はば,0
782,39a177844226c13af1a307f423fd8ef654eab7e4,舷,11,常用ー漢検２級,0,ゲン,"[ふなばた, ふなべり]",,1,1,...,0,1-6-5,0,1698,gunwale,,,「⾈」,ふねへん,5
38,0e9f166c7e132f194a7fc4bd27e01274334d94b9,闇,17,常用ー漢検２級,0,"[アン, オン]","やみ, [くら.い]",,2,0,...,0,3-8-9,0,4134,dark,,,⾨,もん（がまえ）・かどがまえ,9
433,f7eec70f868b2c9e4822f503c2dcc2760e332c24,韓,18,常用ー漢検２級,0,カン,"[から, いげた]",,1,1,...,0,1-8-10,0,2218,South Korea,,,⾱,なめしがわ,8


In [16]:
# Build the destination once so the path shown below is the path written to.
flashcards_path = f'{DATA_PATH}/kanji_flashcards_{GRADE}.csv'

# The file contains data rows only: no index column and no header line.
subset.to_csv(flashcards_path, index=False, header=False)

flashcards_path

'/Users/gabriele/OneDrive - Tesco/Personal/Data/kanjidata/kanji_flashcards_常用ー漢検２級.csv'