# Find kanji in White Rabbit Press flashcards

In [1]:
import numpy as np
import pandas as pd
import re

# Raise the display limit so frames of up to 500 rows are shown without truncation.
pd.set_option('display.max_rows', 500)

## Load data

### Digitized White Rabbit Press (old edition) index

In [2]:
# Read the digitized card index and cast the card-number column to integers in one step.
df_wro = pd.read_csv('./data/white_rabbit_old.csv').astype({'wr_old': int})

### Kanji by new JLPT level

In [3]:
# Load the kanji-by-new-JLPT-level table; it is joined to the other frames on 'kanji' later.
df_jlpt = pd.read_csv('./data/kanji_by_new_jlpt_level.csv')

### Kanjidic with additional data

In [4]:
# Load the enriched Kanjidic table and keep only the columns used in this notebook.
kanji_columns = ['kanji', 'grade', 'jlpt_level', 'stroke_count', 'onyomi', 'kunyomi', 'nanori',
                 'core_meaning_1', 'core_meaning_2', 'core_meaning_3', 'original_grade']

# .copy() detaches the column subset from the loaded frame, so assigning to its
# columns afterwards does not raise SettingWithCopyWarning.
kanji = pd.read_parquet('./data/kanjidic_with_additional_data.parquet')[kanji_columns].copy()

In [5]:
# Missing JLPT levels become 0 and missing original grades become 11; both
# columns are then stored as integers.
kanji = (
    kanji
    .fillna({'jlpt_level': 0, 'original_grade': 11})
    .astype({'jlpt_level': int, 'original_grade': int})
)

In [6]:
kanji

Unnamed: 0,kanji,grade,jlpt_level,stroke_count,onyomi,kunyomi,nanori,core_meaning_1,core_meaning_2,core_meaning_3,original_grade
0,亜,常用ー漢検準２級,1,7,[ア],[[つ.ぐ]],"[や, つぎ, つぐ]",sub-,phonetic [a],,8
1,唖,11.0(1.5),0,10,"[ア, アク]",[おし],,,,,11
2,娃,9.0(1.5),0,9,"[ア, アイ, ワ]",[うつく.しい],[い],beautiful,,,9
3,阿,9.0(1.5),1,8,"[ア, オ]","[おもね.る, くま]","[ほとり, あず, あわ, おか, きた, な]",phonetic [a],,,9
4,哀,常用ー漢検３級,1,9,[アイ],"[あわ.れ, あわ.れむ, [かな.しい]]",,sorrow,pity,,8
...,...,...,...,...,...,...,...,...,...,...,...
13103,辶,11.0(nan),0,3,,[しんにょう],,radical shinnyō (or shinnyū),,,11
13104,逸,10.0(nan),0,12,,[しんにょう],,let slip,,,10
13105,難,10.0(nan),0,19,[ナン],"[かたい, むずかしい]",,difficult,disaster,,10
13106,響,10.0(nan),0,22,[キョウ],[ひびく],,reverberate,,,10


### JMdict

In [7]:
# Load JMdict and keep only the first sense of every entry, with the columns
# needed to build example compounds.
jmdict = pd.read_parquet('./data/jmdict.parquet')
jmdict = jmdict.loc[jmdict['sense_no'] == 1,
                    ['main_spelling', 'main_reading', 'meaning', 'part_of_speech', 'remarks']]

# Preview a small random sample; the fixed seed keeps the displayed rows the
# same on every Restart & Run All.
jmdict.sample(frac=0.001, random_state=0)

Unnamed: 0,main_spelling,main_reading,meaning,part_of_speech,remarks
157124,愛敬付く,あいぎょうづく,"to have a charming face, voice, personality, e...",Godan verb with `ku' ending,
106078,研修旅行,けんしゅうりょこう,study tour,noun (common) (futsuumeishi),
65552,逆作用,ぎゃくさよう,"reaction, adverse effect",noun (common) (futsuumeishi),
125032,割付け配列,わりつけはいれつ,allocatable array,noun (common) (futsuumeishi),
75127,労働協約,ろうどうきょうやく,"labor agreement, labour agreement",noun (common) (futsuumeishi),
102537,同法,どうほう,"same law, same method",noun (common) (futsuumeishi),
47051,夫婦連れ,ふうふづれ,husband and wife travelling together (traveling),noun (common) (futsuumeishi),
27654,細菌,さいきん,"bacterium, bacteria, germ",noun (common) (futsuumeishi),
127662,著作権図書館,ちょさくけんとしょかん,copyright library,noun (common) (futsuumeishi),
129715,ホモ達,ホモだち,homosexual friend,noun (common) (futsuumeishi),"slang, jocular, humorous term"


### Word frequencies from Wikipedia (2015)

In [8]:
# The frequency list is a headerless TSV, so the column names are supplied here.
freqs = pd.read_csv(
    './data/wikipedia-20150422-lemmas.tsv',
    sep='\t',
    header=None,
    names=['rank', 'frequency', 'main_spelling'],
)
freqs.head()

Unnamed: 0,rank,frequency,main_spelling
0,1,34943515,の
1,2,19609102,に
2,3,17575121,する
3,4,16805949,は
4,5,16443314,を


## Data checks on `df_wro`

### Check for duplicates

In [9]:
# List every row whose kanji already appeared earlier in the index; an empty result means no duplicates.
df_wro[df_wro.duplicated(subset='kanji')]

Unnamed: 0,kanji,wr_old,strokes


### Detect missing numbers

In [10]:
# Print every number from 0 up to the highest card number that has no entry in
# the index. A set gives constant-time membership tests instead of scanning the
# whole column on every iteration.
present_numbers = set(df_wro['wr_old'])
for i in range(max(present_numbers) + 1):
    if i not in present_numbers:
        print(i)

0


## Find kanji to study for the JLPT N1
I will assume that one needs all the current jouyou kanji (2136) plus the jinmeiyou kanji that were needed for the old level 1 (251).

In [11]:
# Number of kanji whose original grade is one of the grades selected as jouyou.
int(kanji['original_grade'].isin([1, 2, 3, 4, 5, 6, 8]).sum())

2136

In [12]:
# Kanji that have an old JLPT level (1-4) but fall outside the grades counted above.
outside_selected_grades = ~kanji['original_grade'].isin([1, 2, 3, 4, 5, 6, 8])
int((kanji['jlpt_level'].isin([1, 2, 3, 4]) & outside_selected_grades).sum())

251

In [13]:
# Build the study set as the union of the two groups counted above: the
# selected grades, plus the kanji with an old JLPT level outside those grades.
in_selected_grades = kanji['original_grade'].isin([1, 2, 3, 4, 5, 6, 8])
has_old_jlpt_level = kanji['jlpt_level'].isin([1, 2, 3, 4])

selection = in_selected_grades | (has_old_jlpt_level & ~in_selected_grades)

int(selection.sum())

2387

In [14]:
# Keep only the kanji flagged by the boolean mask built above.
kanji = kanji[selection]

In [15]:
kanji['grade'].value_counts()

常用ー漢検準２級    328
常用ー漢検４級     313
常用ー漢検３級     284
9.0(1.5)    206
４年生         202
３年生         200
５年生         193
６年生         191
常用ー漢検２級     185
２年生         160
１年生          80
9.0(1.0)     40
9.0(nan)      5
Name: grade, dtype: int64

## Join the lists to obtain a cross-reference dataframe
The following df can be used as a cross-reference between Japanese school grade and the old White Rabbit Press Kanji flashcards.

In [16]:
# Left joins keep every selected kanji even when it has no row in the JLPT
# list or in the White Rabbit index.
df = (
    kanji
    .merge(df_jlpt, on='kanji', how='left')
    .merge(df_wro, on='kanji', how='left')
)

In [17]:
# A card number of 0 marks a kanji with no flashcard; a missing stroke count
# from the index falls back to the Kanjidic value.
df = df.assign(
    wr_old=df['wr_old'].fillna(0).astype(int),
    strokes=df['strokes'].fillna(df['stroke_count']).astype(int),
)

## Detect missing kanji in the WRP flashcards

In [18]:
# How many selected kanji have no flashcard (card number 0).
int(df['wr_old'].eq(0).sum())

461

In [19]:
# Break the kanji without a flashcard down by grade.
has_no_card = df['wr_old'].eq(0)
df.loc[has_no_card, 'grade'].value_counts()

9.0(1.5)    206
常用ー漢検２級     185
9.0(1.0)     40
４年生          11
常用ー漢検準２級     10
9.0(nan)      5
常用ー漢検４級       3
常用ー漢検３級       1
Name: grade, dtype: int64

<hr>

## Find WRP flashcard by grade

In [22]:
# Each output file name maps to the grades it covers. Change `filename` to
# produce a different batch instead of commenting code in and out.
grade_presets = {
    'words_4nensei': ['４年生'],
    'words_8_123': ['常用ー漢検４級', '常用ー漢検３級', '常用ー漢検準２級'],
    'words_8_4': ['常用ー漢検２級'],
}

filename = 'words_8_4'
key = df.loc[df['grade'].isin(grade_presets[filename])].sort_values('wr_old')

key

Unnamed: 0,kanji,grade,jlpt_level,stroke_count,onyomi,kunyomi,nanori,core_meaning_1,core_meaning_2,core_meaning_3,original_grade,new_jlpt_level,wr_old,strokes
4,挨,常用ー漢検２級,0,10,[アイ],[[ひら.く]],,push,,,8,,0,10
1757,剥,常用ー漢検２級,0,10,"[ハク, [ホク]]","[は.がれる, は.ぐ, は.げる, は.がす, [へ.ぐ], [へず.る], [む.く],...",,peel off,,,8,,0,10
1770,箸,常用ー漢検２級,0,15,"[[チョ], [チャク]]",[はし],,chopsticks,,,8,,0,15
1790,斑,常用ー漢検２級,0,12,[ハン],"[[ふ], [まだら]]",[い],spot,,,8,,0,12
1792,氾,常用ー漢検２級,0,5,[ハン],[[ひろ.がる]],,spread about,,,8,,0,5
1793,汎,常用ー漢検２級,0,6,"[ハン, [ブ], [フウ], [ホウ], [ホン]]","[[ただよ.う], [ひろ.い]]","[ひろ, ひろし, みな]",overall,,,8,,0,6
1836,眉,常用ー漢検２級,1,9,"[ビ, ミ]",[まゆ],,eyebrow,,,8,N1,0,9
1842,膝,常用ー漢検２級,0,15,[[シツ]],[ひざ],,knee,,,8,,0,15
1843,肘,常用ー漢検２級,0,7,[[チュウ]],[ひじ],,elbow,,,8,,0,7
1932,蔽,常用ー漢検２級,0,15,"[ヘイ, [ヘツ], [フツ]]","[[おお.う], [おお.い]]",,cover,,,8,,0,15


In [23]:
len(key)

185

## Produce data file for missing cards to supplement

### Initial set to supplement

In [24]:
# Kanji of the chosen grade(s) that have no flashcard; copied so later column
# assignments do not touch `key`.
supplement = key[key['wr_old'].eq(0)].copy()

In [25]:
# Show the kanji that need a supplementary card.
supplement['kanji'].tolist()

['挨',
 '剥',
 '箸',
 '斑',
 '氾',
 '汎',
 '眉',
 '膝',
 '肘',
 '蔽',
 '蔑',
 '蜂',
 '貌',
 '睦',
 '勃',
 '昧',
 '枕',
 '蜜',
 '冥',
 '麺',
 '罵',
 '餅',
 '捻',
 '匂',
 '綻',
 '酎',
 '捗',
 '椎',
 '潰',
 '爪',
 '鶴',
 '諦',
 '溺',
 '貼',
 '妬',
 '賭',
 '藤',
 '憧',
 '瞳',
 '頓',
 '那',
 '謎',
 '鍋',
 '虹',
 '冶',
 '弥',
 '湧',
 '慄',
 '憬',
 '拉',
 '摯',
 '曖',
 '楷',
 '鬱',
 '璧',
 '瘍',
 '箋',
 '籠',
 '緻',
 '羞',
 '訃',
 '諧',
 '貪',
 '踪',
 '辣',
 '錮',
 '惧',
 '恣',
 '彙',
 '毀',
 '妖',
 '沃',
 '嵐',
 '藍',
 '璃',
 '侶',
 '瞭',
 '瑠',
 '呂',
 '旦',
 '賂',
 '麓',
 '脇',
 '丼',
 '傲',
 '刹',
 '哺',
 '喩',
 '嗅',
 '嘲',
 '弄',
 '塡',
 '誰',
 '堆',
 '釜',
 '鎌',
 '瓦',
 '韓',
 '玩',
 '伎',
 '畿',
 '亀',
 '僅',
 '巾',
 '錦',
 '駒',
 '串',
 '窟',
 '稽',
 '詣',
 '隙',
 '桁',
 '拳',
 '葛',
 '鍵',
 '顎',
 '骸',
 '宛',
 '闇',
 '椅',
 '畏',
 '萎',
 '咽',
 '淫',
 '臼',
 '唄',
 '餌',
 '怨',
 '艶',
 '旺',
 '臆',
 '俺',
 '苛',
 '牙',
 '崖',
 '蓋',
 '柿',
 '舷',
 '股',
 '虎',
 '凄',
 '醒',
 '戚',
 '脊',
 '煎',
 '羨',
 '腺',
 '詮',
 '膳',
 '曽',
 '狙',
 '遡',
 '爽',
 '痩',
 '捉',
 '袖',
 '遜',
 '汰',
 '唾',
 '裾',
 '須',
 '腎',
 '芯',
 '乞',
 '勾',
 '喉',
 '梗'

In [27]:
# supplement[['kanji', 'grade']].sort_values('grade')

### Process the readings
This ensures that they look nice, i.e. "approved, [unapproved]".

In [28]:
def expand_readings(x):
    """Format a collection of readings as a single display string.

    Readings containing a square bracket are treated as non-jouyou: their
    first and last characters (the brackets) are stripped and they are grouped
    at the end inside one pair of brackets, e.g.
    ``['ハク', '[ホク]']`` -> ``'ハク, [ホク]'``.

    Parameters
    ----------
    x : iterable of str, str, None or NaN
        The readings of one kanji. ``None`` and float NaN are treated as
        missing. A string is assumed to be already formatted.

    Returns
    -------
    str or None
        The formatted readings, or ``None`` when there are no readings.
    """
    # Missing values may arrive as None or as a float NaN (NaN != NaN).
    if x is None or (isinstance(x, float) and x != x):
        return None

    # An already-formatted string is passed through unchanged, so re-running
    # the cell that applies this function does not split it into characters.
    if isinstance(x, str):
        return x or None

    jouyou = []
    non_jouyou = []
    for reading in x:
        if '[' in reading:
            # Drop the surrounding brackets; they are re-added once for the group.
            non_jouyou.append(reading[1:-1])
        else:
            jouyou.append(reading)

    if jouyou and non_jouyou:
        return f"{', '.join(jouyou)}, [{', '.join(non_jouyou)}]"
    if jouyou:
        return ', '.join(jouyou)
    if non_jouyou:
        return f"[{', '.join(non_jouyou)}]"

    # Empty input: no readings at all.
    return None

# Format each of the three reading columns in turn.
for reading_column in ('onyomi', 'kunyomi', 'nanori'):
    supplement[reading_column] = supplement[reading_column].apply(expand_readings)

### Merge the core meanings into one string

In [29]:
meaning_columns = [f'core_meaning_{i}' for i in [1, 2, 3]]

# Join the non-empty meanings of each row into one comma-separated string.
supplement['core_meanings'] = supplement[meaning_columns].apply(
    lambda row: ', '.join(filter(None, row)), axis=1)

# The individual columns are no longer needed once merged.
supplement = supplement.drop(columns=meaning_columns)

### Add some example compounds

Get the most frequent words from JMdict.

In [30]:
# Attach the Wikipedia frequency rank to every JMdict entry, keep spellings
# longer than one character, and order them from most to least frequent.
# NOTE(review): this rebinds `df`, which held the kanji cross-reference above,
# so the earlier cells that read `df` cannot be re-run after this one.
df = (
    jmdict
    .merge(freqs, on='main_spelling', how='left')
    .loc[lambda words: words['main_spelling'].str.len() > 1]
    .sort_values('rank')
    .copy()
)

Find the kanji for each word.

In [31]:
# Character class used to pick the kanji out of a spelling.
kanji_block = r'[㐀-䶵一-鿋豈-頻]'

# dict.fromkeys drops repeated kanji while keeping first-seen order, so a word
# that contains the same kanji twice yields a single row for it after explode
# instead of a duplicated candidate.
df['kanji'] = df['main_spelling'].apply(
    lambda x: list(dict.fromkeys(re.findall(kanji_block, x))))

# One row per (word, kanji) pair.
df = df.explode('kanji', ignore_index=True)

Merge the kanji in the supplement to the words. This produces a list of candidate words to be used as example compounds. Some manual editing is needed here, because identifying the compounds to actually use just by frequency doesn't work. The list of candidate words is saved as `csv`. To use a given word, add an integer from 1 to 6 in the column `use_position`. Any necessary edits to the meaning and the reading should also be made here. When complete, save the file to a new csv, replacing `candidates` with `final` in the file name.

In [32]:
# Pair every supplement kanji with the words that contain it and add an empty
# `use_position` column to be filled in by hand.
words = supplement.merge(df, how='left', on='kanji').assign(use_position=None)

candidates_path = f'./data/wr_old_supplements/{filename}_candidates.csv'
words.to_csv(candidates_path, index=False)

Load back the manually edited words, pivot them, and merge them to the supplement.

In [None]:
# Read the hand-edited word list and keep the rows given a position from 1 to 6.
edited_words = pd.read_csv(f'./data/wr_old_supplements/{filename}_final.csv')
chosen_words = (
    edited_words
    .loc[edited_words['use_position'].between(1, 6),
         ['kanji', 'main_spelling', 'main_reading', 'meaning', 'use_position']]
    .sort_values(['kanji', 'use_position'])
    .astype({'use_position': int})
)


def _card_front(row):
    """Numbered spelling shown on the front of the card."""
    return f"{row['use_position']}. {row['main_spelling']}"


def _card_back(row):
    """Numbered reading and meaning shown on the back of the card."""
    return f"{row['use_position']}. {row['main_reading']}, {row['meaning']}"


chosen_words['front'] = chosen_words.apply(_card_front, axis=1)
chosen_words['back'] = chosen_words.apply(_card_back, axis=1)

# One row per kanji, with a front/back column pair for every position.
words_final = (
    chosen_words[['kanji', 'use_position', 'front', 'back']]
    .pivot(index='kanji', columns=['use_position'])
    .reset_index()
)

# Flatten the two-level column labels to e.g. 'front_1'; the index column
# comes out as 'kanji_' and is renamed back.
words_final.columns = [f'{col[0]}_{col[1]}' for col in words_final.columns]
words_final = words_final.rename(columns={'kanji_': 'kanji'})

words_final.head()

In [None]:
# Attach the example-compound columns to each kanji; kanji without chosen words keep empty cells.
supplement = supplement.merge(words_final, how='left', on='kanji')

### Add the link to the stroke order diagram
InDesign requires an `@` in front of the column name if it's an image.

In [None]:
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
stroke_order_dir = '/Users/gabriele/Code/kanji/data/wr_old_supplements'


def _stroke_order_path(k):
    """Path of the stroke order image for kanji `k`."""
    return f'{stroke_order_dir}/stroke_order_{k}.jpg'


supplement['@stroke_order_diagram'] = supplement['kanji'].map(_stroke_order_path)

### Export the final file
InDesign requires `UTF-16` encoding for all the Japanese characters to work.

In [None]:
# Manual override of the kunyomi shown for 虞 (no effect when it is not in the supplement).
is_target_kanji = supplement['kanji'].eq('虞')
supplement.loc[is_target_kanji, 'kunyomi'] = 'おそれ'

In [None]:
supplement.head()

In [None]:
# Write the supplement without the index, encoded as UTF-16.
supplement_path = f'./data/wr_old_supplements/{filename}_supplement.csv'
supplement.to_csv(supplement_path, index=False, encoding='UTF-16')