# Create compound flashcards

In [1]:
import datetime as dt
import hashlib
import random
import re
import string
import warnings

from gensim.models import KeyedVectors
import numpy as np
import pandas as pd
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import DBSCAN
from sklearn.exceptions import ConvergenceWarning
from sklearn.metrics.pairwise import cosine_similarity

## Select flashcard set to produce based on max grade of kanji used and frequency
- General idea: start with top 7% for each grade, then move to top 15%, then to top 25%.
    - For grade 1 go directly with 25%, because there are not so many words.

In [2]:
GRADE_FREQ_SETTINGS = {
    1.0: [1, 25],
    2.0: [2, 7],
    3.0: [3, 7],
    4.0: [4, 7],
    5.0: [5, 7],
    6.0: [6, 10],
    2.1: [2, 15],
    8.1: [8.1, 10],
    8.2: [8.2, 15],
    8.3: [8.3, 25],
    8.4: [8.4, 25],
    8.31: [8.3, 12]
}

SET_NUMBER = 8.31

MAX_GRADE = GRADE_FREQ_SETTINGS[SET_NUMBER][0]
MAX_QUANTILE = GRADE_FREQ_SETTINGS[SET_NUMBER][1]

## Load data

### Word frequencies from Wikipedia (2015)

In [3]:
freqs = pd.read_csv('./data/wikipedia-20150422-lemmas.tsv', delimiter='\t',
                    header=None, names=['rank', 'frequency', 'main_spelling'])
freqs.head()

Unnamed: 0,rank,frequency,main_spelling
0,1,34943515,の
1,2,19609102,に
2,3,17575121,する
3,4,16805949,は
4,5,16443314,を


### Kanjidic

In [4]:
kanji = pd.read_parquet('./data/kanjidic.parquet')
kanji = kanji.rename({'frequency': 'kanji_frequency'}, axis=1)
kanji.sample(n=5)

Unnamed: 0,kanji,jis208,jis212,jis213,unicode,radical,radical_name,stroke_count,grade,kanji_frequency,...,skip_code,onyomi,kunyomi,nanori,meanings,n_onyomi,n_nanori,n_kunyomi,n_kunyomi_distinct,jis_level
4674,皺,1-66-18,,,76BA,107,,15,,,...,1-10-5,"[シュウ, スウ]","[しわ, しぼ]",,"[wrinkles, creases, folds]",2,0,2,2,2
9292,瞯,,1-47-24,,77AF,109,,17,,,...,1-5-12,"[カン, ケン]","[うわめづかい, ながしめ]",,"[to peep, to spy]",2,0,2,2,0
7898,搊,,1-32-40,,640A,64,,13,,,...,1-3-10,"[シュウ, シュ, ス]",,,[pluck],3,0,0,0,0
12552,䁘,,,2-82-13,4058,109,,15,,,...,1-5-10,,,,"[to look at, to see, pretty eyes, farsightedne...",0,0,0,0,0
10763,豦,,1-62-85,,8C66,152,,13,,,...,3-4-9,"[キョ, ゴ]",[やめない],,"[wild boar, to fight]",2,0,1,1,0


### Kanji by Kanken level

In [5]:
kanken = pd.read_csv('./data/kanji_by_kanken_level.csv')
kanken.sample(n=5)

Unnamed: 0,kanji,kanken_level
3112,燗,1.0
6055,馨,1.5
987,壽,1.5
212,倏,1.0
100,付,7.0


### JMdict

In [6]:
jmdict = pd.read_parquet('./data/jmdict.parquet')

jmdict = jmdict[['main_spelling', 'main_reading', 'sense_no', 'meaning', 'part_of_speech', 'remarks']]

jmdict['sense_to_remove'] = jmdict['remarks'].apply(lambda x: np.sum(
    [r in x for r in ['obscure term', 'archaism', 'dated term',
                      'obsolete term', 'historical term']]) if x else 0)


In [7]:
jmdict = jmdict.loc[jmdict['sense_to_remove'] == 0].groupby(['main_spelling', 'main_reading']).agg({
    'meaning': lambda x: list(x),
    'part_of_speech': lambda x: list(x),
    'remarks': lambda x: list(x),
    'sense_no': 'count'
}).reset_index()

In [8]:
jmdict.loc[jmdict['sense_no'] > 1].sample(n=5)

Unnamed: 0,main_spelling,main_reading,meaning,part_of_speech,remarks,sense_no
16235,両君,りょうくん,"[two people, two monarchs, two rulers]","[noun (common) (futsuumeishi), noun (common) (...","[None, None]",2
107004,窒化,ちっか,"[nitridization, nitriding, nitride (of)]",[noun (common) (futsuumeishi); noun or partici...,"[None, None]",2
94629,爆燃,ばくねん,"[deflagration, knocking (e.g. in car engines)]","[noun (common) (futsuumeishi), noun (common) (...","[None, None]",2
147054,駐在,ちゅうざい,"[residence, stay, (job) posting, being station...",[noun (common) (futsuumeishi); noun or partici...,"[None, abbreviation]",2
47896,外角,がいかく,"[external angle, exterior angle, outside corner]","[noun (common) (futsuumeishi), noun (common) (...","[None, None]",2


### Example sentences

In [9]:
sentences = pd.read_parquet('./data/example_sentences_processed.parquet')
sentences['sense'] = sentences['sense'].fillna(0).astype(int)
sentences = (
    sentences.loc[sentences['sense'].isin([0, 1]), ['jpn_sentence', 'eng_sentence', 'lemma', 'reading']]
             .rename({'lemma': 'kanji'}, axis=1)
)
sentences.sample(n=10)

Unnamed: 0,jpn_sentence,eng_sentence,kanji,reading
192082,彼らはその計画を直ちに実行に移すべきだと提案した。,They suggested that we should put the plan int...,だ,
510042,自分が生きている世界との共感がなければ、作家であることは無益である。,It is no use being a writer if one is not in s...,世界,
1089002,ガスをつけっぱなしにするなんて私も不注意でした。,It was careless of me to leave the gas on.,です,
982457,そのような境遇であったにもかかわらず、彼は自分一人で道を切り開いた。,"Although he was in such circumstances, he made...",自分,
344826,彼の議論は説得力のあるものだったが、私達は彼を支持しなかった。,"Though his argument was convincing, we did not...",支持,
611379,私の夢は野球の選手です。,My dream is to be a baseball player.,野球,
1103263,イルカは人間に次いで最も知能が高く、やがては彼らとの対話も夢ではないと考えている科学者もいる。,Some scientists think that dolphins are the mo...,と,
167854,彼女には息子がいて、その息子は医者になった。,"She has a son, who became a doctor.",其の,
177540,彼らは幼稚すぎて学ぶことの必要性を理解できなかった。,They were too naive to understand the necessit...,理解,
333586,彼の髪は金髪で、彼は若く見える。,His hair is blond and he looks young.,は,


## Get most frequent compounds from JMdict

In [10]:
df = jmdict.merge(freqs, on='main_spelling', how='inner')

In [11]:
df = df.loc[df['main_spelling'].str.len() > 1].sort_values('rank').copy()

In [12]:
rank_quantiles = list(df['rank'].quantile(np.arange(0, 1, 0.01)).values)

In [13]:
df['rank_quantile'] = pd.cut(df['rank'], [0] + rank_quantiles + [np.Inf], labels=range(0, 101))

## Extract the kanji for each compound

In [14]:
kanji_block = r'[㐀-䶵一-鿋豈-頻]'
df['kanji'] = df['main_spelling'].apply(lambda x: re.findall(kanji_block, x))

df = df.explode('kanji', ignore_index=True)

In [15]:
df.sample(n=10)

Unnamed: 0,main_spelling,main_reading,meaning,part_of_speech,remarks,sense_no,rank,frequency,rank_quantile,kanji
86427,山気,さんき,[mountain air],[noun (common) (futsuumeishi)],[None],1,360369,10,90,気
32167,一子,いっし,"[one child, only child, one of several childre...","[noun (common) (futsuumeishi), noun (common) (...","[None, None, None, None]",4,33963,826,35,子
21061,斬首,ざんしゅ,"[decapitation, decapitated head]",[noun (common) (futsuumeishi); noun or partici...,"[None, None]",2,20554,1910,23,首
49466,エボラ出血熱,エボラしゅっけつねつ,[Ebola hemorrhagic fever],[noun (common) (futsuumeishi)],[None],1,63548,265,53,血
81671,不立文字,ふりゅうもんじ,[Buddhist revelation through intuitive discern...,"[expressions (phrases, clauses, etc.)]",[yojijukugo],1,251026,19,86,字
1662,週間,しゅうかん,[week],[noun (common) (futsuumeishi)],[None],1,1544,47536,2,週
93037,不適当,ふてきとう,"[inadequacy, inappropriateness, unfitness, imp...",[adjectival nouns or quasi-adjectives (keiyodo...,[None],1,707176,3,97,当
52339,極楽浄土,ごくらくじょうど,"[Amitabha's Pure Land, Sukhavati]",[noun (common) (futsuumeishi)],[None],1,69984,219,56,浄
90767,余徳,よとく,"[influence of great virtue, influence of ances...",[noun (common) (futsuumeishi)],[None],1,534188,5,95,余
45790,変わり果てる,かわりはてる,[to be completely changed],[Ichidan verb; intransitive verb],[None],1,56047,336,49,変


## Find compounds by kanji, grade and frequency

### Augment grades with Kanken levels

In [16]:
kanji = kanji.merge(kanken, on='kanji', how='left')

In [17]:
def augment_grade(row):
    if row['grade'] == 8:
        mapping = {
            2.0: 8.4,
            2.5: 8.3,
            3.0: 8.2,
            4.0: 8.1
        }
        return mapping[row['kanken_level']]
    return row['grade']

In [18]:
kanji['grade'] = kanji['grade'].fillna(11).astype(int)
kanji['grade'] = kanji[['grade', 'kanken_level']].apply(augment_grade, axis=1)

In [19]:
kanji['grade'].value_counts().sort_index()

1.0        80
2.0       160
3.0       200
4.0       202
5.0       193
6.0       191
8.1       313
8.2       284
8.3       328
8.4       185
9.0       651
10.0      212
11.0    10109
Name: grade, dtype: int64

### Merge most frequent words to kanji

In [20]:
kanji = kanji.merge(df, on='kanji', how='inner')

In [21]:
compounds = (
    kanji.groupby(['main_spelling', 'main_reading'])
         .agg({
             'grade': 'max',
             'rank_quantile': 'first',
             'meaning': 'first',
             'sense_no': 'first',
             'part_of_speech': 'first',
             'remarks': 'first'})
         .rename({'grade': 'max_grade'}, axis=1)
         .reset_index()
)


## Create the desired set of flashcards

In [22]:
v = compounds.loc[compounds['rank_quantile'] <= MAX_QUANTILE, 'max_grade'].value_counts().sort_index()
v

1.0     234
2.0     747
3.0     943
4.0     874
5.0     950
6.0     615
8.1     604
8.2     345
8.3     239
8.4      55
9.0      60
11.0     23
Name: max_grade, dtype: int64

In [23]:
v.loc[[1, 2, 3, 4, 5, 6, 8.1, 8.2, 8.3, 8.4]].sum()

5606

In [24]:
selection = (
    compounds.loc[(compounds['max_grade'] <= MAX_GRADE)
                  & (compounds['rank_quantile'] <= MAX_QUANTILE)]
             .sort_values('rank_quantile')
             .rename({'main_spelling': 'kanji', 'main_reading': 'reading',
                      'meaning': 'raw_meanings'}, axis=1)
).copy()

In [25]:
selection.sample(frac=0.01)

Unnamed: 0,kanji,reading,max_grade,rank_quantile,raw_meanings,sense_no,part_of_speech,remarks
44691,陸地,りくち,4.0,10,[land],1,[noun (common) (futsuumeishi); nouns which may...,[None]
42583,過剰,かじょう,8.3,9,"[excess, surplus, superabundance, overabundance]",1,[noun (common) (futsuumeishi); adjectival noun...,[None]
8998,参入,さんにゅう,4.0,8,"[entering (the marketplace), introducing (some...",2,[noun (common) (futsuumeishi); noun or partici...,"[None, None]"
23350,有効,ゆうこう,5.0,3,"[valid, effective, yuko (judo)]",2,[adjectival nouns or quasi-adjectives (keiyodo...,"[None, None]"
24911,権限,けんげん,6.0,6,"[power, authority, jurisdiction]",1,[noun (common) (futsuumeishi)],[None]
5907,全長,ぜんちょう,3.0,3,"[over-all length, span]",1,[noun (common) (futsuumeishi); nouns which may...,[None]
29357,犯す,おかす,5.0,9,"[to commit (e.g. crime), to perpetrate, to mak...",3,"[Godan verb with 'su' ending; transitive verb,...","[None, None, None]"
14989,寸前,すんぜん,6.0,11,"[just before, on the verge of, on the brink of...",2,"[noun (common) (futsuumeishi); noun, used as a...","[None, None]"
21011,掲げる,かかげる,8.2,5,"[to put up (a notice, sign, etc.), to hang out...",5,"[Ichidan verb; transitive verb, Ichidan verb; ...","[None, None, None, None, None]"
2490,中国,ちゅうごく,2.0,1,"[China, Chūgoku region of western Honshu (incl...",4,"[noun (common) (futsuumeishi), noun (common) (...","[None, abbreviation, None, None]"


In [26]:
len(selection)

5551

### Add the example senteces
Note for the merging. If no reading is provided in the example sentences (`reading_y` is `None`), then a match on the kanji spelling alone is ok. However if a reading is provided in the example senteces, that usually means that some disambiguation is needed: therefore the reading in the example sentences and the reading in the dictionary has to match.

In [27]:
subset = selection.merge(sentences, on='kanji', how='left')

subset.loc[(subset['reading_y'].notnull()) & (subset['reading_x'] != subset['reading_y']), 'jpn_sentence'] = None
subset.loc[(subset['reading_y'].notnull()) & (subset['reading_x'] != subset['reading_y']), 'eng_sentence'] = None

subset = (
    subset.drop('reading_y', axis=1)
          .rename({'reading_x': 'reading'}, axis=1)
)

In [28]:
subset = (
    subset.groupby(['kanji', 'reading'])
          .agg({
              'part_of_speech': 'first',
              'raw_meanings': 'first',
              'max_grade': 'first',
              'rank_quantile': 'first',
              'remarks': 'first',
              'jpn_sentence': lambda x: list(x),
              'eng_sentence': lambda x: list(x)
          })
          .reset_index()
)

In [29]:
len(subset)

5551

In [30]:
def get_sentence_indices(x):
    if len(x) == 1:
        indices = [0, None, None]
    elif len(x) == 2:
        indices = [0, 1, None]
    elif len(x) == 3:
        indices = [0, 1, 2]
    else:
        indices = random.sample(range(len(x)), 3)
    return indices


def extract_sentence(row, sentence_col, index):
    if row['sentence_indices'][index] is not None:
        return row[sentence_col][row['sentence_indices'][index]]
    else:
        return None    

In [31]:
subset['sentence_indices'] = subset['jpn_sentence'].apply(get_sentence_indices)

subset['jpn_sentence_1'] = subset.apply(lambda row: extract_sentence(row, 'jpn_sentence', 0), axis=1)
subset['eng_sentence_1'] = subset.apply(lambda row: extract_sentence(row, 'eng_sentence', 0), axis=1)

subset['jpn_sentence_2'] = subset.apply(lambda row: extract_sentence(row, 'jpn_sentence', 1), axis=1)
subset['eng_sentence_2'] = subset.apply(lambda row: extract_sentence(row, 'eng_sentence', 1), axis=1)

subset['jpn_sentence_3'] = subset.apply(lambda row: extract_sentence(row, 'jpn_sentence', 2), axis=1)
subset['eng_sentence_3'] = subset.apply(lambda row: extract_sentence(row, 'eng_sentence', 2), axis=1)

In [32]:
subset = subset.drop(['jpn_sentence', 'eng_sentence', 'sentence_indices'], axis=1)

### Use Google News embeddings and clustering to pick out representative meanings

In [33]:
word2vec = KeyedVectors.load_word2vec_format('./data/GoogleNews-vectors-negative300.bin.gz', binary=True)

In [34]:
def change_commas_in_parentheses(input_string: str) -> str:
    output_string = ''
    change_comma = True
    for char in input_string:
        if char == '(':
            change_comma = False
        if char == ',' and change_comma:
            output_string += ';'
            continue
        if char == ')':
            change_comma = True
        output_string += char
    return output_string
        

In [35]:
def remove_punctuation(input_string: str) -> str:
    return input_string.lower().translate(str.maketrans('', '', string.punctuation))

In [36]:
def find_representative_meanings(subgroup):
    meanings = [meaning
                for meaning_list in subgroup['raw_meanings'].values[0]
                for meaning in change_commas_in_parentheses(meaning_list).split('; ')]

    phrases = [remove_punctuation(m) for m in meanings]
    
    split_phrases = [words.split(' ') for words in phrases]
    split_phrases = [[w for w in phrase if word2vec.has_index_for(w)]
                     for phrase in split_phrases]
    split_phrases = [phrase for phrase in split_phrases if phrase]
    
    if len(split_phrases) <= 1:
        subgroup['clusters'] = str(meanings)
        subgroup['english'] = ', '.join(meanings)
        return subgroup
    
    vectors = [np.sum([word2vec.get_vector(w) for w in words], axis=0)
               for words in split_phrases]
    
    with warnings.catch_warnings():
        warnings.simplefilter('error', ConvergenceWarning)
        warnings.simplefilter('error', UserWarning)
        
        try:
            clustering_method = 'affinity'
            clustering = (
                AffinityPropagation(affinity='precomputed', random_state=2)
                .fit(cosine_similarity(vectors))
            )
            centers = clustering.cluster_centers_indices_
        
        except (ConvergenceWarning, UserWarning):
            print(f"Affinity propagation failed for {subgroup['kanji'].values[0]}"
                  ", switching to DBSCAN")
            
            clustering_method = 'dbscan'
            clustering = DBSCAN(eps=0.65, metric='cosine').fit(vectors)
            centers = []
    
    labels = clustering.labels_
    clusters = {c: [meanings[i] for i in range(len(split_phrases)) if labels[i] == c]
                for c in range(max(labels) + 1)}
    
    subgroup['clusters'] = f'{clustering_method} - {str(clusters)}'
    if len(centers):
        subgroup['english'] = ', '.join([meanings[c] for c in centers])
    elif max(labels) > -1:
        subgroup['english'] = ', '.join([c[0] for c in clusters.values()])
    else:
        subgroup['english'] = ', '.join(subgroup['raw_meanings'].values[0])
    
    return subgroup

In [37]:
# subgroup = subset.loc[subset['kanji'] == '図る']
# subgroup

In [38]:
# find_representative_meanings(subgroup)

In [39]:
subset = subset.groupby(['kanji', 'reading']).apply(find_representative_meanings)

Affinity propagation failed for ある程度, switching to DBSCAN
Affinity propagation failed for お互い, switching to DBSCAN
Affinity propagation failed for お前, switching to DBSCAN
Affinity propagation failed for お知らせ, switching to DBSCAN
Affinity propagation failed for その間, switching to DBSCAN
Affinity propagation failed for っ子, switching to DBSCAN
Affinity propagation failed for に従い, switching to DBSCAN
Affinity propagation failed for に際して, switching to DBSCAN
Affinity propagation failed for もう一度, switching to DBSCAN
Affinity propagation failed for 一員, switching to DBSCAN
Affinity propagation failed for 一斉, switching to DBSCAN
Affinity propagation failed for 一環, switching to DBSCAN
Affinity propagation failed for 一行, switching to DBSCAN
Affinity propagation failed for 丁目, switching to DBSCAN
Affinity propagation failed for 三つ, switching to DBSCAN
Affinity propagation failed for 三国志, switching to DBSCAN
Affinity propagation failed for 三男, switching to DBSCAN
Affinity propagation failed for 三角, 

Affinity propagation failed for 出典, switching to DBSCAN
Affinity propagation failed for 出張, switching to DBSCAN
Affinity propagation failed for 出張所, switching to DBSCAN
Affinity propagation failed for 出演, switching to DBSCAN
Affinity propagation failed for 出題, switching to DBSCAN
Affinity propagation failed for 分布, switching to DBSCAN
Affinity propagation failed for 切手, switching to DBSCAN
Affinity propagation failed for 刊行, switching to DBSCAN
Affinity propagation failed for 刑事, switching to DBSCAN
Affinity propagation failed for 刑務所, switching to DBSCAN
Affinity propagation failed for 刑法, switching to DBSCAN
Affinity propagation failed for 列島, switching to DBSCAN
Affinity propagation failed for 列車, switching to DBSCAN
Affinity propagation failed for 初代, switching to DBSCAN
Affinity propagation failed for 初恋, switching to DBSCAN
Affinity propagation failed for 初日, switching to DBSCAN
Affinity propagation failed for 初等, switching to DBSCAN
Affinity propagation failed for 判事, switching 

Affinity propagation failed for 基礎, switching to DBSCAN
Affinity propagation failed for 基金, switching to DBSCAN
Affinity propagation failed for 報じる, switching to DBSCAN
Affinity propagation failed for 報告, switching to DBSCAN
Affinity propagation failed for 場合, switching to DBSCAN
Affinity propagation failed for 塗装, switching to DBSCAN
Affinity propagation failed for 墓地, switching to DBSCAN
Affinity propagation failed for 増える, switching to DBSCAN
Affinity propagation failed for 増す, switching to DBSCAN
Affinity propagation failed for 増刊, switching to DBSCAN
Affinity propagation failed for 増大, switching to DBSCAN
Affinity propagation failed for 墜落, switching to DBSCAN
Affinity propagation failed for 声明, switching to DBSCAN
Affinity propagation failed for 変動, switching to DBSCAN
Affinity propagation failed for 変異, switching to DBSCAN
Affinity propagation failed for 夏休み, switching to DBSCAN
Affinity propagation failed for 夕方, switching to DBSCAN
Affinity propagation failed for 外科, switching

Affinity propagation failed for 建て, switching to DBSCAN
Affinity propagation failed for 建てる, switching to DBSCAN
Affinity propagation failed for 建立, switching to DBSCAN
Affinity propagation failed for 建築, switching to DBSCAN
Affinity propagation failed for 建設, switching to DBSCAN
Affinity propagation failed for 建造, switching to DBSCAN
Affinity propagation failed for 弁当, switching to DBSCAN
Affinity propagation failed for 弁護士, switching to DBSCAN
Affinity propagation failed for 式典, switching to DBSCAN
Affinity propagation failed for 引き継ぐ, switching to DBSCAN
Affinity propagation failed for 弦楽, switching to DBSCAN
Affinity propagation failed for 弦楽器, switching to DBSCAN
Affinity propagation failed for 張る, switching to DBSCAN
Affinity propagation failed for 強める, switching to DBSCAN
Affinity propagation failed for 強力, switching to DBSCAN
Affinity propagation failed for 弾薬, switching to DBSCAN
Affinity propagation failed for 当地, switching to DBSCAN
Affinity propagation failed for 当時, switch

Affinity propagation failed for 書店, switching to DBSCAN
Affinity propagation failed for 書物, switching to DBSCAN
Affinity propagation failed for 書籍, switching to DBSCAN
Affinity propagation failed for 書類, switching to DBSCAN
Affinity propagation failed for 最も, switching to DBSCAN
Affinity propagation failed for 最前線, switching to DBSCAN
Affinity propagation failed for 最多, switching to DBSCAN
Affinity propagation failed for 最長, switching to DBSCAN
Affinity propagation failed for 月光, switching to DBSCAN
Affinity propagation failed for 月刊, switching to DBSCAN
Affinity propagation failed for 月日, switching to DBSCAN
Affinity propagation failed for 月間, switching to DBSCAN
Affinity propagation failed for 有名, switching to DBSCAN
Affinity propagation failed for 有罪, switching to DBSCAN
Affinity propagation failed for 有限, switching to DBSCAN
Affinity propagation failed for 服装, switching to DBSCAN
Affinity propagation failed for 朗読, switching to DBSCAN
Affinity propagation failed for 木製, switching t

Affinity propagation failed for 特例, switching to DBSCAN
Affinity propagation failed for 特有, switching to DBSCAN
Affinity propagation failed for 特権, switching to DBSCAN
Affinity propagation failed for 特番, switching to DBSCAN
Affinity propagation failed for 特設, switching to DBSCAN
Affinity propagation failed for 犯罪, switching to DBSCAN
Affinity propagation failed for 独裁, switching to DBSCAN
Affinity propagation failed for 獲得, switching to DBSCAN
Affinity propagation failed for 王位, switching to DBSCAN
Affinity propagation failed for 王国, switching to DBSCAN
Affinity propagation failed for 王子, switching to DBSCAN
Affinity propagation failed for 球場, switching to DBSCAN
Affinity propagation failed for 理事, switching to DBSCAN
Affinity propagation failed for 理想, switching to DBSCAN
Affinity propagation failed for 環境, switching to DBSCAN
Affinity propagation failed for 生体, switching to DBSCAN
Affinity propagation failed for 生前, switching to DBSCAN
Affinity propagation failed for 生息, switching to

Affinity propagation failed for 自信, switching to DBSCAN
Affinity propagation failed for 自動, switching to DBSCAN
Affinity propagation failed for 自在, switching to DBSCAN
Affinity propagation failed for 自宅, switching to DBSCAN
Affinity propagation failed for 自己, switching to DBSCAN
Affinity propagation failed for 自慢, switching to DBSCAN
Affinity propagation failed for 自治, switching to DBSCAN
Affinity propagation failed for 自由, switching to DBSCAN
Affinity propagation failed for 自由民主党, switching to DBSCAN
Affinity propagation failed for 自衛, switching to DBSCAN
Affinity propagation failed for 自身, switching to DBSCAN
Affinity propagation failed for 自転車, switching to DBSCAN
Affinity propagation failed for 興業, switching to DBSCAN
Affinity propagation failed for 舞台, switching to DBSCAN
Affinity propagation failed for 舞踊, switching to DBSCAN
Affinity propagation failed for 航空, switching to DBSCAN
Affinity propagation failed for 航続, switching to DBSCAN
Affinity propagation failed for 艦船, switchin

Affinity propagation failed for 過激, switching to DBSCAN
Affinity propagation failed for 道路, switching to DBSCAN
Affinity propagation failed for 達す, switching to DBSCAN
Affinity propagation failed for 達人, switching to DBSCAN
Affinity propagation failed for 適す, switching to DBSCAN
Affinity propagation failed for 適用, switching to DBSCAN
Affinity propagation failed for 選ぶ, switching to DBSCAN
Affinity propagation failed for 選手権, switching to DBSCAN
Affinity propagation failed for 選書, switching to DBSCAN
Affinity propagation failed for 選考, switching to DBSCAN
Affinity propagation failed for 遺伝, switching to DBSCAN
Affinity propagation failed for 遺伝子, switching to DBSCAN
Affinity propagation failed for 遺体, switching to DBSCAN
Affinity propagation failed for 遺跡, switching to DBSCAN
Affinity propagation failed for 邸宅, switching to DBSCAN
Affinity propagation failed for 部員, switching to DBSCAN
Affinity propagation failed for 部数, switching to DBSCAN
Affinity propagation failed for 都立, switching 

### Consolidate part of speech and remarks

In [40]:
subset['remarks'] = subset['remarks'].apply(lambda x: ', '.join([str(s) for s in set(x)]))

In [41]:
subset['part_of_speech'] = subset['part_of_speech'].apply(lambda x: ', '.join([str(s) for s in set(x)]))

In [42]:
subset.sample(n=5)

Unnamed: 0,kanji,reading,part_of_speech,raw_meanings,max_grade,rank_quantile,remarks,jpn_sentence_1,eng_sentence_1,jpn_sentence_2,eng_sentence_2,jpn_sentence_3,eng_sentence_3,clusters,english
5455,飾り,かざり,"noun (common) (futsuumeishi); noun, used as a ...","[decoration, ornament, trimmings]",8.1,12,,それは飾りにすぎない。,It is merely an ornament.,その箱に飾りのテープをかけてください。,Do up the box with decorative tape.,,,"affinity - {0: ['decoration', 'ornament'], 1: ...","decoration, trimmings"
5241,開始,かいし,noun (common) (futsuumeishi); noun or particip...,"[start, commencement, beginning, initiation]",3.0,1,,警察は直ちにその事件の調査を開始した。,The police began to go into the matter in no t...,政府は工業の振興計画を開始した。,The government started a program to promote in...,警察は犯人の全国的な捜索を開始した。,The police have started a nationwide hunt for ...,dbscan - {},"start, commencement, beginning, initiation"
4895,辞典,じてん,noun (common) (futsuumeishi),"[dictionary, lexicon]",4.0,3,,私はいつも辞典を３冊手元に置いています。,I always keep three dictionaries at hand.,,,,,dbscan - {},"dictionary, lexicon"
4601,親善,しんぜん,noun (common) (futsuumeishi),"[friendship, goodwill, friendly relations, amity]",6.0,12,,,,,,,,"affinity - {0: ['friendly relations'], 1: ['fr...","friendly relations, amity"
4112,立地,りっち,noun (common) (futsuumeishi); noun or particip...,"[choosing a site (e.g. for industry), deciding...",2.0,7,,,,,,,,affinity - {0: ['choosing a site (e.g. for ind...,choosing a site (e.g. for industry)


## Add hints for duplicate kanji spellings

In [43]:
def create_reading_hints(subgroup):
    if len(subgroup) == 1:
        subgroup['hint'] = None
    else:
        for i in subgroup.index:
            hint = ', '.join(['not ' + subgroup.at[j, 'reading'] for j in subgroup.index if j!=i])
            subgroup.at[i, 'hint'] = hint
    return subgroup

In [44]:
subset = subset.groupby('kanji').apply(create_reading_hints)

## Add unique id

In [45]:
def create_compound_id(row):
    key = f"{row['kanji']} [{row['reading']}]"
    return hashlib.sha1(key.encode('utf-8')).hexdigest()

In [46]:
subset['id'] = subset.apply(create_compound_id, axis=1)

## Add tags

In [47]:
subset['quantile_bin'] = pd.cut(subset['rank_quantile'],
                                bins=[-1, 7, 15, 25, 100],
                                labels=['top_7', 'top_15', 'top_25', 'rest'])

In [48]:
def process_tags(row):
    if int(row['max_grade']) == row['max_grade']:
        max_grade_tag = str(int(row['max_grade']))
    else:
        max_grade_tag = str(row['max_grade']).replace('.', '_')
    return f'max_grade_{max_grade_tag} ' + row['quantile_bin']
    
    
subset['tags'] = subset.apply(process_tags, axis=1)

## Export final results

In [49]:
subset['last_updated_on'] = str(dt.date.today())

subset = subset[['id', 'kanji', 'hint', 'reading', 'part_of_speech', 'english',
                 'remarks', 'max_grade', 'rank_quantile', 'jpn_sentence_1', 'eng_sentence_1',
                 'jpn_sentence_2', 'eng_sentence_2', 'jpn_sentence_3', 'eng_sentence_3', 'raw_meanings', 'clusters',
                 'last_updated_on', 'tags']].copy()



In [50]:
assert np.sum(subset['id'].duplicated()) == 0
assert len(selection) == len(subset)

In [51]:
subset.sample(n=5)

Unnamed: 0,id,kanji,hint,reading,part_of_speech,english,remarks,max_grade,rank_quantile,jpn_sentence_1,eng_sentence_1,jpn_sentence_2,eng_sentence_2,jpn_sentence_3,eng_sentence_3,raw_meanings,clusters,last_updated_on,tags
1583,52bf5004c1b71590fc955023536c079fe1806c8f,売れる,,うれる,Ichidan verb; intransitive verb,"to sell (well), to be popular",,2.0,10,傘が良く売れる。,Umbrellas sell well.,この本は日本でよく売れた。,This book sold well in Japan.,若者を対象にした本が近頃良く売れる。,Books for young people sell well these days.,"[to sell (well), to be well known, to be popul...","affinity - {0: ['to sell (well)'], 1: ['to be ...",2021-06-27,max_grade_2 top_15
1292,6da2d5fd4bf0a68ebade24c8bd4a06766baeb8ca,同一,,どういつ,nouns which may take the genitive case particl...,"same, equal, without discrimination",,2.0,4,その結果、同一労働に対し同一賃金を得ている女性が増えつつある。,"As a result, more women are receiving equal work.",その結果、同一労働に対し同一賃金を得ている女性が増えつつある。,"As a result, more women are receiving equal work.",これは私が先日なくしたのと同一の鉛筆である。,This is the same pencil that I lost the other ...,"[identical, same, one and the same, equal, fai...","affinity - {0: ['identical', 'same', 'one and ...",2021-06-27,max_grade_2 top_7
4982,204e979e08cd332d9d559981af9e4db46ee504f9,通貨,,つうか,noun (common) (futsuumeishi),currency,,4.0,6,ある通貨の価値が下がると、その国の経済にインフレの影響をもたらす。,"When a currency depreciates, that has an infla...",欧州通貨はドルに対して弱くなった。,European currencies weakened against the dollar.,ドルは日本の通貨に対し１ドル３６０円から３０８円に切り下げられた。,The dollar was devalued against the Japanese c...,[currency],['currency'],2021-06-27,max_grade_4 top_7
203,822a9699999908e985e14e27a6ed536a4114e0e6,中学校,,ちゅうがっこう,noun (common) (futsuumeishi),"junior high school, lower secondary school",,1.0,1,この歌を聞くと私の中学校時代を思い出します。,This song reminds me of my junior high school ...,彼は１２歳の時、つまり中学校に入学した時、英語を習いはじめた。,He began to learn English when he was twelve y...,彼は中学校に入った。,He entered junior high school.,"[junior high school, middle school, lower seco...","affinity - {0: ['junior high school', 'middle ...",2021-06-27,max_grade_1 top_7
3269,9a092ca6c40c9cd500db7a7e55bf6f90d7a2c405,歌人,not うたびと,かじん,noun (common) (futsuumeishi),poet (of tanka poems),,2.0,10,,,,,,,[poet (of tanka poems)],['poet (of tanka poems)'],2021-06-27,max_grade_2 top_15


In [52]:
len(subset)

5551

In [53]:
subset.loc[subset['kanji'] == '保険', 'english'].values

array(['insurance, guarantee'], dtype=object)

In [54]:
subset.loc[subset['kanji'] == '保険', 'clusters'].values

array(['dbscan - {}'], dtype=object)

In [55]:
file_name = './data/compound_flashcards_lt{}_top_{}_pct.csv'.format(MAX_GRADE, MAX_QUANTILE)
subset.to_csv(file_name, index=False, header=False)

file_name

'./data/compound_flashcards_lt8.3_top_12_pct.csv'