# Create compound flashcards

In [1]:
import datetime as dt
import hashlib
import os
import random
import re
import string
import warnings

from gensim.models import KeyedVectors
import numpy as np
import pandas as pd
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import DBSCAN
from sklearn.exceptions import ConvergenceWarning
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Directory holding the input datasets and receiving the exported flashcards.
# Set the KANJIDATA_PATH environment variable to run the notebook on another machine;
# when it is unset the original local directory is used, so existing runs are unchanged.
DATA_PATH = os.environ.get(
    'KANJIDATA_PATH',
    '/Users/glillacci/OneDrive - Tesco/Personal/Data/kanjidata')

## Select flashcard set to produce based on max grade of kanji used and frequency
- General idea: start with top 7% for each grade, then move to top 15%, then to top 25%.
    - For grade 1 go directly with 25%, because there are not so many words.

In [3]:
# Available flashcard sets, keyed by set number. Each value is
# [maximum kanji grade, maximum rank quantile]: a compound is selected when the highest
# grade among its kanji and its frequency-rank quantile are both within these limits.
# Grades 8.1-8.4 are the Kanken-based subdivisions of grade 8 produced by `augment_grade`.
GRADE_FREQ_SETTINGS = {
    1.0: [1, 25],
    2.0: [2, 7],
    3.0: [3, 7],
    4.0: [4, 7],
    5.0: [5, 7],
    6.0: [6, 10],
    2.1: [2, 15],
    8.1: [8.1, 10],
    8.2: [8.2, 15],
    8.3: [8.3, 25],
    8.4: [8.4, 25],
    8.31: [8.3, 12],
    8.41: [8.4, 15]
}

# The set generated by this run; must be one of the keys above.
SET_NUMBER = 8.41

# Limits of the chosen set, used when selecting compounds and when naming the export file.
MAX_GRADE = GRADE_FREQ_SETTINGS[SET_NUMBER][0]
MAX_QUANTILE = GRADE_FREQ_SETTINGS[SET_NUMBER][1]

## Load data

### Word frequencies from Wikipedia (2015)

In [4]:
freqs = pd.read_csv(f'{DATA_PATH}/wikipedia-20150422-lemmas.tsv', delimiter='\t',
                    header=None, names=['rank', 'frequency', 'main_spelling'])
freqs.head()

Unnamed: 0,rank,frequency,main_spelling
0,1,34943515,の
1,2,19609102,に
2,3,17575121,する
3,4,16805949,は
4,5,16443314,を


### Kanjidic

In [5]:
kanji = pd.read_parquet(f'{DATA_PATH}/kanjidic.parquet')
kanji = kanji.rename({'frequency': 'kanji_frequency'}, axis=1)
kanji.sample(n=5)

Unnamed: 0,kanji,jis208,jis212,jis213,unicode,radical,radical_name,stroke_count,grade,kanji_frequency,...,skip_code,onyomi,kunyomi,nanori,meanings,n_onyomi,n_nanori,n_kunyomi,n_kunyomi_distinct,jis_level
10337,虣,,1-58-35,,8663,141,,16,,,...,1-8-8,[ホウ],[しいたげる],,"[cruel, violent, passionate]",1,0,1,1,0
7487,帇,,1-28-5,,5E07,50,,6,,,...,4-6-3,"[デン, ニョウ, ロウ]",[てわざがすばやい],,,3,0,1,1,0
287,界,1-19-6,,,754C,102,,9,3.0,,...,2-5-4,[カイ],,,"[world, boundary]",1,0,0,0,1
1057,燦,1-27-24,,,71E6,86,,17,9.0,,...,1-4-13,[サン],"[さん.たる, あき.らか, きらめ.く, きら.めく]",,[brilliant],1,0,4,4,1
622,緊,1-22-59,,,7DCA,120,,15,8.0,,...,2-9-6,[キン],"[し.める, し.まる]",,"[tense, solid, hard, reliable, tight]",1,0,2,1,1


### Kanji by Kanken level

In [6]:
kanken = pd.read_csv(f'{DATA_PATH}/kanji_by_kanken_level.csv')
kanken.sample(n=5)

Unnamed: 0,kanji,kanken_level
5607,鉄,8.0
147,作,9.0
2493,楼,3.0
569,厚,6.0
639,听,1.0


### JMdict

In [7]:
jmdict = pd.read_parquet(f'{DATA_PATH}/jmdict.parquet')

# Take an explicit copy of the column subset: the next statement adds a column, and doing
# that on a frame derived by column selection can trigger pandas' SettingWithCopyWarning.
jmdict = jmdict[['main_spelling', 'main_reading', 'sense_no', 'meaning', 'part_of_speech', 'remarks']].copy()

# Count how many "rare usage" markers occur in each sense's remarks (0 when there are no
# remarks). Senses with a non-zero count are excluded when the senses are aggregated.
jmdict['sense_to_remove'] = jmdict['remarks'].apply(lambda x: np.sum(
    [r in x for r in ['obscure term', 'archaism', 'dated term',
                      'obsolete term', 'historical term']]) if x else 0)


In [8]:
# Keep only the senses whose `sense_to_remove` count is zero, then collapse them into one
# row per (spelling, reading) pair: meanings, parts of speech and remarks become per-sense
# lists, and `sense_no` is replaced by the number of senses kept.
jmdict = jmdict.loc[jmdict['sense_to_remove'] == 0].groupby(['main_spelling', 'main_reading']).agg({
    'meaning': lambda x: list(x),
    'part_of_speech': lambda x: list(x),
    'remarks': lambda x: list(x),
    'sense_no': 'count'
}).reset_index()

In [9]:
jmdict.loc[jmdict['sense_no'] > 1].sample(n=5)

Unnamed: 0,main_spelling,main_reading,meaning,part_of_speech,remarks,sense_no
47965,外部結合,がいぶけつごう,"[outer join (e.g. in relational database), lef...","[noun (common) (futsuumeishi), noun (common) (...","[None, None]",2
90391,済ます,すます,"[to finish, to get it over with, to conclude, ...","[Godan verb with 'su' ending; transitive verb,...","[None, None, None]",3
33854,力作,りきさく,"[painstaking piece of work, work of great effo...","[noun (common) (futsuumeishi), noun (common) (...","[None, formal or literary term]",2
137105,重み,おもみ,"[weight, weight (e.g. of someone's words), bur...","[noun (common) (futsuumeishi), noun (common) (...","[None, None, None]",3
66148,情報,じょうほう,"[information, news, intelligence, advices, inf...","[noun (common) (futsuumeishi), noun (common) (...","[None, None]",2


### Example sentences

In [10]:
sentences = pd.read_parquet(f'{DATA_PATH}/example_sentences_processed.parquet')
# A missing sense number is treated as 0 so the column can be cast to int.
sentences['sense'] = sentences['sense'].fillna(0).astype(int)
# Keep only the sentences tagged with sense 0 or 1, and rename `lemma` to `kanji` so the
# column matches the key used when sentences are merged onto the selected compounds.
sentences = (
    sentences.loc[sentences['sense'].isin([0, 1]), ['jpn_sentence', 'eng_sentence', 'lemma', 'reading']]
             .rename({'lemma': 'kanji'}, axis=1)
)
sentences.sample(n=10)

Unnamed: 0,jpn_sentence,eng_sentence,kanji,reading
503939,社会は個人からなりたっている。,Society is composed of individuals.,成り立つ,
992503,そう思いますよ。,I suppose so.,思う,
570091,私は君が今もっている辞書と同じ辞書がほしい。,I want the same dictionary as you have.,同じ,
89031,父はめったにたばこを吸わない。,My father seldom smokes.,父,
573158,私は学科では美術と音楽が好きです。,I like art and music among my school subjects.,美術,
612524,私の部屋は南向きなので、冬でもそんなに寒くない。,"As my room faces south, it is not so cold even...",私,わたし
608261,私は鳩たちがその箱の上によじ登った褒美に餌を与えた。,I rewarded my pigeons with food for climbing o...,上,うえ
863616,ブルーのコートを着ている女の子は私の娘です。,The girl in a blue coat is my daughter.,娘,むすめ
296014,彼はとても不注意なので間違った列車に乗ってしまった。,He was careless enough to get on the wrong train.,不注意,
848595,メアリーがピアノを弾いているのが見えます。,I see Mary playing the piano.,弾く,ひく


## Get most frequent compounds from JMdict

In [11]:
df = jmdict.merge(freqs, on='main_spelling', how='inner')

In [12]:
df = df.loc[df['main_spelling'].str.len() > 1].sort_values('rank').copy()

In [13]:
rank_quantiles = list(df['rank'].quantile(np.arange(0, 1, 0.01)).values)

In [14]:
# Bin every rank into one of 101 buckets delimited by the percentile edges computed above.
# `np.inf` is used for the open upper edge: the `np.Inf` alias was removed in NumPy 2.0.
df['rank_quantile'] = pd.cut(df['rank'], [0] + rank_quantiles + [np.inf], labels=range(0, 101))

## Extract the kanji for each compound

In [15]:
# Character class made of three ranges meant to match kanji; any other character in a
# spelling (kana, Latin letters, ...) is ignored.
# NOTE(review): presumably these are the CJK ideograph blocks -- confirm the exact bounds
# before relying on them for characters outside the common set.
kanji_block = r'[㐀-䶵一-鿋豈-頻]'
# List the kanji found in each spelling, in order of appearance and with repeats kept.
df['kanji'] = df['main_spelling'].apply(lambda x: re.findall(kanji_block, x))

# One row per (word, kanji) pair; a spelling without any kanji yields one row with a
# missing `kanji` value.
df = df.explode('kanji', ignore_index=True)

In [16]:
df.sample(n=10)

Unnamed: 0,main_spelling,main_reading,meaning,part_of_speech,remarks,sense_no,rank,frequency,rank_quantile,kanji
71709,溺水,できすい,[drowning],[noun (common) (futsuumeishi)],[None],1,146359,51,76,水
88673,慨然,がいぜん,"[indignant, sorrowful, steadfast, determined]",['taru' adjective; adverb taking the 'to' part...,"[None, None]",2,441367,7,93,然
71754,御都合主義,ごつごうしゅぎ,"[opportunism, double standards, timeserving, e...",[noun (common) (futsuumeishi)],[None],1,146474,51,76,主
57516,痴人,ちじん,"[dunce, fool, idiot]",[noun (common) (futsuumeishi)],[None],1,83588,154,61,人
65231,物悲しい,ものがなしい,"[sad, melancholy]",[adjective (keiyoushi)],[None],1,111350,87,69,物
84937,磨き粉,みがきこ,[polishing powder],[noun (common) (futsuumeishi)],[None],1,323749,12,89,粉
17853,粘着,ねんちゃく,"[cohesion, adhesion]",[noun (common) (futsuumeishi); noun or partici...,[None],1,17088,2534,20,着
90617,大乗的,だいじょうてき,"[broad, great]",[adjectival nouns or quasi-adjectives (keiyodo...,[None],1,532932,5,95,大
42488,申し分,もうしぶん,"[objection, shortcomings]",[noun (common) (futsuumeishi); noun or partici...,[None],1,49879,416,46,分
33783,典侍,てんじ,"[maid of honor, maid of honour, lady-in-waiting]",[noun (common) (futsuumeishi)],[None],1,36138,736,37,侍


## Find compounds by kanji, grade and frequency

### Augment grades with Kanken levels

In [17]:
kanji = kanji.merge(kanken, on='kanji', how='left')

In [18]:
def augment_grade(row):
    """Split grade 8 into the sub-grades 8.1-8.4 according to the Kanken level.

    Args:
        row: mapping-like object with the keys ``grade`` and ``kanken_level``.

    Returns:
        ``row['grade']`` unchanged unless it equals 8, in which case the sub-grade
        associated with ``row['kanken_level']`` is returned.

    Raises:
        KeyError: if the grade is 8 and the Kanken level is not 2.0, 2.5, 3.0 or 4.0.
    """
    grade = row['grade']
    if grade != 8:
        return grade

    # A numerically lower Kanken level maps to a higher sub-grade.
    subgrade_by_kanken_level = {4.0: 8.1, 3.0: 8.2, 2.5: 8.3, 2.0: 8.4}
    return subgrade_by_kanken_level[row['kanken_level']]

In [19]:
# Kanji without a grade get the sentinel value 11 so the column can be cast to int.
kanji['grade'] = kanji['grade'].fillna(11).astype(int)
# Replace grade 8 by the Kanken-based sub-grades 8.1-8.4 (see `augment_grade`).
kanji['grade'] = kanji[['grade', 'kanken_level']].apply(augment_grade, axis=1)

In [20]:
kanji['grade'].value_counts().sort_index()

1.0        80
2.0       160
3.0       200
4.0       202
5.0       193
6.0       191
8.1       313
8.2       284
8.3       328
8.4       185
9.0       651
10.0      212
11.0    10109
Name: grade, dtype: int64

### Merge most frequent words to kanji

In [21]:
kanji = kanji.merge(df, on='kanji', how='inner')

In [22]:
# `kanji` holds one row per (word, kanji) pair at this point. Collapse it to one row per
# word: `max_grade` is the highest grade among the word's kanji, while the word-level
# columns are the same on every row of a group, so the first value is taken.
compounds = (
    kanji.groupby(['main_spelling', 'main_reading'])
         .agg({
             'grade': 'max',
             'rank_quantile': 'first',
             'meaning': 'first',
             'sense_no': 'first',
             'part_of_speech': 'first',
             'remarks': 'first'})
         .rename({'grade': 'max_grade'}, axis=1)
         .reset_index()
)


## Create the desired set of flashcards

In [23]:
v = compounds.loc[compounds['rank_quantile'] <= MAX_QUANTILE, 'max_grade'].value_counts().sort_index()
v

1.0      277
2.0      896
3.0     1171
4.0     1055
5.0     1145
6.0      778
8.1      814
8.2      465
8.3      319
8.4       69
9.0       88
11.0      34
Name: max_grade, dtype: int64

In [24]:
v.loc[[1, 2, 3, 4, 5, 6, 8.1, 8.2, 8.3, 8.4]].sum()

6989

In [25]:
# Keep the compounds within both limits of the chosen set, ordered by rank quantile, and
# rename the columns to the flashcard field names. From here on `kanji` holds the whole
# word spelling, which is also the key shared with the example sentences.
selection = (
    compounds.loc[(compounds['max_grade'] <= MAX_GRADE)
                  & (compounds['rank_quantile'] <= MAX_QUANTILE)]
             .sort_values('rank_quantile')
             .rename({'main_spelling': 'kanji', 'main_reading': 'reading',
                      'meaning': 'raw_meanings'}, axis=1)
).copy()

In [26]:
selection.sample(frac=0.01)

Unnamed: 0,kanji,reading,max_grade,rank_quantile,raw_meanings,sense_no,part_of_speech,remarks
17629,形状,けいじょう,5.0,3,"[shape, form]",1,[noun (common) (futsuumeishi)],[None]
19257,感謝,かんしゃ,5.0,6,"[thanks, gratitude]",1,[noun (common) (futsuumeishi); noun or partici...,[None]
11715,城壁,じょうへき,8.1,15,"[rampart, castle walls]",1,[noun (common) (futsuumeishi); nouns which may...,[None]
16704,年度,ねんど,3.0,1,[fiscal year (usu. April 1 to March 31 in Japa...,3,"[noun (common) (futsuumeishi); noun, used as a...","[None, None, None]"
20401,拍子,ひょうし,8.1,11,"[(musical) time, tempo, beat, rhythm, the mome...",2,"[noun (common) (futsuumeishi), noun (common) (...","[None, None]"
...,...,...,...,...,...,...,...,...
19117,意義,いぎ,5.0,10,"[meaning, significance]",1,[noun (common) (futsuumeishi)],[None]
36830,艦載,かんさい,8.3,12,[carrying aboard a warship],1,[noun (common) (futsuumeishi); noun or partici...,[None]
17722,往来,おうらい,5.0,14,"[coming and going, traffic, road, street, asso...",5,[noun (common) (futsuumeishi); noun or partici...,"[None, None, None, None, None]"
42396,進級,しんきゅう,3.0,15,"[promotion (school, military, etc.)]",1,[noun (common) (futsuumeishi); noun or partici...,[None]


In [27]:
len(selection)

6989

### Add the example sentences
Note on the merge: if an example sentence provides no reading (`reading_y` is `None`), a match on the kanji spelling alone is sufficient. If the example sentence does provide a reading, that usually means some disambiguation is needed, so the reading in the example sentence must match the reading in the dictionary.

In [28]:
subset = selection.merge(sentences, on='kanji', how='left')

# A sentence that states a reading is only valid for the dictionary entry with that same
# reading; blank out the sentence pair on every row where the two readings disagree.
reading_mismatch = subset['reading_y'].notnull() & (subset['reading_x'] != subset['reading_y'])
for sentence_col in ('jpn_sentence', 'eng_sentence'):
    subset.loc[reading_mismatch, sentence_col] = None

# Only the dictionary reading is kept from here on.
subset = subset.drop('reading_y', axis=1).rename({'reading_x': 'reading'}, axis=1)

In [29]:
# Collapse back to one row per (kanji, reading). Word-level columns are identical within
# a group, so the first value is taken; the Japanese and English sentences are gathered
# into two lists built from the same rows (blanked-out sentences remain as null entries).
subset = (
    subset.groupby(['kanji', 'reading'])
          .agg({
              'part_of_speech': 'first',
              'raw_meanings': 'first',
              'max_grade': 'first',
              'rank_quantile': 'first',
              'remarks': 'first',
              'jpn_sentence': lambda x: list(x),
              'eng_sentence': lambda x: list(x)
          })
          .reset_index()
)

In [30]:
len(subset)

6989

In [31]:
def get_sentence_indices(x):
    """Choose the positions of up to three usable example sentences.

    Args:
        x: list of example sentences for one word. Entries may be null (``None`` or NaN)
            when the word has no sentence or when a sentence has been blanked out.

    Returns:
        A list of exactly three items: positions in ``x`` of non-null sentences, padded
        with ``None`` when fewer than three are available. When more than three usable
        sentences exist, three of them are drawn with ``random.sample``.
    """
    # Only non-null entries are eligible, so a blanked-out sentence can never occupy a
    # slot that a real sentence could fill, and an empty list no longer raises.
    usable = [position for position, sentence in enumerate(x) if not pd.isnull(sentence)]
    if len(usable) > 3:
        return random.sample(usable, 3)
    return usable + [None] * (3 - len(usable))


def extract_sentence(row, sentence_col, index):
    """Return the sentence stored in slot ``index`` of a row, or ``None`` if the slot is empty.

    Args:
        row: mapping-like object holding ``sentence_indices`` and the list ``sentence_col``.
        sentence_col: name of the column that contains the list of sentences.
        index: which entry of ``row['sentence_indices']`` to resolve.

    Returns:
        The element of ``row[sentence_col]`` at the position stored in the slot, or
        ``None`` when the slot holds ``None``.
    """
    position = row['sentence_indices'][index]
    return None if position is None else row[sentence_col][position]

In [32]:
# Seed the sampler so the sentences picked for each card are the same on every run.
random.seed(0)
subset['sentence_indices'] = subset['jpn_sentence'].apply(get_sentence_indices)

# Materialise the three chosen sentence pairs as jpn_sentence_1 / eng_sentence_1, ...,
# jpn_sentence_3 / eng_sentence_3. Both languages use the same positions.
for slot in range(3):
    for language in ('jpn', 'eng'):
        source_col = f'{language}_sentence'
        subset[f'{source_col}_{slot + 1}'] = subset.apply(
            lambda row: extract_sentence(row, source_col, slot), axis=1)

In [33]:
subset = subset.drop(['jpn_sentence', 'eng_sentence', 'sentence_indices'], axis=1)

### Use Google News embeddings and clustering to pick out representative meanings

In [34]:
word2vec = KeyedVectors.load_word2vec_format(f'{DATA_PATH}/GoogleNews-vectors-negative300.bin.gz', binary=True)

In [35]:
def change_commas_in_parentheses(input_string: str) -> str:
    """Replace every comma that lies outside parentheses with a semicolon.

    Commas inside parentheses are left untouched, so the result can be split on ';'
    without breaking up parenthesised remarks. Nesting is tracked with a depth counter,
    so a comma inside an outer parenthesis is preserved even after an inner group has
    been closed. An unmatched ')' is ignored for depth purposes.

    Args:
        input_string: text to process.

    Returns:
        A copy of ``input_string`` with the top-level commas replaced by ';'.
    """
    depth = 0
    pieces = []
    for char in input_string:
        if char == '(':
            depth += 1
        elif char == ')':
            # Never go below zero, so a stray ')' cannot protect later commas.
            depth = max(depth - 1, 0)
        elif char == ',' and depth == 0:
            pieces.append(';')
            continue
        pieces.append(char)
    return ''.join(pieces)
        

In [36]:
def remove_punctuation(input_string: str) -> str:
    """Lower-case the text and delete every character listed in ``string.punctuation``.

    Args:
        input_string: text to normalise.

    Returns:
        The lower-cased text without ASCII punctuation; whitespace is left as is.
    """
    lowered = input_string.lower()
    return ''.join(char for char in lowered if char not in string.punctuation)

In [37]:
def find_representative_meanings(subgroup):
    """Pick a short list of representative English meanings for one compound.

    Intended for ``groupby(...).apply``: ``subgroup`` is the frame of a single
    (kanji, reading) pair, and only the ``raw_meanings`` of its first row is read.

    Every sense string is split into individual meanings on the commas outside
    parentheses. Each meaning is embedded as the sum of the word2vec vectors of its
    in-vocabulary words, and the embedded meanings are clustered:

    * affinity propagation on the cosine-similarity matrix is tried first, and the
      meanings at the cluster centres are kept;
    * if that emits a ``ConvergenceWarning`` or any other ``UserWarning``, DBSCAN is used
      instead and the first meaning of every cluster is kept;
    * if DBSCAN finds no cluster at all, all raw meanings are kept.

    When at most one meaning can be embedded, all meanings are kept without clustering.

    Args:
        subgroup: DataFrame with at least the ``kanji`` and ``raw_meanings`` columns.

    Returns:
        ``subgroup`` itself with two columns added: ``clusters`` (a debug string that
        describes the clustering) and ``english`` (the representative meanings joined
        with ', ').
    """
    raw_meanings = subgroup['raw_meanings'].values[0]
    meanings = [meaning
                for meaning_list in raw_meanings
                for meaning in change_commas_in_parentheses(meaning_list).split('; ')]

    # Reduce every meaning to the words known to the embedding model.
    tokenized = [[w for w in remove_punctuation(m).split(' ') if word2vec.has_index_for(w)]
                 for m in meanings]

    # Discard the meanings without any known word, keeping each surviving meaning paired
    # with its words. Cluster labels and centre indices refer to positions in this
    # filtered list, so indexing the unfiltered `meanings` with them would select the
    # wrong entries whenever something was discarded.
    embeddable = [(m, words) for m, words in zip(meanings, tokenized) if words]

    if len(embeddable) <= 1:
        subgroup['clusters'] = str(meanings)
        subgroup['english'] = ', '.join(meanings)
        return subgroup

    candidates = [m for m, _ in embeddable]
    vectors = [np.sum([word2vec.get_vector(w) for w in words], axis=0)
               for _, words in embeddable]

    with warnings.catch_warnings():
        # Escalate the warnings to exceptions so that a failed affinity propagation can
        # be caught below and replaced by the fallback.
        warnings.simplefilter('error', ConvergenceWarning)
        warnings.simplefilter('error', UserWarning)

        try:
            clustering_method = 'affinity'
            clustering = (
                AffinityPropagation(affinity='precomputed', random_state=2)
                .fit(cosine_similarity(vectors))
            )
            centers = clustering.cluster_centers_indices_

        except (ConvergenceWarning, UserWarning):
            print(f"Affinity propagation failed for {subgroup['kanji'].values[0]}"
                  ", switching to DBSCAN")

            clustering_method = 'dbscan'
            # NOTE(review): `min_samples` is not passed, so scikit-learn's default
            # applies -- confirm that it suits the handful of meanings one word has.
            clustering = DBSCAN(eps=0.65, metric='cosine').fit(vectors)
            centers = []

    labels = clustering.labels_
    # Negative labels (DBSCAN noise) never match a cluster id in the range below.
    clusters = {c: [candidates[i] for i in range(len(candidates)) if labels[i] == c]
                for c in range(max(labels) + 1)}

    subgroup['clusters'] = f'{clustering_method} - {str(clusters)}'
    if len(centers):
        subgroup['english'] = ', '.join([candidates[c] for c in centers])
    elif max(labels) > -1:
        subgroup['english'] = ', '.join([c[0] for c in clusters.values()])
    else:
        subgroup['english'] = ', '.join(raw_meanings)

    return subgroup

In [38]:
# subgroup = subset.loc[subset['kanji'] == '図る']
# subgroup

In [39]:
# find_representative_meanings(subgroup)

In [40]:
# `group_keys=False` keeps the original row index: pandas 2.x would otherwise prepend the
# group keys to the index, leaving `kanji`/`reading` as both index levels and columns.
subset = subset.groupby(['kanji', 'reading'], group_keys=False).apply(find_representative_meanings)

Affinity propagation failed for ある程度, switching to DBSCAN
Affinity propagation failed for お互い, switching to DBSCAN
Affinity propagation failed for お前, switching to DBSCAN
Affinity propagation failed for お知らせ, switching to DBSCAN
Affinity propagation failed for その間, switching to DBSCAN
Affinity propagation failed for っ子, switching to DBSCAN
Affinity propagation failed for に当たって, switching to DBSCAN
Affinity propagation failed for に従い, switching to DBSCAN
Affinity propagation failed for に際して, switching to DBSCAN
Affinity propagation failed for もう一度, switching to DBSCAN
Affinity propagation failed for を以て, switching to DBSCAN
Affinity propagation failed for アメリカ大陸, switching to DBSCAN
Affinity propagation failed for 一味, switching to DBSCAN
Affinity propagation failed for 一員, switching to DBSCAN
Affinity propagation failed for 一大, switching to DBSCAN
Affinity propagation failed for 一斉, switching to DBSCAN
Affinity propagation failed for 一樹, switching to DBSCAN
Affinity propagation failed f

Affinity propagation failed for 全日本, switching to DBSCAN
Affinity propagation failed for 全曲, switching to DBSCAN
Affinity propagation failed for 全権, switching to DBSCAN
Affinity propagation failed for 全編, switching to DBSCAN
Affinity propagation failed for 全長, switching to DBSCAN
Affinity propagation failed for 全面, switching to DBSCAN
Affinity propagation failed for 全高, switching to DBSCAN
Affinity propagation failed for 公共, switching to DBSCAN
Affinity propagation failed for 公司, switching to DBSCAN
Affinity propagation failed for 公団, switching to DBSCAN
Affinity propagation failed for 公安, switching to DBSCAN
Affinity propagation failed for 公文書, switching to DBSCAN
Affinity propagation failed for 公演, switching to DBSCAN
Affinity propagation failed for 公爵, switching to DBSCAN
Affinity propagation failed for 公的, switching to DBSCAN
Affinity propagation failed for 公示, switching to DBSCAN
Affinity propagation failed for 公衆, switching to DBSCAN
Affinity propagation failed for 公表, switching 

Affinity propagation failed for 同行, switching to DBSCAN
Affinity propagation failed for 名人, switching to DBSCAN
Affinity propagation failed for 名称, switching to DBSCAN
Affinity propagation failed for 名跡, switching to DBSCAN
Affinity propagation failed for 名鑑, switching to DBSCAN
Affinity propagation failed for 向ける, switching to DBSCAN
Affinity propagation failed for 吸血鬼, switching to DBSCAN
Affinity propagation failed for 吹奏楽, switching to DBSCAN
Affinity propagation failed for 吹雪, switching to DBSCAN
Affinity propagation failed for 告知, switching to DBSCAN
Affinity propagation failed for 告示, switching to DBSCAN
Affinity propagation failed for 周期, switching to DBSCAN
Affinity propagation failed for 命中, switching to DBSCAN
Affinity propagation failed for 命名, switching to DBSCAN
Affinity propagation failed for 和書, switching to DBSCAN
Affinity propagation failed for 哺乳類, switching to DBSCAN
Affinity propagation failed for 商会, switching to DBSCAN
Affinity propagation failed for 商号, switchin

Affinity propagation failed for 学芸, switching to DBSCAN
Affinity propagation failed for 学部, switching to DBSCAN
Affinity propagation failed for 学院, switching to DBSCAN
Affinity propagation failed for 宅地, switching to DBSCAN
Affinity propagation failed for 安全, switching to DBSCAN
Affinity propagation failed for 安心, switching to DBSCAN
Affinity propagation failed for 安置, switching to DBSCAN
Affinity propagation failed for 完了, switching to DBSCAN
Affinity propagation failed for 完全, switching to DBSCAN
Affinity propagation failed for 完結, switching to DBSCAN
Affinity propagation failed for 宗家, switching to DBSCAN
Affinity propagation failed for 官位, switching to DBSCAN
Affinity propagation failed for 官報, switching to DBSCAN
Affinity propagation failed for 定か, switching to DBSCAN
Affinity propagation failed for 定住, switching to DBSCAN
Affinity propagation failed for 定時, switching to DBSCAN
Affinity propagation failed for 定理, switching to DBSCAN
Affinity propagation failed for 実子, switching to

Affinity propagation failed for 強行, switching to DBSCAN
Affinity propagation failed for 弾薬, switching to DBSCAN
Affinity propagation failed for 当地, switching to DBSCAN
Affinity propagation failed for 当時, switching to DBSCAN
Affinity propagation failed for 形状, switching to DBSCAN
Affinity propagation failed for 役所, switching to DBSCAN
Affinity propagation failed for 彼ら, switching to DBSCAN
Affinity propagation failed for 待つ, switching to DBSCAN
Affinity propagation failed for 後半, switching to DBSCAN
Affinity propagation failed for 後年, switching to DBSCAN
Affinity propagation failed for 後援, switching to DBSCAN
Affinity propagation failed for 後継, switching to DBSCAN
Affinity propagation failed for 後述, switching to DBSCAN
Affinity propagation failed for 後部, switching to DBSCAN
Affinity propagation failed for 徒歩, switching to DBSCAN
Affinity propagation failed for 従属, switching to DBSCAN
Affinity propagation failed for 得票, switching to DBSCAN
Affinity propagation failed for 御殿, switching to

Affinity propagation failed for 明日, switching to DBSCAN
Affinity propagation failed for 明記, switching to DBSCAN
Affinity propagation failed for 映画, switching to DBSCAN
Affinity propagation failed for 春日, switching to DBSCAN
Affinity propagation failed for 時には, switching to DBSCAN
Affinity propagation failed for 時事, switching to DBSCAN
Affinity propagation failed for 時点, switching to DBSCAN
Affinity propagation failed for 晩年, switching to DBSCAN
Affinity propagation failed for 暴力, switching to DBSCAN
Affinity propagation failed for 曲名, switching to DBSCAN
Affinity propagation failed for 書店, switching to DBSCAN
Affinity propagation failed for 書物, switching to DBSCAN
Affinity propagation failed for 書籍, switching to DBSCAN
Affinity propagation failed for 書類, switching to DBSCAN
Affinity propagation failed for 最も, switching to DBSCAN
Affinity propagation failed for 最前線, switching to DBSCAN
Affinity propagation failed for 最多, switching to DBSCAN
Affinity propagation failed for 最寄り, switching

Affinity propagation failed for 派閥, switching to DBSCAN
Affinity propagation failed for 流入, switching to DBSCAN
Affinity propagation failed for 流星, switching to DBSCAN
Affinity propagation failed for 浅見, switching to DBSCAN
Affinity propagation failed for 海上, switching to DBSCAN
Affinity propagation failed for 海兵, switching to DBSCAN
Affinity propagation failed for 海岸, switching to DBSCAN
Affinity propagation failed for 海峡, switching to DBSCAN
Affinity propagation failed for 海水, switching to DBSCAN
Affinity propagation failed for 海運, switching to DBSCAN
Affinity propagation failed for 海面, switching to DBSCAN
Affinity propagation failed for 消火, switching to DBSCAN
Affinity propagation failed for 消費, switching to DBSCAN
Affinity propagation failed for 淡路, switching to DBSCAN
Affinity propagation failed for 混血, switching to DBSCAN
Affinity propagation failed for 添加, switching to DBSCAN
Affinity propagation failed for 清水, switching to DBSCAN
Affinity propagation failed for 測量, switching to

Affinity propagation failed for 破産, switching to DBSCAN
Affinity propagation failed for 硫酸, switching to DBSCAN
Affinity propagation failed for 硬貨, switching to DBSCAN
Affinity propagation failed for 確立, switching to DBSCAN
Affinity propagation failed for 社名, switching to DBSCAN
Affinity propagation failed for 社団, switching to DBSCAN
Affinity propagation failed for 祈り, switching to DBSCAN
Affinity propagation failed for 祈る, switching to DBSCAN
Affinity propagation failed for 祈願, switching to DBSCAN
Affinity propagation failed for 祝日, switching to DBSCAN
Affinity propagation failed for 神宮, switching to DBSCAN
Affinity propagation failed for 神話, switching to DBSCAN
Affinity propagation failed for 福音, switching to DBSCAN
Affinity propagation failed for 私的, switching to DBSCAN
Affinity propagation failed for 秋季, switching to DBSCAN
Affinity propagation failed for 秋月, switching to DBSCAN
Affinity propagation failed for 秘める, switching to DBSCAN
Affinity propagation failed for 秘宝, switching t

Affinity propagation failed for 表す, switching to DBSCAN
Affinity propagation failed for 表彰台, switching to DBSCAN
Affinity propagation failed for 被告, switching to DBSCAN
Affinity propagation failed for 被子植物, switching to DBSCAN
Affinity propagation failed for 被災, switching to DBSCAN
Affinity propagation failed for 裁判, switching to DBSCAN
Affinity propagation failed for 裁判所, switching to DBSCAN
Affinity propagation failed for 装飾, switching to DBSCAN
Affinity propagation failed for 補償, switching to DBSCAN
Affinity propagation failed for 補強, switching to DBSCAN
Affinity propagation failed for 補足, switching to DBSCAN
Affinity propagation failed for 製作, switching to DBSCAN
Affinity propagation failed for 製薬, switching to DBSCAN
Affinity propagation failed for 製造, switching to DBSCAN
Affinity propagation failed for 複線, switching to DBSCAN
Affinity propagation failed for 褐色, switching to DBSCAN
Affinity propagation failed for 西ドイツ, switching to DBSCAN
Affinity propagation failed for 西側, switch

Affinity propagation failed for 遂行, switching to DBSCAN
Affinity propagation failed for 遅延, switching to DBSCAN
Affinity propagation failed for 遊び, switching to DBSCAN
Affinity propagation failed for 遊園, switching to DBSCAN
Affinity propagation failed for 遊撃手, switching to DBSCAN
Affinity propagation failed for 運動会, switching to DBSCAN
Affinity propagation failed for 運河, switching to DBSCAN
Affinity propagation failed for 過激, switching to DBSCAN
Affinity propagation failed for 道路, switching to DBSCAN
Affinity propagation failed for 達す, switching to DBSCAN
Affinity propagation failed for 達人, switching to DBSCAN
Affinity propagation failed for 遠距離, switching to DBSCAN
Affinity propagation failed for 適す, switching to DBSCAN
Affinity propagation failed for 適正, switching to DBSCAN
Affinity propagation failed for 適用, switching to DBSCAN
Affinity propagation failed for 選ぶ, switching to DBSCAN
Affinity propagation failed for 選手権, switching to DBSCAN
Affinity propagation failed for 選曲, switchin

Affinity propagation failed for 鳥居, switching to DBSCAN
Affinity propagation failed for 鹿毛, switching to DBSCAN
Affinity propagation failed for 黄色, switching to DBSCAN
Affinity propagation failed for 黙示録, switching to DBSCAN


### Consolidate part of speech and remarks

In [41]:
# Join the distinct remarks in first-seen order and skip senses without a remark, so the
# exported text is stable across runs and never contains the literal string 'None'.
subset['remarks'] = subset['remarks'].apply(
    lambda x: ', '.join(dict.fromkeys(str(s) for s in x if pd.notnull(s))))

In [42]:
# Join the distinct parts of speech in first-seen order; iterating over a `set` would make
# the order, and therefore the exported text, vary from one run to the next.
subset['part_of_speech'] = subset['part_of_speech'].apply(
    lambda x: ', '.join(dict.fromkeys(str(s) for s in x)))

In [43]:
subset.sample(n=5)

Unnamed: 0,kanji,reading,part_of_speech,raw_meanings,max_grade,rank_quantile,remarks,jpn_sentence_1,eng_sentence_1,jpn_sentence_2,eng_sentence_2,jpn_sentence_3,eng_sentence_3,clusters,english
2802,引き換え,ひきかえ,noun (common) (futsuumeishi),"[exchange, conversion]",8.2,15,,彼はケーキ１切れと引き換えにオレンジ１個を私にくれた。,He gave me an orange in exchange for a piece o...,彼はお金と引換に切符を受け取った。,He received a ticket in return for the money.,,,dbscan - {},"exchange, conversion"
6303,連携,れんけい,noun (common) (futsuumeishi); noun or particip...,"[cooperation, coordination, link]",8.2,4,,,,,,,,"affinity - {0: ['cooperation', 'coordination']...","cooperation, link"
2871,役所,やくどころ,noun (common) (futsuumeishi),"[assigned role, duty, position, suitable role,...",3.0,10,,,,,,,,"affinity - {0: ['duty'], 1: ['position', 'best...","duty, position, suitable role"
2678,幕府,ばくふ,noun (common) (futsuumeishi),"[bakufu, shogunate]",6.0,2,,徳川幕府は１８６８年に終わりを告げた。,The Tokugawa Shogunate came to an end in 1868.,,,,,"['bakufu', 'shogunate']","bakufu, shogunate"
1510,古今,ここん,noun (common) (futsuumeishi); nouns which may ...,"[ancient and modern times, all ages, past and ...",2.0,11,,古今を通して彼は又とない勇敢な男だ。,He is as brave a man as ever breathed.,彼は古今最上の詩人だ。,He is as great a poet as ever lived.,古今無双の勇士だ。,He is the bravest soldier that ever lived.,"affinity - {0: ['ancient and modern times', 'p...","ancient and modern times, all ages"


## Add hints for duplicate kanji spellings

In [44]:
def create_reading_hints(subgroup):
    """Add a ``hint`` column that tells apart entries sharing the same spelling.

    Args:
        subgroup: DataFrame with the rows of one spelling; must have a ``reading`` column.

    Returns:
        ``subgroup`` itself. With a single row the hint is ``None``; otherwise each row's
        hint lists the readings of the other rows, each prefixed with 'not ' and joined
        with ', '.
    """
    row_labels = list(subgroup.index)
    if len(row_labels) == 1:
        subgroup['hint'] = None
        return subgroup

    for current in row_labels:
        other_readings = ['not ' + subgroup.at[other, 'reading']
                          for other in row_labels if other != current]
        subgroup.at[current, 'hint'] = ', '.join(other_readings)
    return subgroup

In [45]:
# `group_keys=False` keeps the original row index: pandas 2.x would otherwise prepend the
# `kanji` group key to the index of the result.
subset = subset.groupby('kanji', group_keys=False).apply(create_reading_hints)

## Add unique id

In [46]:
def create_compound_id(row):
    """Derive a deterministic id for a compound from its spelling and reading.

    Args:
        row: mapping-like object with the keys ``kanji`` and ``reading``.

    Returns:
        The hexadecimal SHA-1 digest of the UTF-8 encoded text "<kanji> [<reading>]".
    """
    hasher = hashlib.sha1()
    hasher.update(f"{row['kanji']} [{row['reading']}]".encode('utf-8'))
    return hasher.hexdigest()

In [47]:
subset['id'] = subset.apply(create_compound_id, axis=1)

## Add tags

In [48]:
# Bucket the rank quantile for tagging: up to 7 -> 'top_7', above 7 up to 15 -> 'top_15',
# above 15 up to 25 -> 'top_25', anything higher -> 'rest'.
subset['quantile_bin'] = pd.cut(subset['rank_quantile'],
                                bins=[-1, 7, 15, 25, 100],
                                labels=['top_7', 'top_15', 'top_25', 'rest'])

In [49]:
def process_tags(row):
    """Build the space-separated tag string of a flashcard.

    Args:
        row: mapping-like object with the keys ``max_grade`` and ``quantile_bin``.

    Returns:
        'max_grade_<grade> <quantile_bin>'. Whole-number grades are written without a
        decimal part; otherwise the decimal point is replaced by an underscore.
    """
    grade = row['max_grade']
    is_whole = int(grade) == grade
    grade_label = str(int(grade)) if is_whole else str(grade).replace('.', '_')
    return f'max_grade_{grade_label} ' + row['quantile_bin']
    
    
subset['tags'] = subset.apply(process_tags, axis=1)

## Export final results

In [50]:
# Stamp every card with today's date.
subset['last_updated_on'] = str(dt.date.today())

# Fix the set and order of the exported fields. The CSV is written without a header row,
# so the position of each column is what identifies it in the exported file.
subset = subset[['id', 'kanji', 'hint', 'reading', 'part_of_speech', 'english',
                 'remarks', 'max_grade', 'rank_quantile', 'jpn_sentence_1', 'eng_sentence_1',
                 'jpn_sentence_2', 'eng_sentence_2', 'jpn_sentence_3', 'eng_sentence_3', 'raw_meanings', 'clusters',
                 'last_updated_on', 'tags']].copy()



In [51]:
# Sanity checks before exporting: ids must be unique, and no selected compound may have
# been lost or duplicated along the way.
assert not subset['id'].duplicated().any()
assert len(subset) == len(selection)

In [52]:
subset.sample(n=5)

Unnamed: 0,id,kanji,hint,reading,part_of_speech,english,remarks,max_grade,rank_quantile,jpn_sentence_1,eng_sentence_1,jpn_sentence_2,eng_sentence_2,jpn_sentence_3,eng_sentence_3,raw_meanings,clusters,last_updated_on,tags
4019,eac15a4cb72e1b4f74cd5d86877640fdeccca02e,権限,,けんげん,noun (common) (futsuumeishi),"power, authority",,6.0,6,彼らを解雇する権限をくれた。,He gave me authority to fire them.,民衆の熱狂的な彼への支持は、首相の座にとどまりながらも大統領の権限の発揮を可能にしそうだ。,"Having reached the rank of prime minister, and...",彼らは権限を剥奪された。,The men have been shorn of their power.,"[power, authority, jurisdiction]","affinity - {0: ['power'], 1: ['authority', 'ju...",2021-08-16,max_grade_6 top_7
2070,579c51e5e704826f683000db70be3c941d21d9df,大正,,たいしょう,noun (common) (futsuumeishi),"Taishō era (1912.7.30-1926.12.25), Taisho era",,1.0,1,,,,,,,"[Taishō era (1912.7.30-1926.12.25), Taisho era]",dbscan - {},2021-08-16,max_grade_1 top_7
224,cabde55fa9630fce5102a86e08e28df1ec2a6a98,与る,,あずかる,Godan verb with 'ru' ending; intransitive verb,"to take part in, to receive, to enjoy",word usually written using kana alone,8.1,10,クラブの今日の発展にはこれらの人々の努力があずかって大いに力があった。,What this club is today is largely due to the ...,,,,,"[to participate in, to take part in, to play a...","affinity - {0: ['to participate in', 'to take ...",2021-08-16,max_grade_8_1 top_15
2218,e243f825822e676306bbfbfb2938fba4b044666a,姿勢,,しせい,noun (common) (futsuumeishi),"pose, carriage (of the body), stance",,6.0,5,聴衆は謹聴の姿勢で立っていた。,The listeners stood in an attentive attitude.,あまりに長く１つの姿勢で座っていた為に彼の筋肉はずきずきした。,His muscles ached from sitting too long in one...,熊が眠ったり横になるとき、その姿勢は熱を逃したいのか保ちたいのかによる。,"When bears sleep or lie down, their postures d...","[posture, pose, position, stance, carriage (of...","affinity - {0: ['pose'], 1: ['carriage (of the...",2021-08-16,max_grade_6 top_7
1464,49db6e3b9b91b47a95ee843e386ba762f7f1121e,収益,,しゅうえき,noun (common) (futsuumeishi),"returns, revenue",,6.0,10,経営陣は収益の短期的改善に気を取られすぎて、長期的な将来計画に気が回らない傾向があった。,The management tended to be too concerned abou...,会社の収益は飛躍的に増加した。,The company's profits soared.,旅行会社の収益が急増した。,Travel agencies' profits soared.,"[earnings, proceeds, returns, revenue]","affinity - {0: ['returns'], 1: ['earnings', 'p...",2021-08-16,max_grade_6 top_15


In [53]:
len(subset)

6989

In [54]:
# The file name records the grade and quantile limits of the exported set.
file_name = f'{DATA_PATH}/compound_flashcards_lt{MAX_GRADE}_top_{MAX_QUANTILE}_pct.csv'
# Written without the index and without a header row.
subset.to_csv(file_name, index=False, header=False)

file_name

'/Users/glillacci/OneDrive - Tesco/Personal/Data/kanjidata/compound_flashcards_lt8.4_top_15_pct.csv'