In [1]:
import json
from typing import Any, Tuple
import pathlib
import numpy as np
import re
import random
import pykakasi

In [2]:
def containsJapanese(text):
    """Return True if ``text`` has at least one character in the ranges below.

    The character class covers U+3000-303F (CJK symbols and punctuation),
    U+3040-309F (hiragana), U+30A0-30FF (katakana), U+FF00-FF9F (fullwidth
    forms and halfwidth katakana), U+4E00-9FAF and U+3400-4DBF (CJK
    ideographs). Note that fullwidth Latin letters therefore also match.
    """
    japanese_chars = r'[\u3000-\u303f\u3040-\u309f\u30a0-\u30ff\uff00-\uff9f\u4e00-\u9faf\u3400-\u4dbf]'
    return bool(re.search(japanese_chars, text))

def containsKanji(text):
    """Return True if ``text`` has at least one CJK ideograph.

    Matches code points U+4E00-9FAF and U+3400-4DBF only; kana, punctuation
    and Latin text do not count.
    """
    kanji_chars = r'[\u4e00-\u9faf\u3400-\u4dbf]'
    return bool(re.search(kanji_chars, text))

def containsHiragana(text):
    """Return True if ``text`` has at least one character in U+3040-309F (hiragana)."""
    hiragana_chars = r'[\u3040-\u309f]'
    return bool(re.search(hiragana_chars, text))

def get_state(text):
    """Classify ``text`` as 'kanji', 'hiragana' or 'other'.

    Kanji takes priority: text containing both kanji and hiragana is
    reported as 'kanji'. Anything with neither (katakana, Latin, empty
    string, ...) is 'other'.
    """
    if containsKanji(text):
        return 'kanji'
    if containsHiragana(text):
        return 'hiragana'
    return 'other'

def insert_spaces(text):
    """Insert a space wherever the script class changes between characters.

    Each character is classified with ``get_state`` ('kanji', 'hiragana' or
    'other'); a single space is emitted before every character whose class
    differs from the previous character's. For example '食べる' becomes
    '食 べる'.

    Args:
        text: The string to segment.

    Returns:
        The segmented string. Empty input yields an empty string (the
        previous implementation raised IndexError on ``text[0]``).
    """
    if not text:
        return ""

    pieces = []
    state = get_state(text[0])
    for char in text:
        # Classify once per character and reuse the result.
        char_state = get_state(char)
        if char_state != state:
            state = char_state
            pieces.append(' ')
        pieces.append(char)
    # Join once at the end instead of repeated string concatenation.
    return ''.join(pieces)

def random_hiragana():
    """Return one hiragana character picked at random from a fixed set.

    The set holds 81 characters (ぁ through ゔ, small forms included, without
    ゎ, ゐ and ゑ). The pick uses the module-level ``random`` generator, so it
    is affected by ``random.seed``.
    """
    hiragana_chars = (
        'ぁあぃいぅうぇえぉお'
        'かがきぎくぐけげこご'
        'さざしじすずせぜそぞ'
        'ただちぢっつづてでとど'
        'なにぬねの'
        'はばぱひびぴふぶぷへべぺほぼぽ'
        'まみむめも'
        'ゃやゅゆょよ'
        'らりるれろ'
        'わをん'
        'ゔ'
    )
    return random.choice(hiragana_chars)

In [3]:
# Load the JMdict dictionary dump. The encoding is given explicitly because
# the file contains Japanese text and open() would otherwise fall back to the
# platform's default encoding, which is not UTF-8 everywhere.
with open('JMdict_e.json', encoding='utf-8') as f:
    raw_data = json.load(f)

In [4]:
# Count the top-level items loaded from the JSON file.
len(raw_data)

209106

In [5]:
# Keep only the entries that carry a 'k_ele' key, i.e. drop every entry that
# has no kanji element.
filtered_data = [entry for entry in raw_data if 'k_ele' in entry]

In [6]:
# Number of entries that remain after filtering on the 'k_ele' key.
len(filtered_data)

168741

In [7]:
# Inspect one entry to see its structure ('k_ele', 'r_ele' and 'sense' keys).
filtered_data[14]

{'ent_seq': ['1000260'],
 'k_ele': [{'keb': ['悪どい'], 'ke_inf': ['&ateji;']},
  {'keb': ['灰汁どい'], 'ke_inf': ['&rK;']}],
 'r_ele': [{'reb': ['あくどい']}],
 'sense': [{'pos': ['&adj-i;'],
   'xref': ['あくが強い・2'],
   'misc': ['&uk;'],
   'gloss': ['gaudy', 'showy', 'garish', 'loud']},
  {'pos': ['&adj-i;'],
   'misc': ['&uk;'],
   'gloss': ['crooked',
    'vicious',
    'wicked',
    'nasty',
    'unscrupulous',
    'dishonest']}]}

In [8]:
# Build a lookup from written form to reading. For every kanji element of an
# entry, the first 'keb' string is the key and the first 'reb' string of the
# entry's first reading element is the value.
data_dict = {}
for entry in filtered_data:
    for kanji_element in entry['k_ele']:
        # setdefault keeps the reading from the first entry that introduced
        # this written form; later duplicates are ignored.
        data_dict.setdefault(kanji_element['keb'][0], entry['r_ele'][0]['reb'][0])

In [11]:
# Spot-check a single lookup: the reading stored for this written form.
data_dict['何で']

'なんで'

In [None]:
# Print the first 50 (written form, reading) pairs in insertion order.
for key, reading in list(data_dict.items())[:50]:
    print(key, reading)

In [None]:
# Seed the module-level RNG so that the generated training set is the same on
# every Restart & Run All. Change RANDOM_SEED to draw a different dataset.
RANDOM_SEED = 0
random.seed(RANDOM_SEED)

# Total number of training lines to generate.
num_samples = 5_000_000

# Number of dictionary words concatenated per sample, with relative weights:
# 1 word 85% of the time, 2 words 10%, 3 words 5%.
sample_n_options = [1, 2, 3]
sample_n_weights = [85, 10, 5]
sample_n_list = random.choices(sample_n_options, weights=sample_n_weights, k=num_samples)

In [None]:
# Generate one "<written form>|<reading>\n" line per element of sample_n_list.
# Each line concatenates randomly chosen dictionary words and, with
# probability 0.5 each, a random hiragana character before and/or after them.
# The hiragana filler is copied unchanged to both sides of the '|'.
data_list = []
dict_length = len(data_dict)
dict_keys = list(data_dict.keys())
for word_count in sample_n_list:
    # The order of the random calls is: leading filler, words, trailing filler.
    leading = random_hiragana() if random.random() < 0.5 else ''
    words = [dict_keys[random.randrange(dict_length)] for _ in range(word_count)]
    trailing = random_hiragana() if random.random() < 0.5 else ''

    written_side = leading + ''.join(words) + trailing
    reading_side = leading + ''.join(data_dict[word] for word in words) + trailing
    data_list.append(written_side + '|' + reading_side + '\n')

In [None]:
# Number of generated lines (one is appended per element of sample_n_list).
len(data_list)

In [None]:
# Preview the last ten generated lines.
data_list[-10:]

In [None]:
# Write the generated lines to disk. The file is opened as UTF-8 explicitly
# because it is read back with encoding='utf-8'; relying on the platform
# default could produce a file that fails to decode. The with-block also
# guarantees the handle is closed if writing raises.
local_path = 'furigana_training_data.txt'
with open(local_path, 'w', encoding='utf-8') as output:
    # Every element of data_list already ends with '\n'.
    output.writelines(data_list)

In [None]:
def load_data(path):
    """Read '|'-separated pairs from a UTF-8 text file into two NumPy arrays.

    Args:
        path: A ``pathlib.Path``-like object exposing ``read_text``.

    Returns:
        A tuple ``(left, right)`` of ``np.ndarray``: ``left`` holds the text
        before the '|' on each line, ``right`` the text after it, in file
        order.

    Raises:
        ValueError: If a line does not split into exactly two parts on '|'.
    """
    left_column = []
    right_column = []
    for line in path.read_text(encoding='utf-8').splitlines():
        left, right = line.split('|')
        left_column.append(left)
        right_column.append(right)
    return np.array(left_column), np.array(right_column)

In [None]:
# load_data returns (text before '|', text after '|'). The file was written as
# "<written form>|<reading>", so `context` receives the written forms and
# `target` the readings.
path_to_file = pathlib.Path(local_path)
context, target = load_data(path_to_file)