In [54]:
import json
from typing import Any, Tuple
import pathlib
import numpy as np
import re
import random

In [15]:
def containsJapanese(text):
    """Report whether `text` contains at least one Japanese-related character.

    The character class spans U+3000-303F, U+3040-309F, U+30A0-30FF,
    U+FF00-FF9F, U+4E00-9FAF and U+3400-4DBF, so punctuation, kana,
    full-width forms and kanji all count.

    Returns:
        True if any character of `text` falls in those ranges, else False.
    """
    japanese_pattern = r'[\u3000-\u303f\u3040-\u309f\u30a0-\u30ff\uff00-\uff9f\u4e00-\u9faf\u3400-\u4dbf]'
    # A match object is always truthy, so bool() mirrors an "is not None" check.
    return bool(re.search(japanese_pattern, text))

def containsKanji(text):
    """Report whether `text` contains at least one kanji character.

    Kanji are detected as code points in U+4E00-9FAF or U+3400-4DBF.

    Returns:
        True if any character of `text` falls in those ranges, else False.
    """
    kanji_pattern = r'[\u4e00-\u9faf\u3400-\u4dbf]'
    # A match object is always truthy, so bool() mirrors an "is not None" check.
    return bool(re.search(kanji_pattern, text))

def containsHiragana(text):
    """Report whether `text` contains at least one hiragana character.

    Hiragana are detected as code points in U+3040-309F.

    Returns:
        True if any character of `text` falls in that range, else False.
    """
    hiragana_pattern = r'[\u3040-\u309f]'
    # A match object is always truthy, so bool() mirrors an "is not None" check.
    return bool(re.search(hiragana_pattern, text))

def get_state(text):
    """Classify `text` as 'kanji', 'hiragana' or 'other'.

    Kanji takes precedence: text containing both kanji and hiragana is
    reported as 'kanji'. Anything with neither (katakana, Latin, digits,
    the empty string, ...) is 'other'.
    """
    if containsKanji(text):
        return 'kanji'
    if containsHiragana(text):
        return 'hiragana'
    return 'other'

def insert_spaces(text):
    """Insert a space wherever the script class changes between characters.

    Each character is classified with `get_state` ('kanji', 'hiragana' or
    'other'); a single space is emitted between two neighbouring characters
    whose classes differ.

    Args:
        text: The string to segment. May be empty.

    Returns:
        `text` with a space inserted at every class boundary. An empty
        input yields an empty string (previously this raised IndexError
        because the first character was read unconditionally).
    """
    if not text:
        return ""

    pieces = []
    previous_state = None
    for char in text:
        # Classify each character once and reuse the result for the next step.
        current_state = get_state(char)
        if previous_state is not None and current_state != previous_state:
            pieces.append(' ')
        pieces.append(char)
        previous_state = current_state
    # Join once at the end instead of growing a string inside the loop.
    return ''.join(pieces)

In [4]:
# Load the JMdict JSON dump. The encoding is given explicitly so the Japanese
# text is decoded as UTF-8 regardless of the platform's locale default.
with open('JMdict_e.json', encoding='utf-8') as f:
    raw_data = json.load(f)

In [5]:
# Number of top-level items loaded from the JSON file.
len(raw_data)

209106

In [6]:
# Keep only the entries that carry a 'k_ele' key; entries without one are dropped.
filtered_data = [
    entry
    for entry in raw_data
    if 'k_ele' in entry
]

In [7]:
# Number of entries that remain after requiring a 'k_ele' key.
len(filtered_data)

168741

In [8]:
# Show one entry to inspect the structure ('k_ele'/'keb', 'r_ele'/'reb', 'sense').
filtered_data[14]

{'ent_seq': ['1000260'],
 'k_ele': [{'keb': ['悪どい'], 'ke_inf': ['&ateji;']},
  {'keb': ['灰汁どい'], 'ke_inf': ['&rK;']}],
 'r_ele': [{'reb': ['あくどい']}],
 'sense': [{'pos': ['&adj-i;'],
   'xref': ['あくが強い・2'],
   'misc': ['&uk;'],
   'gloss': ['gaudy', 'showy', 'garish', 'loud']},
  {'pos': ['&adj-i;'],
   'misc': ['&uk;'],
   'gloss': ['crooked',
    'vicious',
    'wicked',
    'nasty',
    'unscrupulous',
    'dishonest']}]}

In [22]:
# Build (written form, reading) pairs: every 'k_ele' item of an entry is
# combined with every 'r_ele' item of the same entry, using the first 'keb'
# and the first 'reb' string of each.
# NOTE: the name `dict` shadows the builtin; it is kept because the sample
# generation cell indexes this list under that name.
dict = [
    (k['keb'][0], r['reb'][0])
    for entry in filtered_data
    for k in entry['k_ele']
    for r in entry['r_ele']
]

In [23]:
# Number of (written form, reading) pairs collected.
len(dict)

246863

In [105]:
# Number of training samples to generate.
sample_size = 500000
# Words per sample are drawn from a normal distribution with this mean and
# standard deviation, then rounded to whole numbers.
sample_length_mean = 30
sample_length_sd = 5
sample_n_array = np.random.normal(
    loc=sample_length_mean,
    scale=sample_length_sd,
    size=sample_size,
).round().astype(int)

In [114]:
# Each generated line has the form "<written text>|<reading text>\n": random
# dictionary words glued together with random connectors, where the connectors
# appear identically on both sides of the '|'.
data = []
connectors = ['は','が','な','の','を','か','も','に','で','へ','と','や','から','まで','ね','よ']
for sample_n in sample_n_array:
    # Both sides of a sample start with the same randomly chosen connector.
    leading_connector = random.choice(connectors)
    context_sample_array = [leading_connector]
    target_sample_array = [leading_connector]
    for _ in range(int(sample_n)):
        # Draw the word first, then the connector that follows it.
        written, reading = random.choice(dict)
        connector = random.choice(connectors)
        context_sample_array += [written, connector]
        target_sample_array += [reading, connector]
    context_sample = ''.join(context_sample_array)
    target_sample = ''.join(target_sample_array)
    data.append(context_sample + '|' + target_sample + '\n')

In [115]:
# Show the first generated line: written text, '|', reading text, newline.
data[0]

'は荷が重いね徴税吏を大腸炎や薺な保障付ね生活態度の複素環基も確認要で格好をつけると被裏書人か紅色球形形象物よ合成石油は商売仲間は大作へ有界ね貝おおいよせせり箸までかぎ鼻へ釣果から高張り提灯から|はにがおもいねちょうぜいりをだいちょうえんやなずななほしょうつきねせいかつたいどのふくそかんきもかくにんようでかっこをつけるとひうらがきにんかべにいろきゅうけいけいしょうぶつよごうせいせきゆはしょうばいなかまはたいさくへゆうかいねかいおいよせせりばしまでかぎばなへちょうかからたかはりぢょうちんから\n'

In [116]:
local_path = 'furigana_training_data.txt'
# Write as UTF-8 explicitly: the loader decodes this file as UTF-8, so relying
# on the locale default here could produce a file it cannot read back.
# The context manager closes the file even if the write fails.
with open(local_path, 'w', encoding='utf-8') as output:
    output.writelines(data)

In [71]:
def load_data(path):
    """Read '|'-separated pairs from a UTF-8 text file into two parallel arrays.

    Every non-blank line must have the form ``first|second``.

    Args:
        path: Object exposing ``read_text(encoding=...)`` (e.g. a
            ``pathlib.Path``) that points at the pair file.

    Returns:
        A 2-tuple of NumPy arrays ``(context, target)``: the text before the
        '|' of each line, then the text after it, in file order. This is the
        same order the previous implementation returned (its local names were
        swapped, which made the return statement look inverted).

    Raises:
        ValueError: If a non-blank line does not contain exactly one '|'.
    """
    text = path.read_text(encoding='utf-8')

    contexts = []
    targets = []
    for line_number, line in enumerate(text.splitlines(), start=1):
        # Blank lines carry no pair; skip them instead of failing to unpack.
        if not line.strip():
            continue
        fields = line.split('|')
        if len(fields) != 2:
            raise ValueError(
                "line %d: expected exactly one '|' separator, found %d"
                % (line_number, len(fields) - 1)
            )
        contexts.append(fields[0])
        targets.append(fields[1])

    return np.array(contexts), np.array(targets)

In [74]:
# Read the generated file back. load_data returns the first '|'-separated field
# of every line, then the second, so `context` holds the written text and
# `target` holds the readings.
path_to_file = pathlib.Path(local_path)
context, target = load_data(path_to_file)

〃|おなじ


'〃'