In [1]:
# Import Modules
import os
import json
import time
import pickle
import argparse
import numpy as np
import sentencepiece as spm
from glob import glob
from tqdm import tqdm
from collections import Counter

# Import Custom Modules
from utils import terminal_size, train_test_split

In [2]:
parser = argparse.ArgumentParser(description='Parsing Method')
parser.add_argument('--max_len', default=300, type=int)
parser.add_argument('--save_path', default='./save', 
                    type=str)
parser.add_argument('--data_path', default='../joseon_word_embedding/data', 
                    type=str, help='Crawling data path')
parser.add_argument('--data_split_per', default=0.2, type=float,
                    help='Train / Validation split ratio')
parser.add_argument('--pad_idx', default=0, type=int, help='Padding index')
parser.add_argument('--bos_idx', default=1, type=int, help='Start token index')
parser.add_argument('--eos_idx', default=2, type=int, help='End token index')
parser.add_argument('--unk_idx', default=3, type=int, help='Unknown token index')
parser.add_argument('--vocab_size', default=32000, type=int, help='Korean vocabulary size')
args = parser.parse_args(list())

In [3]:
#===================================#
#============Data Load==============#
#===================================#

print('Total list making...')
# 1) Path setting
data_list = glob(os.path.join(args.data_path, '*.json'))
data_list = sorted(data_list)[:-1] # 순종부록 제거

total_src_list = list()
total_trg_list = list()
total_king_list = list()

# 2) Total data making
for data_path in tqdm(data_list):
    # 2-1) Load data
    with open(data_path, 'r') as f:
        data_ = json.load(f)
    data_src_list = list()
    data_trg_list = list()
    # 2-2) Extract string data by length
    for x in data_:
        if len(x['hanja']) <= args.max_len:
            data_src_list.append(x['hanja'])
            data_trg_list.append(x['korean'])
    # 2-3) Total data setting
    total_src_list.extend(data_src_list)
    total_trg_list.extend(data_trg_list)
    # 2-4) King list setting
    king_id = int(data_path.split('data/')[1][:2]) - 1 # Start from 0
    total_king_list.extend([king_id for _ in range(len(data_src_list))])

#===================================#
#============Data Split=============#
#===================================#

split_src_record, split_trg_record, split_king_record = train_test_split(
    total_src_list, total_trg_list, total_king_list, split_percent=args.data_split_per)

print('Paired data num:')
print(f"train: {len(split_src_record['train'])}")
print(f"test: {len(split_src_record['test'])}")

#====================================#
#==========DWE Results Open==========#
#====================================#

with open(os.path.join(args.save_path, 'hj_word2id.pkl'), 'rb') as f:
    hanja_word2id = pickle.load(f)

  7%|▋         | 2/27 [00:00<00:01, 19.44it/s]

Total list making...


100%|██████████| 27/27 [00:16<00:00,  1.66it/s]


Paired data num:
train: 253860
test: 63464


In [6]:
hanja_word2id

{'<pad>': 0,
 '<s>': 1,
 '</s>': 2,
 '<unk>': 3,
 '夙': 4,
 '駕': 5,
 ' ': 6,
 '獵': 7,
 '于': 8,
 '西': 9,
 '郊': 10,
 '命': 11,
 '有': 12,
 '司': 13,
 '攝': 14,
 '事': 15,
 '宗': 16,
 '廟': 17,
 '憲': 18,
 '府': 19,
 '大': 20,
 '李': 21,
 '舒': 22,
 '等': 23,
 '上': 24,
 '疏': 25,
 '請': 26,
 '勿': 27,
 '輕': 28,
 '改': 29,
 '法': 30,
 '令': 31,
 '數': 32,
 '宥': 33,
 '罪': 34,
 '人': 35,
 '北': 36,
 '面': 37,
 '都': 38,
 '節': 39,
 '制': 40,
 '使': 41,
 '崔': 42,
 '永': 43,
 '沚': 44,
 '執': 45,
 '送': 46,
 '挈': 47,
 '家': 48,
 '來': 49,
 '投': 50,
 '一': 51,
 '名': 52,
 '善': 53,
 '吹': 54,
 '簫': 55,
 '曰': 56,
 '平': 57,
 '申': 58,
 '嚴': 59,
 '金': 60,
 '銀': 61,
 '彩': 62,
 '段': 63,
 '之': 64,
 '禁': 65,
 '視': 66,
 '朝': 67,
 '以': 68,
 '天': 69,
 '變': 70,
 '撤': 71,
 '樂': 72,
 '鵓': 73,
 '鴿': 74,
 '三': 75,
 '雙': 76,
 '日': 77,
 '本': 78,
 '國': 79,
 '九': 80,
 '州': 81,
 '度': 82,
 '源': 83,
 '了': 84,
 '俊': 85,
 '從': 86,
 '其': 87,
 '也': 88,
 '罷': 89,
 '將': 90,
 '軍': 91,
 '崇': 92,
 '職': 93,
 '囚': 94,
 '前': 95,
 '典': 96,
 '書': 97,
 '千': 98,
 '具': 9

In [8]:
index

'諫院 【正言李渭輔。 】 申前啓, 不允。 又啓: "各司官稍有勢力者, 每當輪對之次, 或恐醜拙之露, 圖囑規免, 只使鄕曲疲殘之類, 苟且充數, 所管弊瘼, 不敢陳奏。 請令該曹更加修整, 輪次入對, 申明法意。 " 又啓: "躁競之風, 武弁尤甚。 別薦之目, 大開驟陞之捷路, 乳臭之兒, 遽躐閫薦, 矇騃之類, 遍置剡案。 請嚴飭三軍門, 申明勸武舊制, 俾杜濫薦。 " 從之。'

In [7]:
#===================================#
#=======Hanja Pre-processing========#
#===================================#

# 1) Hanja sentence parsing setting
print('Hanja sentence parsing...')
start_time = time.time()
hj_parsed_indices_train = list()
hj_parsed_indices_test = list()

# 2) Parsing sentence
# 2-1) Train data parsing
print('Train data start...')
for index in tqdm(split_src_record['train']):
    parsed_index = list()
    parsed_index.append(args.bos_idx) # Start token add
    for ind in index:
        try:
            parsed_index.append(hanja_word2id[ind])
        except KeyError:
            parsed_index.append(hanja_word2id['<unk>'])
    parsed_index.append(args.eos_idx) # End token add
    hj_parsed_indices_train.append(parsed_index)

  0%|          | 0/253860 [00:00<?, ?it/s]

Hanja sentence parsing...
Train data start...



