In [1]:
# Import Module
import os
# Order CUDA devices by PCI bus ID and make only device index 3 visible
# to this process. Both variables are written unconditionally.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "3",
})

In [4]:
# Import Modules
import json
import time
import pickle
import argparse
import numpy as np
import sentencepiece as spm
from glob import glob
from tqdm import tqdm
from collections import Counter

# Import Custom Modules
from utils import terminal_size, train_test_split

In [5]:
# Preprocessing options, declared argparse-style so the same definitions
# can be shared with a command-line script.
parser = argparse.ArgumentParser(description='Parsing Method')

# Length filter and file locations
parser.add_argument('--max_len', type=int, default=300)
parser.add_argument('--save_path', type=str, default='./save3')
parser.add_argument('--data_path', type=str,
                    default='../joseon_word_embedding/data',
                    help='Crawling data path')
parser.add_argument('--data_split_per', type=float, default=0.2,
                    help='Train / Validation split ratio')

# Special-token indices
parser.add_argument('--pad_idx', type=int, default=0, help='Padding index')
parser.add_argument('--bos_idx', type=int, default=1, help='Start token index')
parser.add_argument('--eos_idx', type=int, default=2, help='End token index')
parser.add_argument('--unk_idx', type=int, default=3, help='Unknown token index')

# Vocabulary size
parser.add_argument('--vocab_size', type=int, default=24000,
                    help='Korean vocabulary size')

# An empty argument list is parsed, so every option takes its default.
args = parser.parse_args([])

In [6]:
#===================================#
#============Data Load==============#
#===================================#

print('Total list making...')
# 1) Path setting: collect the JSON files under data_path in sorted order
data_list = glob(os.path.join(args.data_path, '*.json'))
data_list = sorted(data_list)[:-1]  # drop the last file (the Sunjong appendix)

total_src_list = list()   # source entries (the 'hanja' field)
total_trg_list = list()   # target entries (the 'korean' field), aligned with the sources
total_king_list = list()  # 0-based king id for each kept pair

# 2) Total data making
for data_path in tqdm(data_list):
    # 2-1) Load data. The encoding is given explicitly so the hanja / Korean
    #      text is decoded the same way whatever the platform default is.
    with open(data_path, 'r', encoding='utf-8') as f:
        data_ = json.load(f)
    data_src_list = list()
    data_trg_list = list()
    # 2-2) Keep only the pairs whose source side is at most max_len long
    for x in data_:
        if len(x['hanja']) <= args.max_len:
            data_src_list.append(x['hanja'])
            data_trg_list.append(x['korean'])
    # 2-3) Total data setting
    total_src_list.extend(data_src_list)
    total_trg_list.extend(data_trg_list)
    # 2-4) King list setting: the first two characters of the file name are
    #      read as the king number. Taking them from the base name keeps this
    #      working for any data_path, not only one containing 'data/'.
    king_id = int(os.path.basename(data_path)[:2]) - 1 # Start from 0
    total_king_list.extend([king_id] * len(data_src_list))

#===================================#
#============Data Split=============#
#===================================#

# 1) Train / Test Split: hold out a data_split_per share of the pairs.
#    Sources, targets and king ids are passed together to one call.
# NOTE(review): whether the split is reproducible depends on
# utils.train_test_split, which is not shown here. Confirm it fixes a seed.
split_src_record, split_trg_record, split_king_record = train_test_split(
    total_src_list,
    total_trg_list,
    total_king_list,
    split_percent=args.data_split_per,
)

# 2) Test / Valid Split: cut the held-out part in half. The 'train' key of
#    this second split is reported as the validation set, 'test' as the test set.
split_test_src_record, split_test_trg_record, split_test_king_record = train_test_split(
    split_src_record['test'],
    split_trg_record['test'],
    split_king_record['test'],
    split_percent=0.5,
)

# 3) Report the size of each partition
print('Paired data num:')
for split_name, split_items in (
    ('train', split_src_record['train']),
    ('valid', split_test_src_record['train']),
    ('test', split_test_src_record['test']),
):
    print(f"{split_name}: {len(split_items)}")

0%|          | 0/27 [00:00<?, ?it/s]Total list making...
100%|██████████| 27/27 [00:13<00:00,  1.94it/s]
Paired data num:
train: 253860
valid: 31732
test: 31732


In [7]:
# Number of source entries kept after the max_len filter (all files combined).
len(total_src_list)

317324

In [8]:
# Sanity check: the train / valid / test partitions together must account for
# every kept pair. The sizes are read from the split records rather than typed
# in by hand, so the check stays valid when the data or the options change.
split_sizes = [
    len(split_src_record['train']),
    len(split_test_src_record['train']),  # validation set
    len(split_test_src_record['test']),
]
assert sum(split_sizes) == len(total_src_list), 'split sizes do not add up to the total'
sum(split_sizes)

317324

In [10]:
# len() of every source entry, in the same order as total_src_list.
len_list = list(map(len, total_src_list))

In [16]:
# Mean of the lengths currently held in len_list (built from total_src_list
# in the preceding cell).
sum(len_list) / len(len_list)

72.29334686314303

In [17]:
# len() of every target entry, in the same order as total_trg_list.
# NOTE: this reuses the name len_list, replacing the source lengths computed earlier.
len_list = list(map(len, total_trg_list))

In [18]:
# Mean of the lengths currently held in len_list (rebuilt from total_trg_list
# in the preceding cell).
sum(len_list) / len(len_list)

148.90497724722997