In [58]:
import json
import re
import pandas as pd
import nltk
import numpy as np
from glob import glob
from tqdm import tqdm
import time
from nltk import sent_tokenize

In [3]:
def clean_text(text):
    """Normalize bracketed headers, quotes and spacing in a piece of text.

    Steps, in order:
      1. Every 【...】 span (without nested brackets) is rewritten as
         " [...]:" with all whitespace inside the brackets removed.
      2. ASCII apostrophes become double quotes, then curly single quotes
         (U+2018 / U+2019) become ASCII apostrophes.
      3. Runs of whitespace collapse to a single space.
      4. The space after every ". " is dropped, and a single space is put
         back after a period only where a Hangul character sits directly
         before it (and a non-space follows) or directly after it.
      5. Leading and trailing whitespace is stripped.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # The class excludes only the two bracket characters. The previous
    # pattern also excluded a literal "^", so headers containing a caret
    # were silently left untouched.
    for header in re.findall(r"【[^【】]*】", text):
        compact = re.sub(r"\s+", "", header)  # raw string: "\s" is not a valid str escape
        compact = compact.replace("【", "[").replace("】", "]")
        text = text.replace(header, " " + compact + ":")

    # Order matters: straight apostrophes are converted first, so the
    # apostrophes produced from curly quotes below are kept as apostrophes.
    text = text.replace("'", '"')
    text = text.replace("\u2018", "'")
    text = text.replace("\u2019", "'")

    text = re.sub(r"\s+", " ", text)

    # Remove the space after periods everywhere, then restore it only next
    # to Hangul so that e.g. "A. B" becomes "A.B" but Korean sentences stay
    # separated.
    text = text.replace(". ", ".")
    text = re.sub(r"([ㄱ-ㅎㅏ-ㅣ가-힣])\.([\S])", r"\1. \2", text)
    text = re.sub(r"([\S])\.([ㄱ-ㅎㅏ-ㅣ가-힣])", r"\1. \2", text)

    return text.strip()

In [4]:
# Parse the precedent JSON file. The context manager closes the file
# handle, which a bare json.load(open(...)) leaves open.
with open("data/판례/판례.json", "r", encoding="utf-8") as f:
    data = json.load(f)

In [6]:
# Accumulator for cleaned precedent strings; filled by the loop in the next cell.
precedent_data = []

In [7]:
# Predicates whose first value is harvested as training text.
wanted_keys = (
    "http://www.aihub.or.kr/kb/law/precedentText",
    "http://www.aihub.or.kr/kb/law/judgementNote",
)

for precedent_id, precedent in data.items():
    for key, value in precedent.items():
        if key not in wanted_keys:
            continue
        # Only the first entry of the value list is used.
        precedent_data.append(clean_text(value[0]["value"]))
        # Report ids that carry additional entries which are being ignored.
        if len(value) > 1:
            print(precedent_id)

In [8]:
# Persist the cleaned precedent texts, one per line.
with open("data/precedent_data.txt", "w", encoding="utf-8") as f:
    f.writelines(row + "\n" for row in precedent_data)

In [10]:
# Read the spreadsheet and copy its "content" column into a plain Python list.
df = pd.read_excel("data/상담데이터/law_talk_question_all.xlsx")
consulting_data = list(df["content"])

In [12]:
# Keep only string rows. Building a new list avoids calling remove() while
# iterating the same list, which skips the element that follows every
# removal and therefore leaves consecutive non-string rows behind.
consulting_data = [row for row in consulting_data if isinstance(row, str)]

In [13]:
# Flatten every row onto a single line: newlines and tabs become spaces,
# whitespace runs collapse to one space, and the ends are trimmed.
# Slice assignment keeps the update in place on the existing list object.
consulting_data[:] = [
    re.sub(r"\s+", " ", row.replace("\n", " ").replace("\t", " ")).strip()
    for row in consulting_data
]

In [14]:
# Persist the normalized consultation rows, one per line.
with open("data/consulting_data.txt", "w", encoding="utf-8") as f:
    f.writelines(row + "\n" for row in consulting_data)

In [20]:
# Merge every top-level .txt corpus file under data/ into one list of rows.
training_data = []
# glob() returns paths in arbitrary, filesystem-dependent order; sorting
# keeps the merged corpus (and the seeded shuffle applied to it later)
# reproducible across machines and runs.
for file in sorted(glob("data/*.txt")):
    with open(file, "r", encoding="utf-8") as f:
        # The text after the final "\n" is dropped: an empty string for
        # files whose every row ends with a newline.
        training_data += f.read().split("\n")[:-1]

In [21]:
# Write the merged corpus, one row per line.
with open("data/gpt/data.txt", "w", encoding="utf-8") as f:
    f.writelines(row + "\n" for row in training_data)

In [23]:
from sentencepiece import SentencePieceProcessor as sp

In [24]:
# Build a SentencePiece processor from the model file at save/kogpt2.sp
# (presumably the KoGPT2 vocabulary - confirm against the training setup).
tokenizer = sp("save/kogpt2.sp")

In [25]:
# Accumulators for the chunking loop: the produced text chunks and the
# running total of tokens they contain.
new_data = []
num_tokens = 0

In [26]:
# A row is skipped when its piece count + 1 is <= min_token_length.
min_token_length = 5
# Upper bound on the number of pieces packed into a single chunk.
max_token_length = 512

In [27]:
# Reload the merged corpus as a list of rows. The file ends with a newline,
# so the last element is an empty string.
with open("data/gpt/data.txt", encoding="utf-8") as corpus_file:
    data = corpus_file.read().split("\n")

In [60]:
# Pack the sentences of every row into chunks of at most max_token_length
# SentencePiece pieces, then write the chunks to disk.

# Re-initialise the accumulators here so that re-running this cell does not
# append a second copy of the corpus to new_data or double num_tokens.
new_data = []
num_tokens = 0

start = time.time()
for row in tqdm(data, total=len(data)):
    # Skip rows that are too short (the +1 presumably reserves room for a
    # special token - TODO confirm).
    if len(tokenizer.EncodeAsPieces(row)) + 1 <= min_token_length:
        continue

    tokens = []
    for sent in sent_tokenize(row):
        tokens_ = tokenizer.EncodeAsPieces(sent)
        if len(tokens) + len(tokens_) <= max_token_length:
            tokens += tokens_
            continue
        # The sentence does not fit: flush what has been gathered so far.
        # The buffer is empty when the very first sentence of a row is
        # already longer than max_token_length; flushing it would write a
        # blank sample, so skip the flush in that case.
        if tokens:
            new_data.append(clean_text(tokenizer.decode(tokens)))
            num_tokens += len(tokens)
        # Start the next chunk with this sentence. NOTE: a sentence longer
        # than max_token_length is truncated and its tail is discarded.
        tokens = tokens_[:max_token_length]

    # Flush the remainder of the row.
    if tokens:
        new_data.append(clean_text(tokenizer.decode(tokens)))
        num_tokens += len(tokens)

with open("data/gpt/data_processed.txt", "w", encoding="utf-8") as f:
    for row in new_data:
        f.write(row + "\n")
end = time.time()
print(end - start)

100%|██████████| 148359/148359 [19:25<00:00, 127.31it/s]


1168.4680650234222


In [61]:
# Total number of SentencePiece pieces accumulated by the chunking cell.
num_tokens

125272793

In [62]:
# Number of text chunks accumulated by the chunking cell.
len(new_data)

356398

In [63]:
import random
# Fixed seed so the shuffle below is repeatable from a fresh kernel.
# NOTE: shuffle() reorders new_data in place, so running this cell a second
# time shuffles the already-shuffled list and yields a different order.
random.seed(1)
random.shuffle(new_data)

In [64]:
# Hold out the last dev_size shuffled samples for validation and write the
# rest as training data.
dev_size = 30000

# With too few samples new_data[:-dev_size] would be empty and the training
# file would silently be written with no content.
if len(new_data) <= dev_size:
    raise ValueError(
        f"need more than {dev_size} samples to split, got {len(new_data)}"
    )

train_rows = new_data[:-dev_size]
dev_rows = new_data[-dev_size:]

# encoding is given explicitly: open() otherwise falls back to the locale's
# preferred encoding, which is not guaranteed to be UTF-8 and would make
# these files inconsistent with the other corpus files written as UTF-8.
# The with-block closes each file, so no explicit close() is needed.
with open("data/gpt/train_data.txt", "w", encoding="utf-8") as f:
    for row in train_rows:
        f.write(row + "\n")
with open("data/gpt/dev_data.txt", "w", encoding="utf-8") as f:
    for row in dev_rows:
        f.write(row + "\n")