In [None]:
from transformers import GPT2Model, GPT2Config

# Load the GPT-2 hyper-parameters from the local JSON file.
# (The original cell first built a default GPT2Config() that was thrown away
# without ever being used.)
configuration = GPT2Config.from_json_file('config.json')

# Build a model (random weights, nothing pretrained is loaded) from that configuration.
model = GPT2Model(config=configuration)

# Read the configuration back from the model so later cells inspect what the model holds.
configuration = model.config

In [None]:
# Show the repr of `model` as the cell output.
model

In [None]:
# Show the repr of `configuration` as the cell output.
configuration

In [2]:
import json
import logging
import math
import os
from argparse import ArgumentParser
from typing import List, NamedTuple, Optional, Tuple

import numpy as np
import pandas as pd
import torch
from tokenizers import SentencePieceBPETokenizer
from torch.optim import Adam
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm_notebook
from transformers import GPT2LMHeadModel, get_linear_schedule_with_warmup
from transformers import PreTrainedTokenizerFast

from korquad_qg.config import QGConfig
from korquad_qg.utils import TqdmLoggingHandler

# 코쿼드 데이터셋을 이용한 GPT2 데이터셋 생성

In [4]:
# Sanity check: load the raw KorQuAD training file and print the number of
# top-level entries in the parsed JSON.
# The encoding is given explicitly so the Korean text does not depend on the
# platform's default codec.
with open('KorQuAD_v1.0_train.json', 'r', encoding='utf-8') as f:
    json_data = json.load(f)
print(len(json_data))

2


In [3]:
# Flatten KorQuAD into one (context, answer, question) row per question and
# save the result as CSV.
# The encoding is given explicitly so the Korean text does not depend on the
# platform's default codec.
with open('KorQuAD_v1.0_train.json', 'r', encoding='utf-8') as f:
    json_data = json.load(f)

context_list = []
answers_list = []
question_list = []

# Loop variables are named after what they hold; the original reused `data`
# for both the article being iterated and the final column dict.
for article in json_data['data']:
    for paragraph in article['paragraphs']:
        context = paragraph['context']
        for qa in paragraph['qas']:
            # The paragraph text is repeated for every question asked about it.
            context_list.append(context)
            # Only the first listed answer is kept.
            answers_list.append(qa['answers'][0]['text'])
            question_list.append(qa['question'])
            # Surface questions that carry more than one answer.
            if len(qa['answers']) > 1:
                print(qa['answers'])

data = {
        'context':context_list,
        'answers': answers_list,
        'question':question_list
    }
df = pd.DataFrame(data)
# NOTE(review): the input is the *train* split but the output file is named
# "dev" - confirm the intended file name before relying on it.
df.to_csv('KorQuad_dev_V1.csv',index=False)

# 토크나이저 생성

In [None]:
# Special-token strings for the tokenizer-training stage.
# PAD, BOS, EOS and MASK are passed to the tokenizer trainer in the next cell.
# NOTE: a later cell redefines these names and sets BOS to '</s>' instead of '<s>'.
Q_TKN = '<q>'
A_TKN = '<a>'
BOS = '<s>'
EOS = '</s>'
MASK = '<mask>'
C_TKN = '<c>'
PAD = '<pad>'

In [None]:
# Read the corpus (presumably MeCab-preprocessed, judging by the file name),
# stripping newlines and the '##' sub-word markers from every line.
# `with` guarantees the handle is closed; the original never closed it.
with open("after_mecab.txt", 'r', encoding='utf-8') as txt_f:
    all_dict = [
        line.replace('\n', '').replace('##', '')
        for line in tqdm_notebook(txt_f.readlines())
    ]
count = len(all_dict)
print(count)

# Keep a copy of the cleaned sentences, one per line.
with open("data_save_test.txt", 'w', encoding='utf-8') as f:
    for sentence in all_dict:
        f.write(sentence + '\n')

# Initialize a tokenizer
tokenizer = SentencePieceBPETokenizer()

# Then train it. The corpus is handed over as a single batch (a list holding
# one list of sentences).
tokenizer.train_from_iterator([all_dict], vocab_size=3000000, min_frequency=1, limit_alphabet=100000,
            special_tokens=[PAD, BOS, EOS, MASK,'<unk>', '<q>', '<a>', '<c>'])

# Persist both the single-file JSON form and the vocab/merges files.
tokenizer.save("./history_Korquad_tokenizer.json",pretty=True)
tokenizer.save_model(directory='./')

In [None]:
# Scan the sentence file and track the largest tokenised length that exceeds
# the starting budget of 512. `count` is the number of times the running
# maximum was raised.
max_len = 512
count = 0
tok = PreTrainedTokenizerFast(tokenizer_file='Tokenizer/History/history_tokenizer.json')
with open("sentences.txt", 'r', encoding='utf-8') as txt_f:
    for line in tqdm_notebook(txt_f.readlines()):
        line = line.replace('\n', '').replace('##', '')
        # + 7: presumably head-room for the special tokens added later - TODO confirm.
        token_len = len(tok.tokenize(line)) + 7
        if token_len > max_len:
            # The original stored len(line) (a character count) here, mixing
            # units with the token-count comparison above.
            max_len = token_len
            count += 1
print(max_len, count)

In [None]:
# Load the trained tokenizer from its JSON file.
# NOTE(review): the path contains a space after "Tokenizer/" - looks like a typo; confirm the directory name.
tok = PreTrainedTokenizerFast(tokenizer_file='Tokenizer/ History_Korquad_Tokenizer/history_Korquad_tokenizer.json')

In [None]:
# Look up the vocabulary id of the "<s>" token as the cell output.
tok.convert_tokens_to_ids("<s>")

# 데이터셋 생성

In [None]:
# Special-token strings for the dataset stage; these rebind the names defined
# in the tokenizer-training section.
Q_TKN = '<q>'
A_TKN = '<a>'
BOS = '</s>'  # same string as EOS here (the tokenizer-training cell used '<s>')
EOS = '</s>'
MASK = '<mask>'
C_TKN = '<c>'
PAD = '<pad>'
# Module-level tokenizer shared by load_korquad_dataset and HistoryQGDataset.
TOKENIZER = PreTrainedTokenizerFast(tokenizer_file='Tokenizer/tokenizer.json',
            bos_token=BOS, eos_token=EOS, unk_token='<unk>', 
            pad_token=PAD, mask_token=MASK)

# 데이터 셋 클래스

In [None]:
# Pair of tensors; not referenced by any other code visible in this notebook.
GPTDecodingInputType = Tuple[torch.Tensor, torch.Tensor]
# Batched (input_ids, attention_mask, labels) tensors, as returned by dynamic_padding_collate_fn.
GPTInputsType = Tuple[torch.Tensor, torch.Tensor, torch.Tensor]
# Per-example (input_ids, attention_mask, labels) lists, as consumed by dynamic_padding_collate_fn.
GPTFeaturesType = Tuple[List[int], List[float], List[int]]

class QAExample(NamedTuple):
    """One question-generation example: a context passage, an answer, and optionally its question."""

    # Passage text.
    context: str
    # Answer text.
    answer: str
    # Question text; defaults to None when no question is supplied.
    question: Optional[str] = None

def load_korquad_dataset(dataset_path: str, max_len: int = 512) -> List[QAExample]:
    """Load (context, answer, question) examples from a CSV file.

    The CSV must have ``context``, ``answers`` and ``question`` columns.
    Rows whose tokenised context plus 10 reserved positions would exceed
    ``max_len`` are dropped.

    Args:
        dataset_path: Path of the CSV file to read.
        max_len: Maximum sequence length in tokens (default 512, the value that
            was previously hard-coded).

    Returns:
        The examples that fit, in file order.
    """
    # Read every cell as literal text: with pandas' default NA inference,
    # values such as "None", "NA" or "null" would silently become float NaN.
    document = pd.read_csv(dataset_path, dtype=str, keep_default_na=False)

    examples = []
    # Iterate by position rather than by index label.
    rows = zip(document["context"], document["answers"], document["question"])
    for context, answer, question in tqdm_notebook(rows, total=len(document)):
        # 10 positions are held back on top of the context tokens.
        if len(TOKENIZER.tokenize(context)) + 10 <= max_len:
            examples.append(QAExample(context, answer, question))

    return examples
    
def dynamic_padding_collate_fn(features: List[GPTFeaturesType], pad_token_id: int = 0) -> GPTInputsType:
    """Pad a batch of variable-length examples to the longest one and stack it.

    Args:
        features: Per-example ``(input_ids, attention_mask, labels)`` sequences.
        pad_token_id: Id used to right-pad ``input_ids``. Defaults to 0, the
            value that was previously hard-coded.

    Returns:
        ``(input_ids, attention_mask, labels)`` tensors, each with one row per
        example and as many columns as the longest ``input_ids``.

    Raises:
        ValueError: If ``features`` is empty.
    """
    if not features:
        raise ValueError("cannot collate an empty batch")

    max_seq_len = max(len(ids) for ids, _, _ in features)
    input_ids, attention_mask, labels = [], [], []

    for ids, mask, label in features:
        # list(...) copies, so the caller's sequences are never mutated.
        input_ids.append(list(ids) + [pad_token_id] * (max_seq_len - len(ids)))
        # Padded positions get attention 0.0 ...
        attention_mask.append(list(mask) + [0.0] * (max_seq_len - len(mask)))
        # ... and label -100.
        labels.append(list(label) + [-100] * (max_seq_len - len(label)))

    return torch.tensor(input_ids), torch.tensor(attention_mask), torch.tensor(labels)

In [None]:

class HistoryQGDataset(Dataset):
    """Fixed-length question-generation dataset built on the module-level TOKENIZER.

    Each item is the token sequence ``<c>context BOS <a>answer BOS <q>question EOS``,
    padded or truncated to ``max_len``, together with a 0/1 mask that is 1 on
    the question positions and a label sequence in which the context/answer
    positions hold the mask token.
    """

    def __init__(self, examples: List[QAExample], max_len=32):
        """Store the examples and bind the special-token strings.

        Args:
            examples: Examples to serve.
            max_len: Fixed length of every returned sequence.
        """
        self.data = examples
        # True until the first item has been logged.
        self.first = True
        self.c_token = C_TKN
        self.a_token = A_TKN
        self.q_token = Q_TKN
        self.bos = BOS
        self.eos = EOS
        self.mask = MASK
        self.pad = PAD
        self.max_len = max_len
        self.tokenizer = TOKENIZER

    def __len__(self) -> int:
        """Return the number of examples."""
        return len(self.data)

    def __getitem__(self, idx)-> GPTFeaturesType:
        """Return ``(token_ids, mask, labels_ids)`` for example ``idx``.

        ``token_ids`` and ``labels_ids`` are lists padded with the tokenizer's
        pad id up to ``max_len``; ``mask`` is a NumPy array of length ``max_len``.
        """
        turn = self.data[idx]
        c = turn.context
        a = turn.answer
        q = turn.question

        ca_toked = self.tokenizer.tokenize(self.c_token + c + self.bos + self.a_token + a + self.bos)
        ca_len = len(ca_toked)
        q_toked = self.tokenizer.tokenize(self.q_token + q + self.eos)
        q_len = len(q_toked)

        if ca_len + q_len > self.max_len:
            # First try to fit by shortening the question only.
            q_len = self.max_len - ca_len
            if q_len <= 0:
                # No room left: keep only the tail of the context/answer part,
                # half of the budget, and give the rest to the question.
                ca_toked = ca_toked[-(int(self.max_len/2)):]
                ca_len = len(ca_toked)
                q_len = self.max_len - ca_len
                assert q_len > 0
            q_toked = q_toked[:q_len]
            # The question may be shorter than the room available.
            q_len = len(q_toked)

        # Labels: mask token over the context/answer part, then the question
        # without its first token.
        labels = [self.mask] * ca_len + q_toked[1:]

        if self.first:
            logging.info("contexts : {}".format(c))
            logging.info("toked ctx: {}".format(ca_toked))
            logging.info("response : {}".format(q))
            logging.info("toked response : {}".format(q_toked))
            logging.info('labels {}'.format(labels))
            self.first = False

        # 1 on question positions, 0 on context/answer and padding.
        mask = [0] * ca_len + [1] * q_len + [0] * (self.max_len - ca_len - q_len)

        pad_id = self.tokenizer.pad_token_id

        # Right-pad in one step; a non-positive repeat count adds nothing, which
        # matches the original element-by-element loops.
        labels_ids = self.tokenizer.convert_tokens_to_ids(labels)
        labels_ids += [pad_id] * (self.max_len - len(labels_ids))

        token_ids = self.tokenizer.convert_tokens_to_ids(ca_toked + q_toked)
        token_ids += [pad_id] * (self.max_len - len(token_ids))

        # NOTE(review): the mask is returned as a NumPy array although the
        # return annotation promises lists - kept as-is for existing callers.
        return token_ids, np.array(mask), labels_ids




In [None]:
class HistoryQGDataset2(Dataset):
    """Variable-length question-generation dataset.

    Each item is laid out as
    ``[<c>] <c>context <a>answer [<q>] question [</s>]``
    and returned as ``(input_ids, attention_mask, labels)`` lists; padding is
    left to the collate function.
    """

    def __init__(
        self,
        examples: List[QAExample],
        tokenizer: SentencePieceBPETokenizer,
        max_sequence_length: int,
        is_train: bool = True,
    ) -> None:
        """Store the examples and resolve the ids of the special tokens.

        Args:
            examples: Examples to serve.
            tokenizer: Object exposing ``tokenize`` and ``convert_tokens_to_ids``.
            max_sequence_length: Upper bound on the length of every item.
            is_train: When True the labels cover the whole sequence; otherwise
                the conditioning prefix is masked with -100.
        """
        self.examples = examples
        self.tokenizer = tokenizer
        self.max_sequence_length = max_sequence_length

        # NOTE(review): the start token is "<c>" and the context text is also
        # prefixed with "<c>", so that token appears twice in a row - confirm
        # this is intended.
        self.sos_token = tokenizer.convert_tokens_to_ids("<c>")
        self.eos_token = tokenizer.convert_tokens_to_ids("</s>")
        self.question_prefix_tokens = tokenizer.convert_tokens_to_ids('<q>')

        self.is_train = is_train

    def _encode(self, text: str) -> List[int]:
        """Tokenise ``text`` and map the tokens to vocabulary ids."""
        return self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(text))

    def __getitem__(self, index: int) -> GPTFeaturesType:
        """Build ``(input_ids, attention_mask, labels)`` for example ``index``.

        Raises:
            AssertionError: If the item is still longer than
                ``max_sequence_length`` after the context has been truncated.
        """
        example = self.examples[index]

        context_tokens = self._encode(f"<c>{example.context}")
        answer_tokens = self._encode(f"<a>{example.answer}")
        question_tokens = self._encode(f"{example.question}")

        # [SOS] + CONTEXT + ANSWER + question prefix
        conditional_tokens_len = 1 + len(context_tokens) + len(answer_tokens) + 1
        # QUESTION + [EOS]
        post_tokens_len = len(question_tokens) + 1

        overflow = conditional_tokens_len + post_tokens_len - self.max_sequence_length
        if overflow > 0:
            # Only the context is shortened. Clamp at 0: a negative slice bound
            # would cut from the wrong end and keep tokens that should go.
            available_seq_len = max(0, len(context_tokens) - overflow)
            context_tokens = context_tokens[:available_seq_len]

        conditional_tokens = [self.sos_token] + context_tokens + answer_tokens + [self.question_prefix_tokens]
        post_tokens = question_tokens + [self.eos_token]
        input_ids = conditional_tokens + post_tokens

        # Training: labels are the inputs themselves. Otherwise the conditioning
        # prefix is replaced by -100 so only the question part carries labels.
        labels = input_ids if self.is_train else ([-100] * len(conditional_tokens)) + post_tokens
        attention_mask = [1.0] * len(input_ids)

        assert len(input_ids) <= self.max_sequence_length, (
            f"example {index}: {len(input_ids)} tokens exceed "
            f"max_sequence_length={self.max_sequence_length} even with an empty context"
        )

        return input_ids, attention_mask, labels

    def __len__(self) -> int:
        """Return the number of examples."""
        return len(self.examples)


In [None]:
# Scratch cell removed: it held `len(10)`, which always raises TypeError
# (an int has no length) and stopped "Restart & Run All" at this point.

In [None]:
def _create_logger(output_dir: str):
    """Configure the root logger to write to ``<output_dir>/train.log`` and to a tqdm-aware handler.

    The function is safe to call repeatedly (e.g. when the cell is re-run):
    handlers installed by an earlier call are removed first, so messages are
    not duplicated.

    Args:
        output_dir: Directory for ``train.log``; created if it does not exist.

    Returns:
        The configured root logger (level INFO).
    """
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # Drop only the handlers this function added before; anything else that
    # was attached to the root logger is left alone.
    for stale in [h for h in logger.handlers if getattr(h, "_qg_train_handler", False)]:
        logger.removeHandler(stale)
        stale.close()

    # FileHandler does not create missing directories. An empty output_dir
    # means "current directory" and needs no creation.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Explicit encoding so Korean log text is written the same on any platform.
    file_handler = logging.FileHandler(os.path.join(output_dir, "train.log"), encoding="utf-8")
    file_handler.setFormatter(logging.Formatter("[%(asctime)s] %(message)s"))

    progress_handler = TqdmLoggingHandler()
    progress_handler.setFormatter(logging.Formatter("%(asctime)s - %(message)s"))

    for handler in (file_handler, progress_handler):
        handler._qg_train_handler = True  # marker used by the clean-up above
        logger.addHandler(handler)
    return logger

In [None]:
# Command-line options for training: dataset paths, schedule, batch sizes,
# intervals and output directory.
# NOTE(review): no parse_args() call is visible in this notebook; the settings
# actually used below come from QGConfig().
parser = ArgumentParser()
parser.add_argument("--train-dataset", type=str, help="학습 데이터 경로dd")
parser.add_argument("--dev-dataset", type=str, help="평가 데이터 경로")

parser.add_argument("--epochs", type=int, help="학습 전체를 반복할 횟수")
parser.add_argument("--lr", type=float, help="learning rate")

parser.add_argument("--train-batch-size", type=int, help="학습에 사용할 배치 크기")
parser.add_argument("--eval-batch-size", type=int, help="평가에 사용할 배치 크기")
parser.add_argument("--validation-interval", type=int, help="dev 셋에 대해서 validation 을 수행할 steps")
parser.add_argument("--save-interval", type=int, help="모델을 저장할 steps")

parser.add_argument("--output-dir", type=str, default="artifacts/", help="모델과 학습 로그를 저장할 경로")
# Training settings come from QGConfig's defaults.
config = QGConfig()

# Tokenizer handed to the dataset; built with the same arguments as the
# module-level TOKENIZER.
tokenizer = PreTrainedTokenizerFast(tokenizer_file='Tokenizer/tokenizer.json',
            bos_token=BOS, eos_token=EOS, unk_token='<unk>', 
            pad_token=PAD, mask_token=MASK)

# Log every configuration field once at start-up.
logger = _create_logger(output_dir=config.output_dir)
logger.info("============================")
for key, value in config._asdict().items():
    logger.info(f"{key:30}:{value}")
logger.info("============================")
# Seed torch before the shuffling DataLoader is created.
torch.manual_seed(config.random_seed)
logger.info("loading train dataset")



train_examples = load_korquad_dataset(config.train_dataset)
train_dataset = HistoryQGDataset2(train_examples, tokenizer, config.max_sequence_length)
# NOTE(review): the batch size is hard-coded to 2 here - confirm whether a config value should be used instead.
train_dataloader = torch.utils.data.DataLoader(train_dataset, 2, shuffle=True, collate_fn=dynamic_padding_collate_fn)

# Create the model
model = GPT2LMHeadModel.from_pretrained('skt/kogpt2-base-v2')
# Match the embedding matrix to the tokenizer's vocabulary. Called without a
# size (as before), resize_token_embeddings() only returns the current
# embedding module and resizes nothing.
model.resize_token_embeddings(len(tokenizer))
# Print embedding rows next to the vocabulary size for a visual check.
print(model.transformer.wte.weight.shape[0], len(TOKENIZER.vocab))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Adam with a linear warm-up / linear decay schedule over the whole run.
optimizer = Adam(model.parameters(), lr=config.lr)
total_steps = len(train_dataloader) * config.epochs
warmup_steps = int(total_steps * config.warmup_ratio)
scheduler = get_linear_schedule_with_warmup(optimizer, warmup_steps, total_steps)

# from_pretrained() hands the model back in evaluation mode; switch to training
# mode so dropout is active while fine-tuning.
model.train()

# Losses collected since the last log line.
loss_list_between_log_interval = []
for epoch_id in range(config.epochs):
    for step_index, batch_data in tqdm_notebook(
        enumerate(train_dataloader), f"[TRAIN] EP:{epoch_id}", total=len(train_dataloader)
    ):
        # 1-based step counter across all epochs.
        global_step = len(train_dataloader) * epoch_id + step_index + 1
        optimizer.zero_grad()

        token_ids, attention_mask, labels = tuple(value.to(device) for value in batch_data)
        # Call the module itself rather than .forward() so registered hooks run.
        # (The per-step shape print was removed: it flooded the cell output.)
        model_outputs = model(token_ids, attention_mask=attention_mask, labels=labels, return_dict=True)
        model_outputs.loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_clip)
        optimizer.step()
        scheduler.step()

        # for logging
        loss_list_between_log_interval.append(model_outputs.loss.item())

        if global_step % config.train_log_interval == 0:
            mean_loss = np.mean(loss_list_between_log_interval)
            # Previously the mean was computed and then discarded without being reported.
            logger.info(f"EP:{epoch_id} global_step:{global_step} loss:{mean_loss:.4f}")
            loss_list_between_log_interval.clear()

        if global_step % config.save_interval == 0:
            state_dict = model.state_dict()
            model_path = os.path.join(config.output_dir, f"gpt2_step_{global_step}.pth")
            torch.save(state_dict, model_path)

# 검증

In [2]:
# Load the two generated question/answer files and merge their "data" entries
# into one list.
# The encoding is given explicitly so the Korean text does not depend on the
# platform's default codec.
# NOTE(review): absolute, machine-specific paths - consider a configurable data directory.
with open('/home/wowns/data/KoreanHistoryProject/QA/QA_Original/as_generate_set.json', 'r', encoding='utf-8') as f:
    json_data = json.load(f)
with open('/home/wowns/data/KoreanHistoryProject/QA/QA_Original/as_generate_set2.json', 'r', encoding='utf-8') as f:
    json_data2 = json.load(f)

# Entries of the first file followed by those of the second.
all_list = [*json_data["data"], *json_data2["data"]]
count = len(all_list)

print(count)

332430


In [3]:
from KoHis import KoHisQnA
qa = KoHisQnA()

# Samples whose generated question is answered correctly by the QA model.
correct_quiz = []
correct_count = 0

# Samples for which the QA model raised.
except_list = []
except_count = 0

# Iterate over the list itself instead of a hard-coded length (was 332430).
for step, sample in enumerate(tqdm_notebook(all_list)):
    try:
        tuple_answer = qa.do_ask_to_model(sample['generated_question'], sample['doc'])
        predict_answer = tuple_answer[2]
    except Exception:
        # e.g. when the document is too long for the model.
        except_list.append(sample)
        except_count += 1
    else:
        # Compare only when a fresh prediction exists. Previously a failed
        # sample fell through and was checked against the prediction left over
        # from the previous iteration (or hit a NameError on the first one).
        predict_answer = predict_answer.replace("#", "")
        if sample['answer'] == predict_answer:
            correct_quiz.append(sample)
            correct_count += 1
    if step % 1000 == 0:
        print("step: ", step, "correct_count: ", correct_count, "except_count", except_count)

# Total number of samples processed, as the original running counter ended up.
count = len(all_list)

print("correct_count: ", correct_count, "except_count", except_count)

...initailizing...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  if sys.path[0] == '':


  0%|          | 0/332430 [00:00<?, ?it/s]

step:  0 correct_count:  1 except_count 0
step:  1000 correct_count:  151 except_count 15
step:  2000 correct_count:  299 except_count 30
step:  3000 correct_count:  428 except_count 45
step:  4000 correct_count:  570 except_count 60
step:  5000 correct_count:  708 except_count 105
step:  6000 correct_count:  819 except_count 135
step:  7000 correct_count:  937 except_count 194
step:  8000 correct_count:  1088 except_count 269
step:  9000 correct_count:  1244 except_count 314
step:  10000 correct_count:  1374 except_count 341
step:  11000 correct_count:  1498 except_count 371
step:  12000 correct_count:  1641 except_count 371
step:  13000 correct_count:  1794 except_count 386
step:  14000 correct_count:  1939 except_count 401
step:  15000 correct_count:  2073 except_count 416
step:  16000 correct_count:  2219 except_count 416
step:  17000 correct_count:  2339 except_count 431
step:  18000 correct_count:  2507 except_count 431
step:  19000 correct_count:  2616 except_count 446
step:  20

In [6]:
# Show the first retained sample as the cell output.
correct_quiz[0]

{'doc': '가군은 삼국시대 고구려 장안성 축성의 책임을 맡았던 관리. 가군은 생몰년 미상. 1964년에 평양 중구역 남문동에서 발견된 내성의 성벽돌에 새겨져 있는 명문에 따르면 가군은 성곽 축조시에 소형 관등을 가졌으며, 장안성 내성 중 일정 거리의 성벽 축조를 감독하였음을 알 수 있다.고구려 장안성은 522년에 쌓았고 589년에 장안성으로 천도하였다. 따라서 그의 활동 시기는 대략 6세기 후반 평원왕대로 추정된다. 또 명문의 괘루개절은 가군의 출신지나 직명으로 볼 수 있다.괘루는 고구려 5부의 하나인 계루부의 다른 표기로 볼 수도 있으나 아직 단정하기는 어려우며, 개절은 관직명의 일종으로 추정되고 있다. 이 명문이 새겨진 성벽돌은 현재 인민대학습당 내부를 지나는 성벽의 원위치에 그대로 있다.',
 'answer': '가군',
 'generated_question': '522년 고구려 장안성 축성에 책임을 맡은 관리들은 누구인가?'}

In [7]:
# Split the retained samples into one list per CSV column.
context_list = [quiz['doc'] for quiz in correct_quiz]
answers_list = [quiz['answer'] for quiz in correct_quiz]
question_list = [quiz['generated_question'] for quiz in correct_quiz]

# Column name -> values, in the same column order as the KorQuAD CSV.
data = dict(context=context_list, answers=answers_list, question=question_list)

# Write the filtered set without the index column.
df = pd.DataFrame(data)
df.to_csv('History_train_V1.csv', index=False)