# Import & 필수 클래스 및 함수 정의

In [None]:
import json
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel, GPT2Config
from tqdm import tqdm_notebook
from tokenizers import SentencePieceBPETokenizer
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
import math
import os
from korquad_qg.config import QGConfig
from korquad_qg.utils import TqdmLoggingHandler
from typing import List, NamedTuple, Optional, Tuple
from korquad_qg.dataset import MAX_QUESTION_SPACE, MIN_QUESTION_SPACE, QAExample, dynamic_padding_collate_fn

In [None]:
BOS = '<s>'
EOS = '</s>'
MASK = '<mask>'
PAD = '<pad>'

tokenizer = PreTrainedTokenizerFast(tokenizer_file='Tokenizer/tokenizer.json',
            bos_token=BOS, eos_token=EOS, unk_token='<unk>', 
            pad_token=PAD, mask_token=MASK)

In [None]:
GPTDecodingInputType = Tuple[torch.Tensor, torch.Tensor]
GPTInputsType = Tuple[torch.Tensor, torch.Tensor, torch.Tensor]
GPTFeaturesType = Tuple[List[int], List[float], List[int]]

class QAExample(NamedTuple):
    context: str
    answer: str
    question: Optional[str] = None

def load_korquad_dataset(dataset_path: str) -> List[QAExample]:
    korquad = [pd.read_csv(dataset_path)]
    max_len = 512
    examples = []
    for document in korquad:
        for i in tqdm_notebook(range(len(document))):
            if len(TOKENIZER.tokenize(document["context"][i])) + 10 <= max_len:
                example = QAExample(document["context"][i], document["answers"][i], document["question"][i])
                examples.append(example)
        
    return examples
    
def dynamic_padding_collate_fn(features: List[GPTFeaturesType]) -> GPTInputsType:
    max_seq_len = max([len(feature[0]) for feature in features])
    input_ids, attention_mask, labels = [], [], []

    for feature in features:
        padded_input_ids = feature[0] + [0] * (max_seq_len - len(feature[0]))
        padded_attention_mask = feature[1] + [0.0] * (max_seq_len - len(feature[1]))
        padded_labels = feature[2] + [-100] * (max_seq_len - len(feature[2]))

        input_ids.append(padded_input_ids)
        attention_mask.append(padded_attention_mask)
        labels.append(padded_labels)

    return torch.tensor(input_ids), torch.tensor(attention_mask), torch.tensor(labels)

In [4]:
class HistoryQGDataset(Dataset):
    def __init__(
        self,
        examples: List[QAExample],
        tokenizer: SentencePieceBPETokenizer,
        max_sequence_length: int,
        is_train: bool = True,
    ) -> None:
        self.examples = examples
        self.tokenizer = tokenizer
        self.max_sequence_length = max_sequence_length

        self.sos_token = tokenizer.convert_tokens_to_ids("<s>")
        self.eos_token = tokenizer.convert_tokens_to_ids("</s>")
        self.question_prefix_tokens = tokenizer.convert_tokens_to_ids('<q>')

        self.is_train = is_train

    def __getitem__(self, index: int) -> GPTFeaturesType:
        example = self.examples[index]

        context_tokens = self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(f"<c>{example.context}"))
        answer_tokens = self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(f"<a>{example.answer}"))
        question_tokens = self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(f"{example.question}"))
        
        # [SOS] + 문맥:CONTEXT + 정답:ANSWER + 질문:
        conditional_tokens_len = 1 + len(context_tokens) + len(answer_tokens) + 1
        # QUESTION + [EOS]
        post_tokens_len = len(question_tokens) + 1

        if conditional_tokens_len + post_tokens_len > self.max_sequence_length:
            available_seq_len = (
                self.max_sequence_length - conditional_tokens_len - post_tokens_len + len(context_tokens)
            )
            context_tokens = context_tokens[:available_seq_len]

        conditional_tokens = [self.sos_token] + context_tokens + answer_tokens + [self.question_prefix_tokens]
        post_tokens = question_tokens + [self.eos_token]
        input_ids = conditional_tokens + post_tokens

        labels = input_ids if self.is_train else ([-100] * len(conditional_tokens)) + post_tokens
        attention_mask = [1.0] * len(input_ids)

        assert len(input_ids) <= self.max_sequence_length

        return input_ids, attention_mask, labels
        #return torch.tensor(input_ids), torch.tensor(attention_mask), torch.tensor(labels)

    def __len__(self) -> int:
        return len(self.examples)

In [5]:
class QGDecodingDataset(HistoryQGDataset):
    def __getitem__(self, index: int) -> GPTDecodingInputType:
        example = self.examples[index]

        context_tokens = self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(f"<c>{example.context}"))
        answer_tokens = self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(f"<a>{example.answer}"))

        # [SOS] + CONTEXT + ANSWER + 정답:
        conditional_tokens_len = 1 + len(context_tokens) + len(answer_tokens) + 1
        # ANSWER_SEQ + [EOS]
        post_tokens_len = MAX_QUESTION_SPACE + 1
        if conditional_tokens_len + post_tokens_len > self.max_sequence_length:
            available_seq_len = (
                self.max_sequence_length - conditional_tokens_len - post_tokens_len + len(context_tokens)
            )
            context_tokens = context_tokens[:available_seq_len]

        input_ids = [self.sos_token] + context_tokens + answer_tokens + [self.question_prefix_tokens]
        attention_mask = [1.0] * len(input_ids)
        return torch.tensor(input_ids), torch.tensor(attention_mask)

In [6]:
def QG(model):
    model = model.to(device)
    model.eval
6
    for i, batch in enumerate(dataloader):
        input_ids, attention_mask = tuple(v.to(device) for v in batch)
        origin_seq_len = input_ids.size(-1)

        decoded_sequences = model.generate(
            input_ids=input_ids,
            max_length=origin_seq_len + MAX_QUESTION_SPACE,
            min_length=origin_seq_len + MIN_QUESTION_SPACE,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            num_beams=5,
            repetition_penalty=2.0,
        )
        
        decoded_question_text = ''
        for decoded_tokens in decoded_sequences.tolist():

            decoded_question_text = tokenizer.decode(decoded_tokens[origin_seq_len:])
            decoded_question_text = decoded_question_text.split("</s>")[0].replace("<c>", "")
            
        print("- Context: ", examples[i].context)
        print("- Generated Question: ", decoded_question_text)
        print("- Answer: ", examples[i].answer)
        print()

IndentationError: unexpected indent (<ipython-input-6-13d665a7688d>, line 5)

# 문장 생성 모델 비교 
## KoGPT2 모델 vs After History Finetuning 모델

In [None]:
text = '이순신 장군이'

#  Load KoGPT2 model
kogpt_model = GPT2LMHeadModel.from_pretrained('skt/kogpt2-base-v2')
input_ids = tokenizer.encode(text)
gen_ids = kogpt_model.generate(torch.tensor([input_ids]),
                           max_length=128,
                           repetition_penalty=2.0,
                           pad_token_id=tokenizer.pad_token_id,
                           eos_token_id=tokenizer.eos_token_id,
                           bos_token_id=tokenizer.bos_token_id,
                           use_cache=True)
kogpt2_generated = tokenizer.decode(gen_ids[0,:].tolist())

print("KoGPT2")
print(kogpt2_generated)
print()

# Load After History Finetuning model
model = GPT2LMHeadModel.from_pretrained('skt/kogpt2-base-v2')
model.load_state_dict(torch.load('hisotry_finetuning_outputs/gpt2_step_200000.pth', map_location="cpu"))
tokenizer = PreTrainedTokenizerFast(tokenizer_file='Tokenizer/tokenizer.json',
            bos_token=BOS, eos_token=EOS, unk_token='<unk>', 
            pad_token=PAD, mask_token=MASK)
input_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
gen_ids = model.generate(torch.tensor([input_ids]),
                           max_length=128,
                           repetition_penalty=2.0,
                           pad_token_id=tokenizer.pad_token_id,
                           eos_token_id=tokenizer.eos_token_id,
                           bos_token_id=tokenizer.bos_token_id,
                           use_cache=True)
generated = tokenizer.decode(gen_ids[0,:].tolist())

print("After History Fintuning: ")
print(generated)
print()

# 퀴즈 생성 비교 
## KoGPT2 vs After QA Finetuning Model vs After History&QA Model

In [None]:
model = GPT2LMHeadModel.from_pretrained('skt/kogpt2-base-v2')
model.load_state_dict(torch.load('outputs/After_History_Finetuning/gpt2_step_18000.pth'))

kogpt_model = GPT2LMHeadModel.from_pretrained('skt/kogpt2-base-v2')

after_QA_model = GPT2LMHeadModel.from_pretrained('skt/kogpt2-base-v2')
after_QA_model.load_state_dict(torch.load('outputs/V2/gpt2_step_18000.pth'))

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

examples = []

"""
with open('data/as_set.json') as f:
    data = json.load(f)
count = 0
for i in tqdm(data['data']):
    example = QAExample(i['doc'], i['answer'])
    examples.append(example)
    count += 1
    if count == 1:
        break
"""
        
example = QAExample(
                    '가군은 삼국시대 고구려 장안성 축성의 책임을 맡았던 관리. 가군은 생몰년 미상. 1964년에 평양 중구역 남문동에서 발견된 내성의 성벽돌에 새겨져 있는 명문에 따르면 가군은 성곽 축조시에 소형 관등을 가졌으며, 장안성 내성 중 일정 거리의 성벽 축조를 감독하였음을 알 수 있다.고구려 장안성은 522년에 쌓았고 589년에 장안성으로 천도하였다. 따라서 그의 활동 시기는 대략 6세기 후반 평원왕대로 추정된다. 또 명문의 괘루개절은 가군의 출신지나 직명으로 볼 수 있다.괘루는 고구려 5부의 하나인 계루부의 다른 표기로 볼 수도 있으나 아직 단정하기는 어려우며, 개절은 관직명의 일종으로 추정되고 있다. 이 명문이 새겨진 성벽돌은 현재 인민대학습당 내부를 지나는 성벽의 원위치에 그대로 있다',
                    '가군')
examples.append(example)
dataset = QGDecodingDataset(examples, tokenizer, 512)
dataloader = torch.utils.data.DataLoader(dataset, 1)

print("KoGPT2")
QG(kogpt_model)
    
print("After QA Finetuning Model")
QG(after_QA_model)
    
print("After History&QA Finetuning Model")
QG(model)