# Import

In [1]:
from torch.utils.data import DataLoader, Dataset
from tokenizers import SentencePieceBPETokenizer
import json
import pandas as pd
import numpy as np
import logging
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel
import math
import os
from argparse import ArgumentParser
import torch
from tokenizers import SentencePieceBPETokenizer
from torch.optim import Adam
from tqdm import tqdm_notebook
from transformers import GPT2LMHeadModel, get_linear_schedule_with_warmup
from korquad_qg.config import QGConfig
from korquad_qg.utils import TqdmLoggingHandler
from typing import List, NamedTuple, Optional, Tuple

ImportError: cannot import name 'PreTrainedTokenizerFast' from 'transformers' (/home/hsoh0423/anaconda3/envs/historyQA/lib/python3.7/site-packages/transformers/__init__.py)

# 데이터 셋 클래스

In [2]:
# Special-token strings. BOS/EOS/PAD/MASK are handed to the tokenizer below;
# the <c>/<a>/<q> markers name the context, answer and question segments.
BOS = '<s>'
EOS = '</s>'
PAD = '<pad>'
MASK = '<mask>'
C_TKN = '<c>'
A_TKN = '<a>'
Q_TKN = '<q>'

# Module-level tokenizer loaded from the local tokenizer definition file.
TOKENIZER = PreTrainedTokenizerFast(
    tokenizer_file='Tokenizer/tokenizer.json',
    bos_token=BOS,
    eos_token=EOS,
    unk_token='<unk>',
    pad_token=PAD,
    mask_token=MASK,
)

In [3]:
# Decoding-time item: (input_ids, attention_mask) as tensors.
GPTDecodingInputType = Tuple[torch.Tensor, torch.Tensor]
# Collated batch: (input_ids, attention_mask, labels) as tensors.
GPTInputsType = Tuple[torch.Tensor, torch.Tensor, torch.Tensor]
# Un-padded features of one example: (input_ids, attention_mask, labels) as plain lists.
GPTFeaturesType = Tuple[List[int], List[float], List[int]]

class QAExample(NamedTuple):
    """One context/answer pair, optionally carrying its reference question."""

    context: str  # passage text
    answer: str  # answer text
    question: Optional[str] = None  # left as None when only context and answer are given

def load_korquad_dataset(dataset_path: str, max_len: int = 512) -> List[QAExample]:
    """Load question-generation examples from a CSV file.

    The CSV must provide ``context``, ``answers`` and ``question`` columns.
    A row is kept only when its tokenized context plus a 10-token margin
    fits within ``max_len``.

    Args:
        dataset_path: Path handed to ``pandas.read_csv``.
        max_len: Token limit used for filtering. Defaults to 512, the value
            that was previously hard-coded.

    Returns:
        The retained rows as ``QAExample(context, answer, question)``, in
        file order.
    """
    document = pd.read_csv(dataset_path)

    # The margin of 10 is carried over unchanged; presumably it leaves room
    # for the tokens placed around the context -- TODO confirm.
    max_context_len = max_len - 10

    # Iterate the column values directly instead of looking rows up by index
    # label, so the loop does not depend on the frame having a 0..n-1 index.
    rows = zip(document["context"], document["answers"], document["question"])

    examples = []
    for context, answer, question in tqdm_notebook(rows, total=len(document)):
        if len(TOKENIZER.tokenize(context)) <= max_context_len:
            examples.append(QAExample(context, answer, question))

    return examples
    
def dynamic_padding_collate_fn(features: List[GPTFeaturesType]) -> GPTInputsType:
    """Right-pad a batch of features to a shared length and tensorize it.

    Every sequence is extended to the length of the longest ``input_ids``
    in the batch. Input ids are padded with ``0``, attention masks with
    ``0.0`` and labels with ``-100``.

    Args:
        features: ``(input_ids, attention_mask, labels)`` list triples.

    Returns:
        Three tensors ``(input_ids, attention_mask, labels)``, one row per
        feature.

    Raises:
        ValueError: If ``features`` is empty.
    """
    longest = max(len(feature[0]) for feature in features)

    def right_pad(sequence, fill):
        # Each sequence is padded relative to its own length.
        return sequence + [fill] * (longest - len(sequence))

    input_ids = [right_pad(feature[0], 0) for feature in features]
    attention_mask = [right_pad(feature[1], 0.0) for feature in features]
    labels = [right_pad(feature[2], -100) for feature in features]

    return torch.tensor(input_ids), torch.tensor(attention_mask), torch.tensor(labels)

In [4]:
class HistoryQGDataset(Dataset):
    """Dataset that encodes QA examples as question-generation features.

    Each item is laid out as::

        <s>  <c>context  <a>answer  <q>  question  </s>

    When the sequence would exceed ``max_sequence_length`` the context is
    cut from the right until it fits.
    """

    def __init__(
        self,
        examples: List[QAExample],
        tokenizer: SentencePieceBPETokenizer,
        max_sequence_length: int,
        is_train: bool = True,
    ) -> None:
        """Store the examples and resolve the special-token ids.

        Args:
            examples: Examples to encode.
            tokenizer: Object providing ``tokenize(text)`` and
                ``convert_tokens_to_ids(tokens)``; these are the only two
                methods this class calls on it.
            max_sequence_length: Upper bound on the length of ``input_ids``.
            is_train: When true, labels cover the whole sequence; otherwise
                the conditioning prefix is masked with ``-100``.
        """
        self.examples = examples
        self.tokenizer = tokenizer
        self.max_sequence_length = max_sequence_length

        self.sos_token = tokenizer.convert_tokens_to_ids("<s>")
        self.eos_token = tokenizer.convert_tokens_to_ids("</s>")
        self.question_prefix_tokens = tokenizer.convert_tokens_to_ids('<q>')

        self.is_train = is_train

    def _encode(self, text: str) -> List[int]:
        """Tokenize ``text`` and map the tokens to vocabulary ids."""
        return self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(text))

    def __getitem__(self, index: int) -> GPTFeaturesType:
        """Encode the example at ``index``.

        Returns:
            ``(input_ids, attention_mask, labels)`` as plain lists of equal
            length, at most ``max_sequence_length`` long.

        Raises:
            ValueError: If the answer and question alone (plus the special
                tokens) do not fit in ``max_sequence_length``, so that no
                amount of context truncation could help.
        """
        example = self.examples[index]

        context_tokens = self._encode(f"<c>{example.context}")
        answer_tokens = self._encode(f"<a>{example.answer}")
        question_tokens = self._encode(f"{example.question}")

        # Everything but the context has a fixed cost:
        # [SOS] + ANSWER + question prefix, then QUESTION + [EOS].
        fixed_len = 1 + len(answer_tokens) + 1 + len(question_tokens) + 1
        context_budget = self.max_sequence_length - fixed_len
        if context_budget < 0:
            # A negative bound in the slice below would drop tokens counted
            # from the END of the context instead of emptying it, so fail
            # loudly here rather than build an over-long sequence.
            raise ValueError(
                f"answer and question need {fixed_len} tokens, which exceeds "
                f"max_sequence_length={self.max_sequence_length}"
            )
        # No-op when the context already fits within the budget.
        context_tokens = context_tokens[:context_budget]

        conditional_tokens = [self.sos_token] + context_tokens + answer_tokens + [self.question_prefix_tokens]
        post_tokens = question_tokens + [self.eos_token]
        input_ids = conditional_tokens + post_tokens

        if self.is_train:
            # Copy so that labels and input_ids never share one list object.
            labels = list(input_ids)
        else:
            # Only the question part contributes to the loss.
            labels = [-100] * len(conditional_tokens) + post_tokens
        attention_mask = [1.0] * len(input_ids)

        # Internal invariant: guaranteed by the budget computation above.
        assert len(input_ids) <= self.max_sequence_length

        return input_ids, attention_mask, labels

    def __len__(self) -> int:
        """Return the number of examples."""
        return len(self.examples)


# 퀴즈 생성

In [5]:
class QGDecodingDataset(HistoryQGDataset):
    """Decoding variant that yields only the conditioning prefix, as tensors."""

    def __getitem__(self, index: int) -> GPTDecodingInputType:
        """Encode the prompt for the example at ``index``.

        The prompt is ``[SOS] + context + answer + question prefix``. The
        context is cut from the right so that the prompt plus
        ``MAX_QUESTION_SPACE`` generated tokens and ``[EOS]`` fits in
        ``max_sequence_length``.

        Returns:
            ``(input_ids, attention_mask)`` as 1-D tensors of equal length.
        """
        example = self.examples[index]

        context_tokens = self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(f"<c>{example.context}"))
        answer_tokens = self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(f"<a>{example.answer}"))

        # Fixed cost: [SOS] + ANSWER + question prefix, plus the space
        # reserved for the generated question and its [EOS].
        reserved_len = 1 + len(answer_tokens) + 1 + MAX_QUESTION_SPACE + 1
        # Clamp at zero: a negative slice bound would drop tokens counted
        # from the END of the context instead of emptying it, leaving more
        # context in the prompt than intended.
        context_budget = max(0, self.max_sequence_length - reserved_len)
        # No-op when the context already fits within the budget.
        context_tokens = context_tokens[:context_budget]

        input_ids = [self.sos_token] + context_tokens + answer_tokens + [self.question_prefix_tokens]
        attention_mask = [1.0] * len(input_ids)
        return torch.tensor(input_ids), torch.tensor(attention_mask)

In [6]:
from argparse import ArgumentParser

import torch
from tokenizers import SentencePieceBPETokenizer
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import GPT2LMHeadModel, GPT2Config
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel
import json

from korquad_qg.config import QGConfig
from korquad_qg.dataset import MAX_QUESTION_SPACE, MIN_QUESTION_SPACE, QAExample, dynamic_padding_collate_fn

# Special-token strings; BOS/EOS/PAD/MASK are handed to the tokenizer below.
Q_TKN = '<q>'
A_TKN = '<a>'
BOS = '<s>'
EOS = '</s>'
MASK = '<mask>'
C_TKN = '<c>'
PAD = '<pad>'

config = QGConfig()

tokenizer = PreTrainedTokenizerFast(
    tokenizer_file='Tokenizer/tokenizer.json',
    bos_token=BOS,
    eos_token=EOS,
    unk_token='<unk>',
    pad_token=PAD,
    mask_token=MASK,
)
model = GPT2LMHeadModel.from_pretrained('skt/kogpt2-base-v2')
# Deserialize onto the CPU so that a checkpoint saved from a GPU also loads
# on a CPU-only machine; the model is moved to the target device afterwards.
model.load_state_dict(torch.load('outputs/gpt2_step_20000.pth', map_location='cpu'))
#model.load_state_dict(torch.load('outputs/gpt2_step_17000.pth', map_location='cpu'))

# Alternative setup kept for reference: custom tokenizer + model built from a
# local config file.
#tokenizer = PreTrainedTokenizerFast(tokenizer_file='Tokenizer/History_Korquad_Tokenizer/history_Korquad_tokenizer.json',
#            bos_token=BOS, eos_token=EOS, unk_token='<unk>', 
#            pad_token=PAD, mask_token=MASK)
#model = GPT2LMHeadModel(config=GPT2Config.from_json_file('config.json'))
#model.load_state_dict(torch.load('outputs/gpt2_step_150000.pth', map_location="cpu"))


device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Only the first records of the file are decoded in this run.
NUM_EXAMPLES = 20

# JSON text is UTF-8; state it explicitly so reading the Korean documents
# does not depend on the platform's default encoding.
with open('data/as_set.json', encoding='utf-8') as f:
    data = json.load(f)

# No reference question is available here, so QAExample.question stays None.
examples = [
    QAExample(record['doc'], record['answer'])
    for record in tqdm(data['data'][:NUM_EXAMPLES])
]

dataset = QGDecodingDataset(examples, tokenizer, 512)
dataloader = torch.utils.data.DataLoader(dataset, 1)

model = model.to(device)
model.eval()

# Collected as (context, answer, reference question, generated question).
generated_results = []

for i, batch in tqdm(enumerate(dataloader), desc="generate", total=len(dataloader)):
    input_ids, attention_mask = (tensor.to(device) for tensor in batch)
    origin_seq_len = input_ids.size(-1)

    # Beam search; the length limits are expressed relative to the prompt.
    generation_options = dict(
        max_length=origin_seq_len + MAX_QUESTION_SPACE,
        min_length=origin_seq_len + MIN_QUESTION_SPACE,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        bos_token_id=tokenizer.bos_token_id,
        num_beams=5,
        repetition_penalty=2.0,
    )
    decoded_sequences = model.generate(input_ids=input_ids, **generation_options)

    # Batch index i is used as the example index, which relies on the loader
    # yielding one example per batch.
    source = examples[i]
    for decoded_tokens in decoded_sequences.tolist():
        # Keep only what was generated after the prompt, up to the first EOS.
        continuation = tokenizer.decode(decoded_tokens[origin_seq_len:])
        decoded_question_text = continuation.partition("</s>")[0].replace("<c>", "")
        generated_results.append(
            (source.context, source.answer, source.question, decoded_question_text)
        )
        
# Echo every generated question.
for context, answer, question, generated_question in tqdm(generated_results):
    print(generated_question)

# Column-wise view of the results; the generated text (not the reference
# question) is what gets stored in the 'question' column.
context_list = [row[0] for row in generated_results]
answer_list = [row[1] for row in generated_results]
question_list = [row[3] for row in generated_results]

generated_data = {
    'context': context_list,
    'answers': answer_list,
    'question': question_list,
}

df = pd.DataFrame(generated_data)
df.to_csv('Generated_test.csv', index=False)

  0%|          | 19/110810 [00:00<00:01, 106539.81it/s]
generate: 100%|██████████| 20/20 [00:04<00:00,  4.22it/s]
100%|██████████| 20/20 [00:00<00:00, 12933.41it/s]


고구려의 장안성 축성의 책임자였던 사람은?
가군의 출신지나 직명으로 볼 수 있는 것은?
고구려 5부의 하나인 계루부의 다른 표기는?
가군의 출신지는?
가군의 출신지는?
신라에서 화엄경의강을 지은 사람은?
가귀가 지은 화엄경의강은?
가귀는 승려인가?
가귀는 어느 왕때 승려인가?
가귀는 승려인가?
가라포고이가 귀화한 사람은?
귀화한 신라인은 누구인가?
가라포고이가 귀화한 시기는?
가라포고이가 귀화한 나라는?
가라포고이가 일본에서 귀화한 것은 무엇인가?
고구려 부흥운동 당시 상부대상을 역임한 귀족은?
고구려 부흥운동을 지휘한 사람은?
고구려 멸망 후 부흥운동을 펼친 나라는?
고구려 부흥운동 당시 상부대상을 역임한 귀족은?
고구려 부흥운동이 시작된 시기는?
