In [2]:
from torch.utils.data import DataLoader, Dataset
from tokenizers import SentencePieceBPETokenizer
import json
import pandas as pd
import numpy as np
import logging
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel
import logging
import math
import os
from argparse import ArgumentParser
import torch
from tokenizers import SentencePieceBPETokenizer
from torch.optim import Adam
from tqdm import tqdm_notebook
from transformers import GPT2LMHeadModel, get_linear_schedule_with_warmup
from korquad_qg.config import QGConfig
from korquad_qg.utils import TqdmLoggingHandler
from typing import List, NamedTuple, Optional, Tuple

# 코쿼드 데이터셋을 이용한 GPT2 데이터셋 생성

In [None]:
# Flatten the KorQuAD JSON into parallel context / answer / question lists and
# save them as a CSV used to build the GPT-2 question-generation dataset.
# The file is decoded explicitly as UTF-8 (the encoding JSON is exchanged in)
# so Korean text does not depend on the platform's default encoding.
with open('korquad.json', 'r', encoding='utf-8') as f:
    json_data = json.load(f)

context_list = []
answers_list = []
question_list = []

for data in json_data['data']:
    for sub_data in data['paragraphs']:
        context = sub_data['context']
        for qa in sub_data['qas']:
            # Only the first reference answer of each question is kept.
            context_list.append(context)
            answers_list.append(qa['answers'][0]['text'])
            question_list.append(qa['question'])
            if len(qa['answers']) > 1:
                # Show the questions whose additional answers are dropped.
                print(qa['answers'])

data = {
    'context': context_list,
    'answers': answers_list,
    'question': question_list,
}
df = pd.DataFrame(data)
df.to_csv('KorQuad_train_V1.csv', index=False)

# 토크나이저 생성

In [None]:
# Collect the tokenizer training corpus from sentences.txt: strip newlines and
# '##' markers from every line and remember the longest line (in characters).
# The file is read inside a `with` block so the handle is always closed, and
# decoded as UTF-8 explicitly (assumes the corpus is UTF-8 - confirm).
all_dict = []
max_len = 0
count = 0
with open("sentences.txt", 'r', encoding='utf-8') as txt_f:
    for line in tqdm_notebook(txt_f.readlines()):
        line = line.replace('\n', '').replace('##', '')
        if len(line) > max_len:
            max_len = len(line)
        all_dict.append(line)
        count += 1
print(max_len)

# Keep a copy of the cleaned corpus on disk, one sentence per line.
with open("data_save_test.txt", 'w', encoding='utf-8') as f:
    for i in all_dict:
        f.write(i + '\n')

# Initialize a tokenizer
tokenizer = SentencePieceBPETokenizer()
# Then train it!
# The whole corpus is handed over as a single batch (one list of sentences).
tokenizer.train_from_iterator([all_dict], vocab_size=300000, min_frequency=1, limit_alphabet=100000)
tokenizer.save("./history_tokenizer.json", pretty=True)
tokenizer.save_model(directory='./')

In [None]:
# Scan the corpus for lines that do not fit into a 512-position sequence.
# `max_len` starts at the 512 budget and is raised every time a longer
# sequence is found; `count` is the number of times that happened.
# The stored maximum is now measured in the same unit as the comparison
# (tokens + 7); the original overwrote it with the line's character count.
# NOTE(review): the +7 presumably reserves room for special tokens - confirm.
max_len = 512
count = 0
tok = PreTrainedTokenizerFast(tokenizer_file='Tokenizer/History/history_tokenizer.json')
# Read inside a `with` block so the file handle is closed after the scan.
with open("sentences.txt", 'r', encoding='utf-8') as txt_f:
    for line in tqdm_notebook(txt_f.readlines()):
        line = line.replace('\n', '').replace('##', '')
        seq_len = len(tok.tokenize(line)) + 7
        if seq_len > max_len:
            max_len = seq_len
            count += 1
print(max_len, count)

# 데이터셋 생성

In [3]:
# Special tokens shared by the dataset classes and the tokenizer below.
BOS = '<s>'
EOS = '</s>'
PAD = '<pad>'
MASK = '<mask>'
C_TKN = '<c>'
A_TKN = '<a>'
Q_TKN = '<q>'

# Module-level tokenizer loaded from the tokenizer file on disk.
TOKENIZER = PreTrainedTokenizerFast(
    tokenizer_file='Tokenizer/tokenizer.json',
    bos_token=BOS,
    eos_token=EOS,
    unk_token='<unk>',
    pad_token=PAD,
    mask_token=MASK,
)

# 데이터 셋 클래스

In [4]:
# (input_ids, attention_mask) tensors returned by the decoding dataset.
GPTDecodingInputType = Tuple[torch.Tensor, torch.Tensor]
# (input_ids, attention_mask, labels) tensors returned by the collate function.
GPTInputsType = Tuple[torch.Tensor, torch.Tensor, torch.Tensor]
# Un-padded per-example features as plain lists: (input_ids, attention_mask, labels).
GPTFeaturesType = Tuple[List[int], List[float], List[int]]

class QAExample(NamedTuple):
    """One question-generation example: a context, an answer and a question."""

    # Passage the question is generated from.
    context: str
    # Answer text the question should lead to.
    answer: str
    # Reference question; defaults to None when no question is supplied.
    question: Optional[str] = None

def load_korquad_dataset(dataset_path: str, max_len: int = 512) -> List[QAExample]:
    """Load QA examples from a CSV file, skipping contexts that are too long.

    Args:
        dataset_path: Path to a CSV file with ``context``, ``answers`` and
            ``question`` columns.
        max_len: Sequence budget. An example is kept only when its tokenized
            context plus 10 reserved positions fits into this budget.

    Returns:
        The examples that passed the length filter, in file order.
    """
    # Read every column as text and disable NA detection, so cells such as
    # "NA", "null" or an empty string stay strings instead of becoming float
    # NaN (which cannot be tokenized and would be formatted as "nan").
    frame = pd.read_csv(dataset_path, dtype=str, keep_default_na=False)

    rows = zip(frame["context"], frame["answers"], frame["question"])
    examples = []
    for context, answer, question in tqdm_notebook(rows, total=len(frame)):
        if len(TOKENIZER.tokenize(context)) + 10 <= max_len:
            examples.append(QAExample(context, answer, question))

    return examples
    
def dynamic_padding_collate_fn(features: List[GPTFeaturesType], pad_token_id: int = 0) -> GPTInputsType:
    """Pad a batch of features to the longest sequence in the batch.

    Args:
        features: Per-example ``(input_ids, attention_mask, labels)`` lists.
        pad_token_id: Id used to pad ``input_ids``. Defaults to 0, the value
            that was previously hard-coded.

    Returns:
        ``(input_ids, attention_mask, labels)`` tensors with one row per
        example. Attention masks are padded with ``0.0`` and labels with
        ``-100``.

    Raises:
        ValueError: If ``features`` is empty.
    """
    if not features:
        raise ValueError("cannot collate an empty batch")

    max_seq_len = max(len(feature[0]) for feature in features)

    def _pad(sequence, fill_value):
        # Right-pad one sequence up to the batch maximum.
        return sequence + [fill_value] * (max_seq_len - len(sequence))

    input_ids = [_pad(feature[0], pad_token_id) for feature in features]
    attention_mask = [_pad(feature[1], 0.0) for feature in features]
    labels = [_pad(feature[2], -100) for feature in features]

    return torch.tensor(input_ids), torch.tensor(attention_mask), torch.tensor(labels)

In [5]:
class HistoryQGDataset(Dataset):
    """Encode QA examples as token-id features for question generation.

    Each item is laid out as
    ``<s> + "<c>"CONTEXT + "<a>"ANSWER + <q> + QUESTION + </s>``.
    When the sequence is too long, only the context is truncated.
    """

    def __init__(
        self,
        examples: List[QAExample],
        tokenizer: PreTrainedTokenizerFast,
        max_sequence_length: int,
        is_train: bool = True,
    ) -> None:
        """Store the examples and look up the special-token ids.

        Args:
            examples: Examples to encode.
            tokenizer: Tokenizer providing ``tokenize`` and
                ``convert_tokens_to_ids``.
            max_sequence_length: Upper bound on the length of one item.
            is_train: When True the labels cover the whole sequence; when
                False the conditioning part is masked out with ``-100``.
        """
        self.examples = examples
        self.tokenizer = tokenizer
        self.max_sequence_length = max_sequence_length

        self.sos_token = tokenizer.convert_tokens_to_ids("<s>")
        self.eos_token = tokenizer.convert_tokens_to_ids("</s>")
        self.question_prefix_tokens = tokenizer.convert_tokens_to_ids('<q>')

        self.is_train = is_train

    def _encode(self, text: str) -> List[int]:
        """Tokenize ``text`` and map the tokens to ids."""
        return self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(text))

    def __getitem__(self, index: int) -> GPTFeaturesType:
        """Return ``(input_ids, attention_mask, labels)`` for one example."""
        example = self.examples[index]

        context_tokens = self._encode(f"<c>{example.context}")
        answer_tokens = self._encode(f"<a>{example.answer}")
        question_tokens = self._encode(f"{example.question}")

        # Everything except the context is kept whole:
        # <s> + ANSWER + <q> + QUESTION + </s>.
        fixed_len = 1 + len(answer_tokens) + 1 + len(question_tokens) + 1
        # Clamp at zero so an over-long answer/question can never turn the
        # slice bound negative (which would trim the context from the end).
        context_budget = max(self.max_sequence_length - fixed_len, 0)
        context_tokens = context_tokens[:context_budget]

        conditional_tokens = [self.sos_token] + context_tokens + answer_tokens + [self.question_prefix_tokens]
        post_tokens = question_tokens + [self.eos_token]
        input_ids = conditional_tokens + post_tokens

        if self.is_train:
            # Copy so callers mutating the labels cannot corrupt input_ids.
            labels = list(input_ids)
        else:
            # Evaluate on the question only: mask the conditioning tokens.
            labels = [-100] * len(conditional_tokens) + post_tokens
        attention_mask = [1.0] * len(input_ids)

        assert len(input_ids) <= self.max_sequence_length, (
            f"example {index} needs {len(input_ids)} tokens even without context; "
            f"limit is {self.max_sequence_length}"
        )

        return input_ids, attention_mask, labels

    def __len__(self) -> int:
        """Return the number of examples."""
        return len(self.examples)


In [6]:
def _create_logger(output_dir: str):
    """Configure the root logger to write to ``train.log`` and to tqdm.

    Safe to call repeatedly (for example when a notebook cell is re-run):
    handlers installed by a previous call are removed first, so log lines
    are not duplicated.

    Args:
        output_dir: Directory that receives ``train.log``. It is created
            when it does not exist yet.

    Returns:
        The configured root logger.
    """
    # FileHandler does not create missing directories by itself.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # Drop the handlers a previous call attached; handlers added by anyone
    # else are left alone.
    for stale in [h for h in logger.handlers if getattr(h, "_qg_owned", False)]:
        logger.removeHandler(stale)
        stale.close()

    file_handler = logging.FileHandler(os.path.join(output_dir, "train.log"))
    file_handler.setFormatter(logging.Formatter("[%(asctime)s] %(message)s"))

    handler = TqdmLoggingHandler()
    handler.setFormatter(logging.Formatter("%(asctime)s - %(message)s"))

    for owned in (file_handler, handler):
        # Tag the handler so the next call can find and replace it.
        owned._qg_owned = True
        logger.addHandler(owned)
    return logger

In [7]:
def _validate(
    model: GPT2LMHeadModel,
    dev_dataloader: DataLoader,
    device: torch.device,
    logger: logging.Logger,
    global_step: int,
):
    """Run one pass over the dev set and log mean loss and perplexity.

    The model is switched to eval mode for the pass and back to train mode
    afterwards. The reported loss is the plain mean of the per-batch losses.
    """
    model.eval()
    batch_losses = []
    progress = tqdm_notebook(dev_dataloader, desc="[EVAL]")
    for batch in progress:
        with torch.no_grad():
            input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)
            outputs = model.forward(input_ids, attention_mask=attention_mask, labels=labels, return_dict=True)
            batch_losses.append(outputs.loss.item())

    mean_loss = np.mean(batch_losses)
    perplexity = math.exp(mean_loss)
    logger.info(f"[EVAL] global_step:{global_step} loss:{mean_loss:.4f} perplexity:{perplexity:.4f}")
    model.train()

# 모델 학습

In [None]:
from transformers import GPT2Model, GPT2Config

# Build a GPT-2 LM-head model from the architecture described in config.json
# (constructed from the configuration only; no from_pretrained call here).
model = GPT2LMHeadModel(config=GPT2Config.from_json_file('config.json'))

# Keep a reference to the configuration the model was built with.
configuration = model.config

In [None]:
# Bare expression: displays the model's repr as the notebook cell output.
model

In [None]:
# Training setup: configuration, tokenizer, logging, data loaders, model,
# optimizer and learning-rate schedule.
config = QGConfig()

# Same tokenizer file and special tokens as the module-level TOKENIZER.
tokenizer = PreTrainedTokenizerFast(tokenizer_file='Tokenizer/tokenizer.json',
            bos_token=BOS, eos_token=EOS, unk_token='<unk>', 
            pad_token=PAD, mask_token=MASK)

# Log every configuration field once so the run can be reproduced.
logger = _create_logger(output_dir=config.output_dir)
logger.info("============================")
for key, value in config._asdict().items():
    logger.info(f"{key:30}:{value}")
logger.info("============================")
torch.manual_seed(config.random_seed)

# Training data: shuffled, batch size 16, padded per batch by the collate function.
logger.info("loading train dataset")
train_examples = load_korquad_dataset(config.train_dataset)
train_dataset = HistoryQGDataset(train_examples, tokenizer, config.max_sequence_length)
train_dataloader = torch.utils.data.DataLoader(train_dataset, 16, shuffle=True, collate_fn=dynamic_padding_collate_fn)

# Dev data: is_train=False, so labels cover only the question part.
logger.info("loading dev dataset")
dev_examples = load_korquad_dataset(config.dev_dataset)
dev_dataset = HistoryQGDataset(dev_examples, tokenizer, config.max_sequence_length, is_train=False)
dev_dataloader = DataLoader(dev_dataset, 16, collate_fn=dynamic_padding_collate_fn)

# Create the model from the pretrained KoGPT2 checkpoint.
model = GPT2LMHeadModel.from_pretrained('skt/kogpt2-base-v2')
# Print embedding rows next to the tokenizer vocabulary size.
# NOTE(review): the embedding table is never resized here; if the tokenizer
# holds ids beyond the printed embedding size, lookups would fail - confirm.
print(model.transformer.wte.weight.shape[0], len(tokenizer.vocab))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Adam with linear warmup over the first warmup_ratio of all steps, then
# linear decay to the end of training.
optimizer = Adam(model.parameters(), lr=config.lr)
total_steps = len(train_dataloader) * config.epochs
warmup_steps = int(total_steps * config.warmup_ratio)
scheduler = get_linear_schedule_with_warmup(optimizer, warmup_steps, total_steps)

# Training loop: one optimizer/scheduler step per batch, with periodic
# logging, validation and checkpointing keyed on the global step.
# Losses collected since the last log line; cleared after each report.
loss_list_between_log_interval = []
for epoch_id in range(config.epochs):
    for step_index, batch_data in tqdm_notebook(
            enumerate(train_dataloader), f"[TRAIN] EP:{epoch_id}", total=len(train_dataloader)
    ):
        # 1-based step counter that keeps counting across epochs.
        global_step = len(train_dataloader) * epoch_id + step_index + 1
        optimizer.zero_grad()

        token_ids, attention_mask, labels = tuple(value.to(device) for value in batch_data)
        model_outputs = model.forward(token_ids, attention_mask=attention_mask, labels=labels, return_dict=True)
        model_outputs.loss.backward()
        # Clip the gradient norm before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_clip)
        optimizer.step()
        scheduler.step()

        # for logging
        loss_list_between_log_interval.append(model_outputs.loss.item())

        # Report the mean loss (and its perplexity) since the previous report.
        if global_step % config.train_log_interval == 0:
            mean_loss = np.mean(loss_list_between_log_interval)
            logger.info(
                    f"EP:{epoch_id} global_step:{global_step} "
                    f"loss:{mean_loss:.4f} perplexity:{math.exp(mean_loss):.4f}"
                )
            loss_list_between_log_interval.clear()
            
        # _validate puts the model back into train mode when it finishes.
        if global_step % config.validation_interval == 0:
                _validate(model, dev_dataloader, device, logger, global_step)
                
        # Checkpoint: only the model's state_dict is written, named by step.
        if global_step % config.save_interval == 0:
            state_dict = model.state_dict()
            model_path = os.path.join(config.output_dir, f"gpt2_step_{global_step}.pth")
            torch.save(state_dict, model_path)

In [8]:
class QGDecodingDataset(HistoryQGDataset):
    """Decoding-time dataset: yields only the prompt that ends with ``<q>``."""

    def __getitem__(self, index: int) -> GPTDecodingInputType:
        """Return ``(input_ids, attention_mask)`` tensors for one prompt."""
        example = self.examples[index]
        tok = self.tokenizer

        context_tokens = tok.convert_tokens_to_ids(tok.tokenize(f"<c>{example.context}"))
        answer_tokens = tok.convert_tokens_to_ids(tok.tokenize(f"<a>{example.answer}"))

        # Prompt: [SOS] + CONTEXT + ANSWER + question prefix.
        prompt_len = 1 + len(context_tokens) + len(answer_tokens) + 1
        # Room kept free for the generated question + [EOS].
        reserved_len = MAX_QUESTION_SPACE + 1

        # Number of tokens by which prompt + reserved room exceed the limit;
        # the context alone is shortened by that amount.
        overflow = prompt_len + reserved_len - self.max_sequence_length
        if overflow > 0:
            context_tokens = context_tokens[: len(context_tokens) - overflow]

        input_ids = [self.sos_token, *context_tokens, *answer_tokens, self.question_prefix_tokens]
        attention_mask = [1.0] * len(input_ids)

        return torch.tensor(input_ids), torch.tensor(attention_mask)

In [11]:
from argparse import ArgumentParser

import torch
from tokenizers import SentencePieceBPETokenizer
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import GPT2LMHeadModel, GPT2Config
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel
import json

from korquad_qg.config import QGConfig
from korquad_qg.dataset import MAX_QUESTION_SPACE, MIN_QUESTION_SPACE, QAExample, dynamic_padding_collate_fn


# Special tokens. Defined before the tokenizer is built so this cell no longer
# depends on an earlier cell having defined BOS / EOS / PAD / MASK.
Q_TKN = '<q>'
A_TKN = '<a>'
BOS = '<s>'
EOS = '</s>'
MASK = '<mask>'
C_TKN = '<c>'
PAD = '<pad>'

config = QGConfig()
#args = parser.parse_args()

# Tokenizer plus the fine-tuned weights saved by the training loop.
tokenizer = PreTrainedTokenizerFast(tokenizer_file='Tokenizer/tokenizer.json',
         bos_token=BOS, eos_token=EOS, unk_token='<unk>',
         pad_token=PAD, mask_token=MASK)
model = GPT2LMHeadModel.from_pretrained('skt/kogpt2-base-v2')
# Load onto the CPU first; the model is moved to the target device below.
model.load_state_dict(torch.load('outputs/V2/gpt2_step_12000.pth', map_location="cpu"))

# Alternative setup kept for reference (different tokenizer file, model built
# from config.json, different checkpoint):
#tokenizer = PreTrainedTokenizerFast(tokenizer_file='Tokenizer/History_Korquad_Tokenizer/history_Korquad_tokenizer.json',
#            bos_token=BOS, eos_token=EOS, unk_token='<unk>', 
#            pad_token=PAD, mask_token=MASK)
#model = GPT2LMHeadModel(config=GPT2Config.from_json_file('config.json'))
#model.load_state_dict(torch.load('outputs/gpt2_step_150000.pth', map_location="cpu"))


device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Build decoding examples from the first MAX_EXAMPLES documents of as_set.json.
MAX_EXAMPLES = 20

# Decode explicitly as UTF-8 so reading does not depend on the platform's
# default encoding.
with open('as_set.json', encoding='utf-8') as f:
    data = json.load(f)

# Each document becomes the context; the answer is left empty.
examples = [QAExample(entry['doc'], "") for entry in tqdm(data['data'][:MAX_EXAMPLES])]
dataset = QGDecodingDataset(examples, tokenizer, 512)
# Batch size 1, so prompts of different lengths never need padding.
dataloader = torch.utils.data.DataLoader(dataset, 1)

model = model.to(device)
model.eval()

# Generate three candidate questions per prompt and collect them as
# (context, answer, reference question, generated question) tuples.
generated_results = []

for i, batch in tqdm(enumerate(dataloader), desc="generate", total=len(dataloader)):
    input_ids, attention_mask = (tensor.to(device) for tensor in batch)
    origin_seq_len = input_ids.size(-1)
    source = examples[i]

    # Length limits are relative to the prompt length.
    generation_kwargs = dict(
        max_length=origin_seq_len + MAX_QUESTION_SPACE,
        min_length=origin_seq_len + MIN_QUESTION_SPACE,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        bos_token_id=tokenizer.bos_token_id,
        do_sample=True,
        num_beams=4,
        repetition_penalty=1.2,
        no_repeat_ngram_size=3,
        num_return_sequences=3,
    )
    decoded_sequences = model.generate(input_ids=input_ids, **generation_kwargs)

    for decoded_tokens in decoded_sequences.tolist():
        # Decode only what was generated after the prompt, cut at the first
        # end-of-sequence marker and drop any context marker.
        continuation = tokenizer.decode(decoded_tokens[origin_seq_len:])
        question_text = continuation.split("</s>")[0].replace("<c>", "")
        generated_results.append((source.context, source.answer, source.question, question_text))
# Write one record per generated question as labelled, tab-separated lines,
# with a blank line between records. The file is encoded explicitly as UTF-8
# so the Korean text does not depend on the platform's default encoding.
with open("article_qg.tsv", "w", encoding="utf-8") as f:
    for context, answer, question, generated_question in tqdm(generated_results):
        print(generated_question)
        f.write(f"문맥\t{context}\n")
        f.write(f"답변\t{answer}\n")
        f.write(f"생성된 질문\t{generated_question}\n")
        # The reference question is only present for labelled examples.
        if question is not None:
            f.write(f"실제 질문\t{question}\n")
        f.write("\n")

  0%|          | 19/110810 [00:00<00:01, 67080.62it/s]
generate: 100%|██████████| 20/20 [00:07<00:00,  2.70it/s]
100%|██████████| 60/60 [00:00<00:00, 11921.28it/s]

가군이 장안성에 도읍을 옮긴 시기는?
가군이 장안성에 도읍을 옮긴 시기는?
가군이 쌓은 성은?
가군이 성내의 성벽축조를 감독한 곳은?
가군이 장안성 내에서 쌓은 성은?
가군이 쌓은 성은?
가군이 장안성을 축성할 때 사용한 관등은?
가군이 쌓은 성벽은 언제 쌓았는가?
가군이 장안성 축조시에 무엇을 하였나?
가군이 장안성 축조 시 사용한 관등은?
가군이 장안성에 쌓은 성벽은?
가군이 성 안에 세운 관등은?
가군이 쌓은 성문은?
가군의 소속은?
가군이 장안성 축조시 사용한 관등은?
가귀는 화엄경의강과 심원장은 어디인가?
가귀가 화엄경에서 화엄경을 알았던 것은?
가귀가 화엄경에서 화엄경의, 심원장 등을 지은 것은?
가귀가 화엄경의강에서 화엄경을 해설한 책은?
가귀가 화엄경의강에서 화엄경을 알았던 것은?
가귀는 화엄경의강을 저술한 승려인가?
가귀가 화엄경의강을 지은 것은?
가귀가 화엄경에서 화엄경을 해설한 책은?
가귀가 화엄경의, 화엄경의, 심원장은 어디인가?
가귀는 화엄경의강을 누구의 제자인가?
가귀가 화엄경의강에서 화엄경을 알았던 것은?
가귀가 화엄경의강에서 무엇을 알았는가?
가귀는 화엄경의강에서 무엇을 알았는가?
가귀가 화엄경의강을 저술한 승려는?
가귀가 화엄경의강에서 화엄경을 해설한 책은?
가야지방 유이민은 어느 지역에 거주하였는가?
가라포고이의 이름은 무엇인가?
가라포고이의 본명은?
가야지역의 유이민으로 이름이 붙여진 나라는?
가야지방 유이민은 어느 지역에 살았는가?
가야지방 유이민의 이름은 무엇인가?
가야지방의 유민으로 이름이 붙여진 나라는?
가야의 유민들 중 가가야로 알려진 사람은?
가야지방 유이민의 이름은 무엇인가?
가야의 유민 중 신라인과 왜 사이에서 정상적인 통교되지 않은 사람은?
가야지방 유이민은 어느 지역에 거주하였는가?
신라에서 귀화한 신라인의 이름은 무엇인가?
가야의 유민으로 알려진 신라인은?
가야지방 유이민의 이름은 무엇인가?
백제 유민인 가라포고이의 이름은 무엇인가?
백제 부흥운동 때 유민들이 세운 고구려의 사신의 이름은?
고구려 멸망후 백


