In [1]:
import torch
from transformers import AutoModelForQuestionAnswering, BertForQuestionAnswering
from transformers import BertTokenizer
from tokenization_kobert import KoBertTokenizer
import pandas as pd
import csv
from tqdm import tqdm_notebook

In [2]:
# DEFAULT_PATH = '/home/fhdufhdu/vscode/Project/data/models/model_bert'
# DEFAULT_PATH = '/home/fhdufhdu/vscode/Project/data/models/finetunedModel'
DEFAULT_PATH = '/home/fhdufhdu/vscode/Project/KoreanHistoryProject/qa_model/models'


class KoHisQnA:
    def __init__(self, model_path=DEFAULT_PATH, tokenizer_path=DEFAULT_PATH):
        self.change_path(model_path, tokenizer_path)

    def change_path(self, model_path, tokenizer_path) -> None:
        self.model_path = model_path
        self.tokenizer_path = tokenizer_path

        try:
            del self.model, self.tokenizer
        except:
            print('...initailizing...')

        self.model = BertForQuestionAnswering.from_pretrained(self.model_path)
        self.tokenizer = BertTokenizer.from_pretrained(self.tokenizer_path)

    def do_ask_to_model(self, question, context, add_special_tokens=True, return_tensors='pt') -> tuple:
        inputs = self.tokenizer.encode_plus(
            question, context, add_special_tokens=add_special_tokens, return_tensors=return_tensors)

        # 모델에 데이터 집어넣기
        answer_start_vector, answer_end_vector = self.model(**inputs)
        as_idx = torch.argmax(answer_start_vector)
        ae_idx = torch.argmax(answer_end_vector) + 1

        # 정답을 구하기 위한 과기
        input_ids = inputs["input_ids"].tolist()[0]
        text_tokens = self.tokenizer.convert_ids_to_tokens(input_ids)
        answer = self.tokenizer.convert_tokens_to_string(
            text_tokens[as_idx:ae_idx])
        #print(text_tokens[as_idx:ae_idx])

        return (int(as_idx), int(ae_idx), answer)

# qa = KoHisQnA()
# print(qa.do_ask_to_model('조선시대 4대 왕은?', '세종은 조선전기 제4대 왕. 세종은 재위 1418∼1450. 본관은 전주. 이름은 이도, 자는 원정. 태종의 셋째아들이며, 어머니는 원경왕후 민씨이다. 비는 심온의 딸 소헌왕후이다.1408년 충녕군에 봉해지고, 1412년 충녕대군에 진봉되었으며, 1418년 6월 왕세자에 책봉되었다가 같은 해 8월에 태종의 양위를 받아 즉위하였다.'))


In [3]:
qa = KoHisQnA()

generated_quiz= pd.read_csv('/home/hsoh0423/vscode/HistoryQA/Generated_quiz_V3.csv')

context = []
question = []
start_index = []
last_index = []
answers = []

count = 0
except_count = 0
correct_count = 0

print("total data num: ", len(generated_quiz['question']))
for i in tqdm_notebook(range(len(generated_quiz['question'])))      :
    try:
        answer = qa.do_ask_to_model(generated_quiz['question'][i], generated_quiz['context'][i])
        if(answer[2] == generated_quiz['answers'][i]):
            context.append(generated_quiz['context'][i])
            question.append(generated_quiz['question'][i])
            start_index.append(answer[0])
            last_index.append(answer[1])
            answers.append(answer[2])
            correct_count += 1
    except:
            except_count += 1
    count += 1
    if(count % 1000 == 0):
        print("Correct count: ", correct_count, "Except count: ", except_count)

data = {
    'context': context,
    'question': question,
    'start_index': start_index,
    'last_index': last_index,
    'answers': answers
}
df = pd.DataFrame(data)
df.to_csv('Quiz_Data_Set.csv',index=False)
print("done")

...initailizing...
total data num:  60501


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  app.launch_new_instance()


  0%|          | 0/60501 [00:00<?, ?it/s]

Correct count:  92 Except count:  250
Correct count:  175 Except count:  434
Correct count:  273 Except count:  620
Correct count:  375 Except count:  858
Correct count:  446 Except count:  1116
Correct count:  528 Except count:  1392
Correct count:  636 Except count:  1612
Correct count:  725 Except count:  1867
Correct count:  807 Except count:  2103
Correct count:  895 Except count:  2374
Correct count:  985 Except count:  2624
Correct count:  1073 Except count:  2862
Correct count:  1150 Except count:  3048
Correct count:  1244 Except count:  3249
Correct count:  1346 Except count:  3433
Correct count:  1460 Except count:  3592
Correct count:  1543 Except count:  3797
Correct count:  1657 Except count:  3989
Correct count:  1754 Except count:  4205
Correct count:  1845 Except count:  4376
Correct count:  1940 Except count:  4552
Correct count:  2038 Except count:  4726
Correct count:  2135 Except count:  4919
Correct count:  2241 Except count:  5108
Correct count:  2309 Except coun