In [48]:
import numpy as np
import pandas as pd
import random

import torch
import numpy as np
from tqdm.notebook import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, AdamW
from transformers import AutoModel, AutoTokenizer

model = AutoModel.from_pretrained("klue/bert-base")
tokenizer = AutoTokenizer.from_pretrained("klue/bert-base")


In [293]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

# 데이터프레임 로드
df = pd.read_csv('open/train.csv')

# 토크나이저 설정
from transformers import AutoModel, AutoTokenizer

model = AutoModel.from_pretrained("klue/bert-base").to(device)
tokenizer = AutoTokenizer.from_pretrained("klue/bert-base")

In [316]:
def read_klue(path):
    klue_dict = pd.read_csv(path)

    contexts = []
    questions = []
    answers = []
    
    
    for group in range(klue_dict['id'].count()):
        context = klue_dict['context'][group]
        question = klue_dict['question'][group]
        answer = klue_dict['answer'][group]
        contexts.append(context)
        questions.append(question)
        answers.append(answer)
  

    return contexts, questions, answers

In [327]:
contexts[1].find(answers[1])

27

In [318]:
def add_end_idx(answers, contexts):
    start_positions = []
    end_positions = []
    for i in range(len(answers)):
        
        answer = answers[i]
        context = contexts[i]
        
        # 정답이 context 내에서 시작하는 위치를 찾음
        start_idx = context.find(answer)
        end_idx = start_idx + len(answer)
        if start_idx == -1:
            # 정답이 context 내에 없는 경우
            start_positions.append(0)
            end_positions.append(0)
        else :
            # 정답이 context 내에 있는 경우
            start_positions.append(start_idx)
            end_positions.append(end_idx-1)
    return start_positions, end_positions

            

In [319]:
class KlueDataset(Dataset):
    def __init__(self, contexts, questions, answers,answer_start, answer_end, model_max_position_embedings, tokenizer):
        self.tokenizer = tokenizer
        self.answers = answers
        self.questions = questions
        self.contexts = contexts
        self.answer_start = answer_start
        self.answer_end = answer_end
        self.model_max_position_embedings = model_max_position_embedings
        print("Tokenizing ...")
        self.encodings = self.tokenizer(self.contexts, 
                                        self.questions,
                                        max_length=512,
                                        truncation=True,
                                        padding="max_length",
                                        return_token_type_ids=False)
        print("Done !!!")
        self.add_token_positions()
        
    def add_token_positions(self):
        start_positions = []
        end_positions = []
        for i in range(len(self.answers)):
            start_positions.append(self.encodings.char_to_token(i, self.answer_start[i]))
            end_positions.append(self.encodings.char_to_token(i, self.answer_end[i]))

            # positions 값이 None 값이라면, answer가 포함된 context가 잘렸다는 의미
            if start_positions[-1] is None:
                start_positions[-1] = self.model_max_position_embedings
            if end_positions[-1] is None:
                end_positions[-1] = self.model_max_position_embedings

        self.encodings.update({'start_positions': start_positions, 'end_positions': end_positions})


        
    def get_data(self):
        return {"contexts":self.contexts, 'questions':self.questions, 'answers':self.answers}
    
    
    def get_encodings(self):
        return self.encodings
        
    
    def __getitem__(self, idx):
        return {key:torch.tensor(val[idx]) for key, val in self.encodings.items()}
    
    def __len__(self):
        return len(self.encodings['input_ids'])

In [320]:
contexts, questions, answers = read_klue("open/train.csv")
start_positions, end_positions = add_end_idx(answers, contexts)
train_dataset = KlueDataset(contexts, questions, answers,start_positions, end_positions, 512, tokenizer)


Tokenizing ...
Done !!!


In [321]:
EPOCH = 3
LEARNING_RATE = 5e-5
BATCH_SIZE = 8

In [322]:
def train_runner(model, dataset, batch_size, num_train_epochs, learning_rate):
    device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

    
    model.to(device)
    model.train()
    train_dataloader = DataLoader(dataset=dataset, batch_size=batch_size)
    global_total_step = len(train_dataloader) * num_train_epochs
    optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=0)
    print("TRAIN START")
    with tqdm(total=global_total_step, unit='step') as t:
        total = 0
        total_loss = 0
        for epoch in range(num_train_epochs):
            for batch in train_dataloader:
                optimizer.zero_grad()
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                start_positions = batch['start_positions'].to(device)
                end_positions = batch['end_positions'].to(device)
                outputs = model(input_ids,
                             attention_mask=attention_mask,
                             start_positions=start_positions,
                             end_positions=end_positions)
                loss = outputs.loss
                loss.backward()
                optimizer.step()
                
                batch_loss = loss.item() * len(input_ids)
                total += len(input_ids)
                total_loss += batch_loss
                global_total_step += 1
                t.set_postfix(loss="{:.6f}".format(total_loss / total), batch_loss="{:.6f}".format(batch_loss))
                t.update(1)
                
                del input_ids
                del attention_mask
                del start_positions
                del end_positions
                del outputs
                del loss
    model.save_pretrained("./klue_output_model")
    print("TRAIN END")

In [328]:
train_runner(model,train_dataset, BATCH_SIZE, EPOCH, LEARNING_RATE)

Exception ignored in: <function tqdm.__del__ at 0x1069dbee0>
Traceback (most recent call last):
  File "/Users/gifzif4/Desktop/code/.venv/lib/python3.9/site-packages/tqdm/std.py", line 1148, in __del__
    self.close()
  File "/Users/gifzif4/Desktop/code/.venv/lib/python3.9/site-packages/tqdm/notebook.py", line 279, in close
    self.disp(bar_style='danger', check_delay=False)
AttributeError: 'tqdm_notebook' object has no attribute 'disp'


TRAIN START


ImportError: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html