In [None]:
# Install the runtime dependencies of this notebook with shell commands.
# First command chain: trace the commands (set -x), install konlpy, then download
# konlpy's scripts/mecab.sh from GitHub and execute it with bash.
# NOTE(review): the installer script is piped from the network straight into bash and
# none of the packages below are version-pinned — consider pinning for reproducibility.
!set -x \
&& pip install konlpy \
&& curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh | bash -x
# py-hanspell is installed directly from its GitHub repository.
!pip install git+https://github.com/ssut/py-hanspell
# Remaining libraries come from the package index.
!pip install transformers datasets wandb sentencepiece

In [None]:
import os
from google.colab import drive

# Mount Google Drive and make the project folder the working directory, so the
# relative paths used by later cells (./RawData, ./Model, ./Submission) resolve inside it.
drive.mount("/content/drive")
os.chdir("/content/drive/MyDrive/NLP_Project_2")

# NOTE(review): presumably set so cuBLAS can run deterministically on GPU — confirm.
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"

# Run identifier: reused as the wandb run name and as the folder name under ./Model/
# when the fine-tuned model is saved.
fine_tuned_model_name = "write_your_model_name"

In [None]:
# Authenticate this runtime through the wandb command-line tool.
!wandb login

import wandb
# Start the wandb run; the run is named after fine_tuned_model_name.
wandb.init(project = "Goorm_2nd_project", entity = "2nd_group", name = fine_tuned_model_name)

In [None]:
import pandas as pd
import json
import torch
import datasets
import numpy as np
import random
import nltk
import re
import torch.nn.functional as F
import utils
import seaborn as sns

from glob import glob
from konlpy.tag import Mecab
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, DataCollatorWithPadding, get_cosine_schedule_with_warmup
from transformers import AutoModelForQuestionAnswering
from collections import defaultdict, deque
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Experiment-wide constants: the random seed and the pretrained checkpoint to fine-tune.
SEED = 20220803
BACKBONE = "kykim/bert-kor-base"

# cuDNN flags: deterministic mode on, benchmark mode off.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Seed every random number generator used by the notebook with the same value.
for _seed_fn in (torch.manual_seed,
                 torch.cuda.manual_seed,
                 torch.cuda.manual_seed_all,
                 np.random.seed,
                 random.seed) :
    _seed_fn(SEED)

# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available() :
    device = torch.device("cuda:0")
    torch.cuda.set_device(device)
else :
    device = torch.device("cpu")
print(device)

# Tokenizer of the backbone (case preserved) and the Mecab tagger used by the utils helpers.
tokenizer = AutoTokenizer.from_pretrained(BACKBONE, do_lower_case = False)
tagger = Mecab()

In [None]:
# Load the raw training set and keep exactly one row per guid: the row whose
# answer string is the longest.
original_train = utils.load_data("./RawData/train.json", do_preprocessing = True)

# A fresh positional index guarantees that the labels returned by idxmax are unique.
_loaded_train = original_train.reset_index(drop = True)

# Missing answers get length -1 so they lose against any real answer, mirroring the
# NaN-last ordering of a descending sort on the answer length.
_answer_length = _loaded_train["answer"].str.len().fillna(-1)

# One grouped pass replaces a boolean mask + sort per guid (O(rows * guids)).
# sort = False keeps guids in order of first appearance; idxmax returns the first
# row among ties, so the selected rows come out in a deterministic order.
_longest_rows = _answer_length.groupby(_loaded_train["guid"], sort = False).idxmax()

original_train = (
    _loaded_train.loc[_longest_rows]
    .reset_index(drop = True)
    .assign(source = "kaggle")          # tag the origin of these rows
    .drop("guid", axis = "columns")     # guid is not needed after de-duplication
)

In [None]:
# Batch size of the training loader; the validation loader below uses twice this value.
batch_size = 16
# Collator built from the tokenizer, configured to return PyTorch ("pt") tensors.
collator = DataCollatorWithPadding(tokenizer, return_tensors = "pt")

# Hold out 30% of the rows for validation; the split is seeded with SEED.
train_pd, valid_pd = train_test_split(original_train, random_state = SEED, test_size = .3)

# NOTE(review): the meaning of the trailing positional True is defined in
# utils.get_dataset, which is not visible in this notebook — confirm there.
train_data = utils.get_dataset(train_pd.reset_index(drop = True), tokenizer, collator, batch_size, True)
valid_data = utils.get_dataset(valid_pd.reset_index(drop = True), tokenizer, collator, batch_size * 2, True)

In [None]:
# Optimisation hyper-parameters (also reported to wandb in a later cell).
learning_rate = 1e-5
epochs = 3

# Question-answering model on top of the pretrained backbone, put in training
# mode and moved to the selected device.
model = AutoModelForQuestionAnswering.from_pretrained(BACKBONE)
model.train()
model = model.to(device)

optimizer = torch.optim.AdamW(
    model.parameters(),
    lr = learning_rate,
    eps = 1e-6,
    weight_decay = 0.02,
)

# Cosine schedule spanning every optimizer step of the run (batches per epoch
# times epochs); the first 6% of those steps are warm-up.
_total_steps = len(train_data) * epochs
lr_scheduler = get_cosine_schedule_with_warmup(
    optimizer = optimizer,
    num_warmup_steps = int(_total_steps * 0.06),
    num_training_steps = _total_steps,
)

In [11]:
# Attach the hyper-parameters of this run to the wandb run configuration.
wandb_config = dict(
    learning_rate = learning_rate,
    batch_size = batch_size,
    backbone = BACKBONE,
    epochs = epochs,
)

wandb.config.update(wandb_config)

In [None]:
# Fine-tuning loop: one training pass and one validation pass per epoch.
# Running averages are shown on the progress bar; training metrics are logged to
# wandb every step, validation metrics once per epoch.

# The notebook falls back to the CPU when CUDA is missing, so mixed precision is
# enabled only on CUDA. With enabled = False the scaler and autocast are pass-throughs.
use_amp = device.type == "cuda"
scaler = torch.cuda.amp.GradScaler(enabled = use_amp)

wandb.watch(model, log = "all", log_freq = 10)

_METRICS = ("loss", "dist", "start_acc", "end_acc")
_WINDOW = 20  # number of most recent batches averaged for the progress bar / per-step logs


def _mean(values) :
    """Arithmetic mean of a non-empty sequence of numbers."""
    return sum(values) / len(values)


def _new_trackers() :
    """Return (window, history): per-metric rolling deques and per-metric full-epoch lists."""
    window = {name : deque(maxlen = _WINDOW) for name in _METRICS}
    history = {name : [] for name in _METRICS}
    return window, history


def _averages(tracker, prefix = "") :
    """Map each (optionally prefixed) metric name to the mean of its tracked values."""
    return {prefix + name : _mean(tracker[name]) for name in _METRICS}


def _forward(batch) :
    """Run the model on one batch under autocast.

    Returns (inputs, golden, start_logits, end_logits, loss), where `inputs` is the batch
    moved to `device` without its "golden_answer" entry and `golden` is that entry.
    """
    golden = batch["golden_answer"]
    inputs = {k : v.to(device) for k, v in batch.items() if k != "golden_answer"}

    with torch.cuda.amp.autocast(enabled = use_amp) :
        outputs = model(**inputs)
        start_logits = outputs["start_logits"]
        end_logits = outputs["end_logits"]
        loss = utils.weighted_loss_fn(start_logits, end_logits, inputs["start_positions"], inputs["end_positions"], .3, .7)
    return inputs, golden, start_logits, end_logits, loss


def _track(window, history, loss, inputs, golden, start_logits, end_logits) :
    """Compute the batch metrics and append them to both trackers."""
    start_acc, end_acc = utils.extract_accuracy(start_logits, end_logits, inputs["start_positions"], inputs["end_positions"])
    dist = utils.levenshtein_distance(start_logits, end_logits, golden, inputs["input_ids"], tokenizer, tagger, threshold = 0)

    values = {"loss" : float(loss),
              "dist" : float(dist),
              "start_acc" : float(start_acc),
              "end_acc" : float(end_acc)}
    for name, value in values.items() :
        window[name].append(value)
        history[name].append(value)


def _report(split, history) :
    """Print the epoch-level mean of every metric for the given split name."""
    print(f"{split} loss : ", _mean(history["loss"]))
    print(f"{split} dist : ", _mean(history["dist"]))
    print(f"{split} start acc :", _mean(history["start_acc"]))
    print(f"{split} end acc :", _mean(history["end_acc"]))


for epoch in range(epochs) :
    # ---------------------------- training pass ----------------------------
    window, history = _new_trackers()

    model.train()
    with tqdm(train_data, unit = " batch") as tepoch :
        tepoch.set_description(f"Train Epoch {epoch}")
        for i, batch in enumerate(tepoch) :
            optimizer.zero_grad()

            inputs, golden, start_logits, end_logits, loss = _forward(batch)

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            lr_scheduler.step()

            _track(window, history, loss, inputs, golden, start_logits, end_logits)

            tepoch.set_postfix(**_averages(window))

            # The learning rate is read after the scheduler step, straight from the
            # optimizer's parameter group (no need to build a full state_dict).
            wandb.log({**_averages(window, "train_"),
                       "lr" : optimizer.param_groups[0]["lr"],
                       "train_step" : i + (len(train_data) * epoch)})

        _report("Train", history)

    # --------------------------- validation pass ---------------------------
    window, history = _new_trackers()

    model.eval()
    with tqdm(valid_data, unit = " batch") as tepoch, torch.no_grad() :
        tepoch.set_description(f"Valid Epoch {epoch}")
        for batch in tepoch :
            inputs, golden, start_logits, end_logits, loss = _forward(batch)
            _track(window, history, loss, inputs, golden, start_logits, end_logits)
            tepoch.set_postfix(**_averages(window))

    # Validation is logged once per epoch, using means over the whole pass.
    wandb.log({**_averages(history, "valid_"),
               "valid_step" : epoch + 1})

    _report("Valid", history)

In [None]:
# Write the fine-tuned model into a folder named after the run, under ./Model/.
model.save_pretrained(f"./Model/{fine_tuned_model_name}")

In [None]:
def get_scores_in_dataset(model, input_data, tokenizer, tagger, golden_answer = False) :
    """Run the model over every batch of `input_data` and collect per-example predictions.

    Args:
        model: called as model(**batch); must return a mapping that contains
            "start_logits" and "end_logits". It is switched to eval mode and left there.
        input_data: iterable of batches (dicts of tensors). A "golden_answer" entry is
            never forwarded to the model.
        tokenizer: handed to utils.inference and used to decode the golden answers.
        tagger: handed through to utils.inference.
        golden_answer: when True, every batch must contain "golden_answer" and the
            decoded reference answers are collected as well.

    Returns:
        (inferenced, scores, answers). The first two hold, per example and in iteration
        order, the pair returned by utils.inference. `answers` holds the decoded golden
        answers, or stays empty when golden_answer is False.

    Note:
        Tensors are moved to the notebook-level `device`.
    """
    inferenced, scores, answers = [], [], []

    with tqdm(input_data, unit = " batch") as tepoch, torch.no_grad() :
        model.eval()
        tepoch.set_description("Score")
        for batch in tepoch :
            # Bound on every iteration so the name is always defined.
            g_answer = batch["golden_answer"] if golden_answer else None
            inputs = {k : v.to(device) for k, v in batch.items() if k != "golden_answer"}

            with torch.cuda.amp.autocast() :
                outputs = model(**inputs)
                start_logits = outputs["start_logits"]
                end_logits = outputs["end_logits"]

            for idx, input_ids in enumerate(inputs["input_ids"]) :
                # NOTE(review): 20 is forwarded verbatim to utils.inference; its meaning
                # is defined there and is not visible in this notebook.
                infer, score = utils.inference(start_logits[idx], end_logits[idx], input_ids, 20, tokenizer, tagger)
                inferenced.append(infer)
                scores.append(score)
                if golden_answer :
                    answers.append(tokenizer.decode(g_answer[idx], skip_special_tokens = True))
    return inferenced, scores, answers

In [None]:
# Predictions, scores and decoded golden answers for every example of the training loader.
train_inferenced, train_scores, train_answers = get_scores_in_dataset(model, train_data, tokenizer, tagger, True)

In [None]:
# Table pairing each training prediction with its reference answer.
compare = pd.DataFrame({"original_inference" : train_inferenced,
                        "answer" : train_answers})

# Extra column: the prediction after utils.remove_postposition has been applied.
compare.loc[:, "cleaned_inference"] = compare.original_inference.apply(utils.remove_postposition)

# Edit distance to the reference answer, without and with the clean-up step.
before = [nltk.edit_distance(predicted, reference)
          for predicted, reference in zip(compare["original_inference"], compare["answer"])]
after = [nltk.edit_distance(predicted, reference)
         for predicted, reference in zip(compare["cleaned_inference"], compare["answer"])]

# The printed labels read "mean edit distance before / after the fix".
print("수정 전 평균 편집거리 :", sum(before) / len(before))
print("수정 후 평균 편집거리 :", sum(after) / len(after))

In [None]:
# Predictions, scores and decoded golden answers for every example of the validation loader.
valid_inferenced, valid_scores, valid_answers = get_scores_in_dataset(model, valid_data, tokenizer, tagger, True)

In [None]:
# Distribution of the scores collected for the validation split.
# sns.distplot is deprecated; a histogram on a density axis with a KDE overlay is the
# documented replacement. The scores are converted to a NumPy array before plotting.
_valid_score_values = torch.tensor(valid_scores).cpu().numpy()
ax = sns.histplot(x = _valid_score_values, kde = True, stat = "density")
ax.set(xlabel = "score", ylabel = "density", title = "Validation score distribution");

In [None]:
# Fill the "Predicted" column of the baseline submission file and write the result as CSV.
submission = pd.read_csv("./RawData/baseline.csv")
# NOTE(review): `inferenced` is not assigned at notebook level in any visible cell (it only
# exists as a local inside get_scores_in_dataset), so on a fresh kernel this line raises
# NameError unless an unseen cell defines it — TODO confirm which predictions belong here.
submission.loc[:, "Predicted"] = inferenced
submission.to_csv("./Submission/Submission_4.csv", index = False)