In [None]:
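# Install the required libraries.
# KoNLPy's Mecab morphological analyzer needs an extra installation step from an external source, so this takes a little longer than installing a package distributed through pip.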

!set -x \
&& pip install konlpy \
&& curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh | bash -x

!pip install transformers datasets wandb

+ pip install konlpy
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
+ bash -x
+ curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh
+ mecab_dicdir=/usr/local/lib/mecab/dic/mecab-ko-dic
+ set -e
++ uname
+ os=Linux
+ [[ ! Linux == \L\i\n\u\x ]]
+ hash sudo
+ sudo=sudo
+ python=python3
+ hash pyenv
+ at_user_site=
++ check_python_site_location_is_writable
++ python3 -
+ [[ 1 == \0 ]]
+ hash automake
+ hash mecab
+ echo 'mecab-ko is already installed'
mecab-ko is already installed
+ [[ -d /usr/local/lib/mecab/dic/mecab-ko-dic ]]
+ echo 'mecab-ko-dic is already installed'
mecab-ko-dic is already installed
++ python3 -c 'import pkgutil; print(1 if pkgutil.find_loader("MeCab") else 0)'
+ [[ 1 == \1 ]]
+ echo 'mecab-python is already installed'
mecab-python is already installed
+ echo Done.
Done.
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import os
from google.colab import drive

# Mount Google Drive and switch into the project folder; the relative paths
# used later in this notebook (e.g. "./RawData/train.json") resolve from here.
drive.mount("/content/drive")
os.chdir("/content/drive/MyDrive/WandB_Sweep_test")

# NOTE(review): presumably set so cuBLAS can behave deterministically together
# with the seeding done in a later cell — confirm against the PyTorch
# reproducibility notes.
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"

# Name for the fine-tuned model. It is not referenced anywhere else in the
# visible part of this notebook.
fine_tuned_model_name = "Context_Sampling_Jungmin_v1.4"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!wandb login

import wandb

[34m[1mwandb[0m: Currently logged in as: [33mjmp0813[0m ([33m2nd_group[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
import pandas as pd
import json
import torch
import datasets
import numpy as np
import random
import nltk
import re
import torch.nn.functional as F
import utils

from glob import glob
from konlpy.tag import Mecab
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, DataCollatorWithPadding, get_cosine_schedule_with_warmup
from transformers import AutoModelForQuestionAnswering
from collections import defaultdict, deque
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Single seed shared by every random number generator below; run_sweep also
# passes it as random_state to the train/validation split.
SEED = 20220803

# cuDNN settings chosen for reproducibility (deterministic mode on, benchmark
# mode off).
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
# Seed PyTorch (CPU and CUDA), NumPy and Python's random module.
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
np.random.seed(SEED)
random.seed(SEED)

# Use the first GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
if "cuda" in DEVICE.type :
    torch.cuda.set_device(DEVICE)
print(DEVICE)

# Mecab tagger instance created once and shared by the sweep functions below.
tagger = Mecab()

cuda:0


In [None]:
# To run a sweep, everything we previously had in the notebook is turned into functions.
# Everything from loading the data through the training loop has been wrapped in functions.

In [None]:
# Definition of the hyperparameters / arguments to search over.
# The sweep walks through the entries below and trains one model per sampled configuration.

sweep_config = {
    "name": "bert_sweep",  # Name displayed for the sweep.
    "metric": {
        "name": "loss",
        "goal": "minimize",
    },
    "method": "random",  # Search strategy. Options such as grid and bayes also exist.
    "parameters": {
        # Optimizers to try. RMSprop was added arbitrarily to check that other
        # optimizers can be used as well. To add another optimizer, register it
        # in the sweep_optimizer function below and then list it here.
        "optimizer": {
            "values": ["AdamW", "RMSprop"],
        },
        # Names of the pretrained models to try.
        "backbone": {
            "values": ["kykim/bert-kor-base", "kykim/electra-kor-base"],
        },
        # Number of epochs.
        "epochs": {
            "values": [3, 5],
        },
        # Whether to use learning-rate scheduling or not.
        "use_lr_scheduler": {
            "values": [True, False],
        },
        # The end index of the answer tokens was not predicted well, so the idea
        # of a weighted loss came up. This entry searches for that weight.
        "start_logits_loss_weight": {
            "min": 0.1,
            "max": 0.5,
        },
        # When learning-rate scheduling is used, the fraction of all training
        # steps spent reaching the peak learning rate.
        "lr_scheduler_warmup_ratio": {
            "min": 0.02,
            "max": 0.4,
        },
        "learning_rate": {
            "min": 1e-6,
            "max": 1e-4,
        },
        # Batch size. Anything above 16 runs out of memory (OOM), so keep that in mind!
        "batch_size": {
            "values": [8, 16],
        },
    },
}


In [None]:
def run_sweep(config = None):
    """
    Train one model with the parameters W&B picked from sweep_config.

    This function bundles the whole pipeline, from loading the data through training.
    Each pretrained model has its own tokenizer, and the collator takes the tokenizer
    as an argument, so the data is rebuilt on every sweep run.
    In effect, one sweep run is equivalent to re-running baseline.ipynb once.

    Argument
    config : forwarded to wandb.init; the values actually used are read back from wandb.config.
    """

    with wandb.init(config=config) :
        params = wandb.config

        # Pretrained model and tokenizer for this sweep run.
        backbone = params.backbone
        tokenizer = AutoTokenizer.from_pretrained(backbone, do_lower_case = False)
        collator = DataCollatorWithPadding(tokenizer, return_tensors = "pt")
        model = create_model(backbone).to(DEVICE)

        # Data for this sweep run.
        frame = utils.load_data("./RawData/train.json")
        frame, flagged_rows = utils.sampling_context_with_cosine_similarity(frame, tokenizer, tagger)
        # Rows reported by the sampling step keep their place in the data but get
        # both answer positions reset to 0.
        frame.loc[flagged_rows, "answer_start"] = 0
        frame.loc[flagged_rows, "answer_end"] = 0
        train_frame, valid_frame = train_test_split(frame, test_size = 0.3, random_state = SEED)
        train_data, valid_data = sweep_dataset(train_frame, valid_frame, tokenizer, collator, params.batch_size)

        # Optimizer and (optionally) learning-rate scheduler for this sweep run.
        # The scheduler stays None when the sampled config disables it.
        optimizer = sweep_optimizer(model, params.optimizer, params.learning_rate)
        if params.use_lr_scheduler :
            total_steps = len(train_data) * params.epochs
            lr_scheduler = sweep_lr_scheduler(optimizer, total_steps, params.lr_scheduler_warmup_ratio)
        else :
            lr_scheduler = None

        # One training pass followed by one validation pass per epoch.
        for epoch in range(params.epochs):
            train_epoch(train_data, model, epoch, optimizer, DEVICE, params.start_logits_loss_weight, tokenizer, tagger, lr_scheduler)
            valid_loss, valid_dist, _, _ = valid_epoch(valid_data, model, epoch, DEVICE, params.start_logits_loss_weight, tokenizer)
            # "loss" is the metric name the sweep is configured to minimize.
            wandb.log({"distance": valid_dist,
                       "loss" : valid_loss})

In [1]:
def sweep_dataset(train, valid, tokenizer, collator, batch_size) :
    """
    Build the training and validation data used by the current sweep run.

    Argument
    train : pandas DataFrame parsed from the json, the part set aside for training.
    valid : pandas DataFrame parsed from the json, the part set aside for validation.
    tokenizer : tokenizer of the pretrained model used in the current sweep run.
    collator : collator forwarded unchanged to utils.get_dataset.
    batch_size : batch size chosen by the config for the current sweep run; validation uses twice this value.

    Returns a (train, valid) pair of whatever utils.get_dataset produces.
    """
    train_set = utils.get_dataset(train, tokenizer, collator, batch_size, True)
    valid_set = utils.get_dataset(valid, tokenizer, collator, batch_size * 2, True)
    return train_set, valid_set

def sweep_optimizer(input_model, optimizer, learning_rate) :
    """
    Return the optimizer used by the current sweep run.

    Argument
    input_model : the pretrained model; its parameters are handed to the optimizer.
    optimizer : name of the optimizer chosen by the config, as a string ("AdamW" or "RMSprop").
    learning_rate : learning rate chosen by the config.

    Raises
    ValueError : if optimizer is not one of the supported names.
    """
    if optimizer == "AdamW" :
        return torch.optim.AdamW(input_model.parameters(), lr = learning_rate, eps = 1e-6, weight_decay = 0.02)
    if optimizer == "RMSprop" :
        return torch.optim.RMSprop(input_model.parameters(), lr = learning_rate, weight_decay = 0.02)
    # An unknown name used to fall through and be returned as a plain string,
    # which only failed later inside the training loop. Fail here instead.
    raise ValueError(f"Unsupported optimizer: {optimizer!r}. Expected 'AdamW' or 'RMSprop'.")

def sweep_lr_scheduler(optimizer, train_step, warmup_ratio) :
    """
    Build the learning-rate scheduler for a sweep run that uses one.

    Argument
    optimizer : the optimizer obtained from sweep_optimizer.
    train_step : total number of training steps (the caller passes len(train_data) * epochs).
    warmup_ratio : fraction of train_step during which the learning rate is increased; the step count is truncated to an int.
    """
    warmup_steps = int(train_step * warmup_ratio)
    return get_cosine_schedule_with_warmup(optimizer = optimizer,
                                           num_warmup_steps = warmup_steps,
                                           num_training_steps = train_step)


def create_model(backbone) :
    """
    Load a question-answering model from the given pretrained checkpoint name
    and return it switched to training mode.
    """
    qa_model = AutoModelForQuestionAnswering.from_pretrained(backbone)
    qa_model.train()
    return qa_model

In [None]:
def train_epoch(train_data, model, epoch, optimizer, device, start_weight, tokenizer, tagger, lr_scheduler = None) :
    """
    Run one training epoch.

    Called once per epoch of the current sweep run. Everything that can differ
    between runs is passed in as an argument.

    Argument
    train_data : sized iterable of batches; each batch is a dict whose values have a .to(device) method and which contains "start_positions", "end_positions" and "input_ids".
    model : model called as model(**batch); its output is indexed by "start_logits" and "end_logits".
    epoch : zero-based epoch index, used for the progress-bar label and the logged "train_step".
    optimizer : optimizer that is stepped once per batch.
    device : device the batch tensors are moved to.
    start_weight : weight passed to utils.weighted_loss_fn together with 1 - start_weight (presumably the start/end loss weights — confirm in utils).
    tokenizer, tagger : forwarded to utils.levenshtein_distance.
    lr_scheduler : optional scheduler, stepped once per batch when given.

    Returns (loss, dist, start_acc, end_acc), each the mean over all batches of the epoch.
    Raises ZeroDivisionError when train_data yields no batch.
    """

    # A fresh scaler is created on every call, so its loss-scale state does not
    # carry over from one epoch to the next.
    scaler = torch.cuda.amp.GradScaler()

    # Rolling windows over the last 20 batches, shown on the progress bar and logged per step.
    cum_loss = deque(maxlen = 20)
    cum_dist = deque(maxlen = 20)
    cum_start_acc = deque(maxlen = 20)
    cum_end_acc = deque(maxlen = 20)

    # Per-batch values for the whole epoch, used for the epoch summary.
    curr_loss = []
    curr_dist = []
    curr_start_acc = []
    curr_end_acc = []

    # Global step of the first batch of this epoch.
    step_offset = len(train_data) * epoch

    model.train()

    with tqdm(train_data, unit = " batch") as tepoch :
        tepoch.set_description(f"Train Epoch {epoch}")
        for i, batch in enumerate(tepoch) :
            optimizer.zero_grad()

            batch = {k : v.to(device) for k, v in batch.items()}

            with torch.cuda.amp.autocast() :
                outputs = model(**batch)
                start_logits = outputs["start_logits"]
                end_logits = outputs["end_logits"]
                loss = utils.weighted_loss_fn(start_logits, end_logits, batch["start_positions"], batch["end_positions"], start_weight, 1 - start_weight)

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            if lr_scheduler is not None :
                lr_scheduler.step()

            # Convert once: each float() on a device tensor forces a synchronisation.
            loss_value = float(loss)
            cum_loss.append(loss_value)
            curr_loss.append(loss_value)

            start_acc, end_acc = utils.extract_accuracy(start_logits, end_logits, batch["start_positions"], batch["end_positions"])
            start_acc_value = float(start_acc)
            end_acc_value = float(end_acc)
            cum_start_acc.append(start_acc_value)
            cum_end_acc.append(end_acc_value)
            curr_start_acc.append(start_acc_value)
            curr_end_acc.append(end_acc_value)

            dist = utils.levenshtein_distance(start_logits, end_logits, batch["start_positions"], batch["end_positions"], batch["input_ids"], tokenizer, tagger)
            dist_value = float(dist)
            cum_dist.append(dist_value)
            curr_dist.append(dist_value)

            # Rolling means are computed once and reused for the progress bar and W&B.
            running_loss = sum(cum_loss) / len(cum_loss)
            running_dist = sum(cum_dist) / len(cum_dist)
            running_start_acc = sum(cum_start_acc) / len(cum_start_acc)
            running_end_acc = sum(cum_end_acc) / len(cum_end_acc)

            tepoch.set_postfix(loss = running_loss,
                               dist = running_dist,
                               start_acc = running_start_acc,
                               end_acc = running_end_acc)

            wandb.log({"train_loss" : running_loss,
                       "train_dist" : running_dist,
                       "train_start_acc" : running_start_acc,
                       "train_end_acc" : running_end_acc,
                       # Read the LR directly; optimizer.state_dict() would rebuild the whole state dict every step.
                       "lr" : optimizer.param_groups[0]["lr"],
                       "train_step" : i + step_offset})

        epoch_loss = sum(curr_loss) / len(curr_loss)
        epoch_dist = sum(curr_dist) / len(curr_dist)
        epoch_start_acc = sum(curr_start_acc) / len(curr_start_acc)
        epoch_end_acc = sum(curr_end_acc) / len(curr_end_acc)

        print("Train loss : ", epoch_loss)
        print("Train dist : ", epoch_dist)
        print("Train start acc :", epoch_start_acc)
        print("Train end acc :", epoch_end_acc)

    return epoch_loss, epoch_dist, epoch_start_acc, epoch_end_acc

def valid_epoch(valid_data, model, epoch, device, start_weight, tokenizer, tagger = None) :
    """
    Run one validation epoch.

    Called once per epoch of the current sweep run. Everything that can differ
    between runs is passed in as an argument.

    Argument
    valid_data : sized iterable of batches; each batch is a dict whose values have a .to(device) method and which contains "start_positions", "end_positions" and "input_ids".
    model : model called as model(**batch); its output is indexed by "start_logits" and "end_logits".
    epoch : zero-based epoch index, used for the progress-bar label and the logged "valid_step".
    device : device the batch tensors are moved to.
    start_weight : weight passed to utils.weighted_loss_fn together with 1 - start_weight (presumably the start/end loss weights — confirm in utils).
    tokenizer : forwarded to utils.levenshtein_distance.
    tagger : optional tagger forwarded to utils.levenshtein_distance. When omitted, the module-level `tagger` is used, which is what this function read before the parameter existed.

    Returns (loss, dist, start_acc, end_acc), each the mean over all batches of the epoch.
    Raises ZeroDivisionError when valid_data yields no batch.
    """

    if tagger is None :
        # Backward compatibility: existing callers pass no tagger and rely on the global one.
        tagger = globals()["tagger"]

    # Rolling windows over the last 20 batches, shown on the progress bar and logged per step.
    cum_loss = deque(maxlen = 20)
    cum_dist = deque(maxlen = 20)
    cum_start_acc = deque(maxlen = 20)
    cum_end_acc = deque(maxlen = 20)

    # Per-batch values for the whole epoch, used for the epoch summary.
    curr_loss = []
    curr_dist = []
    curr_start_acc = []
    curr_end_acc = []

    # Global step of the first batch of this epoch.
    step_offset = len(valid_data) * epoch

    model.eval()
    with torch.no_grad() :
        with tqdm(valid_data, unit = " batch") as tepoch :
            tepoch.set_description(f"Valid Epoch {epoch}")
            for i, batch in enumerate(tepoch) :
                batch = {k : v.to(device) for k, v in batch.items()}

                with torch.cuda.amp.autocast() :
                    outputs = model(**batch)
                    start_logits = outputs["start_logits"]
                    end_logits = outputs["end_logits"]
                    loss = utils.weighted_loss_fn(start_logits, end_logits, batch["start_positions"], batch["end_positions"], start_weight, 1 - start_weight)

                # Convert once: each float() on a device tensor forces a synchronisation.
                loss_value = float(loss)
                cum_loss.append(loss_value)
                curr_loss.append(loss_value)

                start_acc, end_acc = utils.extract_accuracy(start_logits, end_logits, batch["start_positions"], batch["end_positions"])
                start_acc_value = float(start_acc)
                end_acc_value = float(end_acc)
                cum_start_acc.append(start_acc_value)
                cum_end_acc.append(end_acc_value)
                curr_start_acc.append(start_acc_value)
                curr_end_acc.append(end_acc_value)

                dist = utils.levenshtein_distance(start_logits, end_logits, batch["start_positions"], batch["end_positions"], batch["input_ids"], tokenizer, tagger)
                dist_value = float(dist)
                cum_dist.append(dist_value)
                curr_dist.append(dist_value)

                # Rolling means are computed once and reused for the progress bar and W&B.
                running_loss = sum(cum_loss) / len(cum_loss)
                running_dist = sum(cum_dist) / len(cum_dist)
                running_start_acc = sum(cum_start_acc) / len(cum_start_acc)
                running_end_acc = sum(cum_end_acc) / len(cum_end_acc)

                tepoch.set_postfix(loss = running_loss,
                                   dist = running_dist,
                                   start_acc = running_start_acc,
                                   end_acc = running_end_acc)

                wandb.log({"valid_loss" : running_loss,
                           "valid_dist" : running_dist,
                           "valid_start_acc" : running_start_acc,
                           "valid_end_acc" : running_end_acc,
                           "valid_step" : i + step_offset})

        epoch_loss = sum(curr_loss) / len(curr_loss)
        epoch_dist = sum(curr_dist) / len(curr_dist)
        epoch_start_acc = sum(curr_start_acc) / len(curr_start_acc)
        epoch_end_acc = sum(curr_end_acc) / len(curr_end_acc)

        print("Valid loss : ", epoch_loss)
        print("Valid dist : ", epoch_dist)
        print("Valid start acc :", epoch_start_acc)
        print("Valid end acc :", epoch_end_acc)

    return epoch_loss, epoch_dist, epoch_start_acc, epoch_end_acc

In [None]:
# Register the sweep described by sweep_config under the given W&B project and
# entity, and keep the returned sweep ID for the agent call below.
sweep_id = wandb.sweep(sweep_config, project = "Goorm_2nd_project", entity = "2nd_group")

Create sweep with ID: sjwplq4l
Sweep URL: https://wandb.ai/2nd_group/Goorm_2nd_project/sweeps/sjwplq4l


In [None]:
# count is the number of runs the agent executes for this sweep. If one full
# pass of baseline.ipynb takes about an hour, count = 10 keeps this sweep
# running for roughly ten hours, so plan accordingly before starting it.
wandb.agent(sweep_id, run_sweep, count = 10)

[34m[1mwandb[0m: Agent Starting Run: ch4gha3i with config:
[34m[1mwandb[0m: 	backbone: kykim/electra-kor-base
[34m[1mwandb[0m: 	batch_size: 8
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 9.207582227919752e-05
[34m[1mwandb[0m: 	lr_scheduler_warmup_ratio: 0.047735790354572816
[34m[1mwandb[0m: 	optimizer: AdamW
[34m[1mwandb[0m: 	start_logits_loss_weight: 0.49984319581306424
[34m[1mwandb[0m: 	use_lr_scheduler: True
[34m[1mwandb[0m: Currently logged in as: [33mjmp0813[0m ([33m2nd_group[0m). Use [1m`wandb login --relogin`[0m to force relogin


100%|██████████| 9789/9789 [00:01<00:00, 5193.54it/s]
Token indices sequence length is longer than the specified maximum sequence length for this model (741 > 512). Running this sequence through the model will result in indexing errors


코드 오류로 추정되는 문제로 인해 사용하지 못하는 문장의 비율 : 0.000551808706315144
유사도 기반으로 추출한 문장에 정답 단어가 없는 문장의 비율 : 0.019558553034947884
위 두 문제를 합한 전체 오류 : 0.02011036174126303
실제로 유사도 기반 추출을 한 문장들 중 오류가 있는 문장의 비율: 0.07677902621722846
전체 문장들 중 사용할 수 있는 문장의 비율 : 0.9799509503372165


  0%|          | 0/12 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

Some weights of the model checkpoint at kykim/electra-kor-base were not used when initializing ElectraForQuestionAnswering: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.weight']
- This IS expected if you are initializing ElectraForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForQuestionAnswering were not initialized from the model checkpoint at kykim/electra-kor-base and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should prob

Train loss :  1.2436754176659244
Train dist :  25.16456582633053
Train start acc : 0.8345588235294118
Train end acc : 0.6115196078431373


Valid Epoch 0: 100%|██████████| 306/306 [01:13<00:00,  4.19 batch/s, dist=5.89, end_acc=0.718, loss=0.61, start_acc=0.914]


Valid loss :  0.5539312957743414
Valid dist :  6.339429361488185
Valid start acc : 0.9175936400500777
Valid end acc : 0.7386877828953313


Train Epoch 1: 100%|██████████| 1428/1428 [09:35<00:00,  2.48 batch/s, dist=5.76, end_acc=0.831, loss=0.304, start_acc=0.956]


Train loss :  0.43315644282978083
Train dist :  4.972689075630252
Train start acc : 0.9341736694677871
Train end acc : 0.7747724089635855


Valid Epoch 1: 100%|██████████| 306/306 [01:14<00:00,  4.11 batch/s, dist=5.87, end_acc=0.766, loss=0.474, start_acc=0.956]


Valid loss :  0.48350653381032105
Valid dist :  3.4001225490196076
Valid start acc : 0.9368872549019608
Valid end acc : 0.7432598039215687


Train Epoch 2: 100%|██████████| 1428/1428 [09:33<00:00,  2.49 batch/s, dist=0.562, end_acc=0.806, loss=0.24, start_acc=0.969]


Train loss :  0.2372851969895321
Train dist :  4.683298319327731
Train start acc : 0.9648109243697479
Train end acc : 0.8328081232492998


Valid Epoch 2: 100%|██████████| 306/306 [01:13<00:00,  4.19 batch/s, dist=5.19, end_acc=0.695, loss=0.443, start_acc=0.953]


Valid loss :  0.48964326753238446
Valid dist :  3.064621040723982
Valid start acc : 0.9385212418300654
Valid end acc : 0.7368338361285092


Train Epoch 3: 100%|██████████| 1428/1428 [09:25<00:00,  2.53 batch/s, dist=8.5, end_acc=0.775, loss=0.266, start_acc=0.95]


Train loss :  0.16947018584749773
Train dist :  5.422356442577031
Train start acc : 0.9772408963585434
Train end acc : 0.8578431372549019


Valid Epoch 3: 100%|██████████| 306/306 [01:14<00:00,  4.13 batch/s, dist=1.78, end_acc=0.72, loss=0.556, start_acc=0.947]


Valid loss :  0.5734261274763864
Valid dist :  3.0191207893413776
Valid start acc : 0.9479166666666666
Valid end acc : 0.7272341629258947


Train Epoch 4: 100%|██████████| 1428/1428 [09:28<00:00,  2.51 batch/s, dist=0.431, end_acc=0.881, loss=0.149, start_acc=0.994]


Train loss :  0.13223100013972072
Train dist :  4.981442577030812
Train start acc : 0.9805672268907563
Train end acc : 0.8759628851540616


Valid Epoch 4: 100%|██████████| 306/306 [01:13<00:00,  4.15 batch/s, dist=1.7, end_acc=0.731, loss=0.605, start_acc=0.944]


Valid loss :  0.6187265165137992
Valid dist :  3.4073497988939168
Valid start acc : 0.9475081699346405
Valid end acc : 0.7283653846753189


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
distance,█▂▁▁▂
loss,▅▁▁▆█
lr,▂▅███████▇▇▇▇▇▆▆▆▆▅▅▅▄▄▄▄▃▃▃▃▂▂▂▂▁▁▁▁▁▁▁
train_dist,█▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_end_acc,▁▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇█▇▇▇▇▇███▇██▇▇█▇████████
train_loss,█▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_start_acc,▁▅▆▆▅▅▆▇▇▇▇▆▇▇▇▇█▇▇▇▇████████▇██████████
train_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
valid_dist,▃▅▅▄▅▁▄▃▂▂▂▂▃▂▃▂█▃▁▂▁▂▂▂▁▁▁▄▄▁▁▃▅▁▁▃▂▃▄▂
valid_end_acc,▄▅▅▇▄▆▅▅▄▅▆▄▅▆▂▅█▂▅▆█▆▅▅▅▆▃▁▄▇▇█▅▇▅▄▅▅▃▃

0,1
distance,3.40735
loss,0.61873
lr,0.0
train_dist,0.43125
train_end_acc,0.88125
train_loss,0.14937
train_start_acc,0.99375
train_step,7139.0
valid_dist,1.7012
valid_end_acc,0.73149


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: hl08gifg with config:
[34m[1mwandb[0m: 	backbone: kykim/bert-kor-base
[34m[1mwandb[0m: 	batch_size: 8
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 9.06432805925748e-05
[34m[1mwandb[0m: 	lr_scheduler_warmup_ratio: 0.3919973157454657
[34m[1mwandb[0m: 	optimizer: AdamW
[34m[1mwandb[0m: 	start_logits_loss_weight: 0.30105374499967086
[34m[1mwandb[0m: 	use_lr_scheduler: False


100%|██████████| 9789/9789 [00:01<00:00, 5218.76it/s]
Token indices sequence length is longer than the specified maximum sequence length for this model (741 > 512). Running this sequence through the model will result in indexing errors


코드 오류로 추정되는 문제로 인해 사용하지 못하는 문장의 비율 : 0.000551808706315144
유사도 기반으로 추출한 문장에 정답 단어가 없는 문장의 비율 : 0.019558553034947884
위 두 문제를 합한 전체 오류 : 0.02011036174126303
실제로 유사도 기반 추출을 한 문장들 중 오류가 있는 문장의 비율: 0.07677902621722846
전체 문장들 중 사용할 수 있는 문장의 비율 : 0.9799509503372165


  0%|          | 0/12 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

Some weights of the model checkpoint at kykim/bert-kor-base were not used when initializing BertForQuestionAnswering: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model

Train loss :  0.9834941404673377
Train dist :  12.919292717086835
Train start acc : 0.8477766106442577
Train end acc : 0.6551995798319328


Valid Epoch 0: 100%|██████████| 306/306 [01:32<00:00,  3.32 batch/s, dist=6.79, end_acc=0.738, loss=0.612, start_acc=0.916]


Valid loss :  0.6902968212475185
Valid dist :  5.289184263448969
Valid start acc : 0.9027777777777778
Valid end acc : 0.7184043489640055


Train Epoch 1: 100%|██████████| 1428/1428 [10:17<00:00,  2.31 batch/s, dist=9.91, end_acc=0.731, loss=0.661, start_acc=0.925]


Train loss :  0.5384672802901652
Train dist :  6.278974089635854
Train start acc : 0.9175420168067226
Train end acc : 0.7685574229691877


Valid Epoch 1: 100%|██████████| 306/306 [01:31<00:00,  3.36 batch/s, dist=5.77, end_acc=0.713, loss=0.636, start_acc=0.906]


Valid loss :  0.7168048971813489
Valid dist :  4.8547951231774755
Valid start acc : 0.9021650326797386
Valid end acc : 0.6893539467668222


Train Epoch 2: 100%|██████████| 1428/1428 [10:16<00:00,  2.32 batch/s, dist=3.56, end_acc=0.762, loss=0.53, start_acc=0.894]


Train loss :  0.43048756721061693
Train dist :  3.7401960784313726
Train start acc : 0.9396008403361344
Train end acc : 0.8062850140056023


Valid Epoch 2: 100%|██████████| 306/306 [01:30<00:00,  3.37 batch/s, dist=3.44, end_acc=0.706, loss=0.841, start_acc=0.887]


Valid loss :  0.8049811200281374
Valid dist :  5.161639014580191
Valid start acc : 0.8935394672786489
Valid end acc : 0.7054895676818549


Train Epoch 3: 100%|██████████| 1428/1428 [10:12<00:00,  2.33 batch/s, dist=9.53, end_acc=0.738, loss=0.584, start_acc=0.919]


Train loss :  0.43346549494254616
Train dist :  4.577118347338936
Train start acc : 0.9417892156862745
Train end acc : 0.8053221288515406


Valid Epoch 3: 100%|██████████| 306/306 [01:34<00:00,  3.24 batch/s, dist=4.75, end_acc=0.689, loss=0.864, start_acc=0.895]


Valid loss :  0.9628865306868273
Valid dist :  4.456605077928607
Valid start acc : 0.8813945450424369
Valid end acc : 0.6825194822417365


Train Epoch 4: 100%|██████████| 1428/1428 [10:22<00:00,  2.29 batch/s, dist=0.562, end_acc=0.806, loss=0.334, start_acc=0.975]


Train loss :  0.35774802260152366
Train dist :  2.7727591036414565
Train start acc : 0.9515931372549019
Train end acc : 0.8326330532212886


Valid Epoch 4: 100%|██████████| 306/306 [01:31<00:00,  3.34 batch/s, dist=5.82, end_acc=0.666, loss=0.904, start_acc=0.885]


Valid loss :  0.8118037547843129
Valid dist :  3.8066867772750124
Valid start acc : 0.9036576169378617
Valid end acc : 0.6973196330413319


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
distance,█▆▇▄▁
loss,▁▂▄█▄
lr,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_dist,▇█▄▇▆▂▄▂▄▁▅▃▁▂▁▂▂▂▂▂▁▁▂▂▄▂▂▁▂▂▃▁▁▂▂▂▁▁▂▂
train_end_acc,▁▃▄▃▃▅▅▅▆█▆▆▆▆▅▆▇▆▇██▇▇▆▇█▅▇██▆▇▇▇█▇█▇██
train_loss,█▄▄▄▄▃▃▃▂▁▂▂▂▂▂▂▁▂▁▁▁▂▁▂▁▁▂▂▁▁▂▂▁▂▁▁▁▁▁▁
train_start_acc,▁▄▄▄▅▅▅▆▆▇▇▇▆▇▆▆█▇▇██▇▇▆█▇▇▆▇█▇▇█▆█▇▇█▇▇
train_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
valid_dist,▃▁▃█▂█▃▁▁▁▂▄▆▅▅▄▆▃▃▄▁▅▂▃█▃▅▂▄▃▂▃▁▄▃▃▄▃▁▅
valid_end_acc,▄▆█▃▃▅▇▇▄▂▂▄▃▃▄▄▁▇▅▃▃▆▅▄▄▅▅▅▂▁▁▂▄▃▂▆▄▄▅▂

0,1
distance,3.80669
loss,0.8118
lr,9e-05
train_dist,0.5625
train_end_acc,0.80625
train_loss,0.33417
train_start_acc,0.975
train_step,7139.0
valid_dist,5.81731
valid_end_acc,0.66587


[34m[1mwandb[0m: Agent Starting Run: 715yntrs with config:
[34m[1mwandb[0m: 	backbone: kykim/electra-kor-base
[34m[1mwandb[0m: 	batch_size: 8
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 2.574669461253035e-05
[34m[1mwandb[0m: 	lr_scheduler_warmup_ratio: 0.14282991168086423
[34m[1mwandb[0m: 	optimizer: RMSprop
[34m[1mwandb[0m: 	start_logits_loss_weight: 0.1547643343736187
[34m[1mwandb[0m: 	use_lr_scheduler: True


100%|██████████| 9789/9789 [00:01<00:00, 5160.03it/s]
Token indices sequence length is longer than the specified maximum sequence length for this model (741 > 512). Running this sequence through the model will result in indexing errors


코드 오류로 추정되는 문제로 인해 사용하지 못하는 문장의 비율 : 0.000551808706315144
유사도 기반으로 추출한 문장에 정답 단어가 없는 문장의 비율 : 0.019558553034947884
위 두 문제를 합한 전체 오류 : 0.02011036174126303
실제로 유사도 기반 추출을 한 문장들 중 오류가 있는 문장의 비율: 0.07677902621722846
전체 문장들 중 사용할 수 있는 문장의 비율 : 0.9799509503372165


  0%|          | 0/12 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

Some weights of the model checkpoint at kykim/electra-kor-base were not used when initializing ElectraForQuestionAnswering: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.weight']
- This IS expected if you are initializing ElectraForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForQuestionAnswering were not initialized from the model checkpoint at kykim/electra-kor-base and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should prob

Train loss :  2.8849635449752045
Train dist :  79.8529411764706
Train start acc : 0.6189600840336135
Train end acc : 0.42839635854341734


Valid Epoch 0: 100%|██████████| 306/306 [01:33<00:00,  3.28 batch/s, dist=5.42, end_acc=0.756, loss=0.766, start_acc=0.888]


Valid loss :  0.8447097758062525
Valid dist :  10.632337229763701
Valid start acc : 0.8726118653038748
Valid end acc : 0.7110042735642078


Train Epoch 1: 100%|██████████| 1428/1428 [10:07<00:00,  2.35 batch/s, dist=26.6, end_acc=0.525, loss=1.42, start_acc=0.806]


Train loss :  1.1654455481457109
Train dist :  20.20045518207283
Train start acc : 0.8015581232492998
Train end acc : 0.6425070028011205


Valid Epoch 1: 100%|██████████| 306/306 [01:32<00:00,  3.29 batch/s, dist=12.6, end_acc=0.651, loss=1.28, start_acc=0.803]


Valid loss :  1.3042493678775489
Valid dist :  18.247501885369534
Valid start acc : 0.799648064413881
Valid end acc : 0.6138448970380173


Train Epoch 2:  90%|████████▉ | 1281/1428 [09:03<01:12,  2.02 batch/s, dist=24.3, end_acc=0.55, loss=1.64, start_acc=0.675]