# KorQuAD 1.0 활용해서 Roberta-large fine-tuning 하기
-> huggingface에 모델 올려두고 불러와서 사용하기! https://www.youtube.com/watch?v=ovD_87gHZO4

- Method0. CNN layer 추가 
- Method1. KorQuAD 1.0 (train+validation) 데이터셋 만으로 1차 fine-tuning -> ssunbear/klue_roberta_large_finetuned_korquad_v1
- Method2. Method1에 mrc_train 데이터셋으로 한번 더 fine-tuning(모델 재호출) -> ssunbear/klue_roberta_large_finetuned_korquad_v2

In [1]:
!pip install transformers==4.24.0 -q
!pip install huggingface
!pip install datasets==2.14.6
!pip install wandb

[0m

## korquad 데이터셋 불러오기

In [2]:
import pandas as pd

In [3]:
from datasets import load_dataset, concatenate_datasets

dataset = load_dataset('squad_kor_v1')

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
dataset['train']

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 60407
})

In [8]:
dataset['validation']

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 5774
})

## mrc train / validation 데이터셋 불러오기


In [12]:
from datasets import load_from_disk

# Load the MRC training split from local disk.
mrc_train_dataset_path = "/data/ephemeral/home/level2-mrc-nlp-15/data/train_dataset/train"  # change to your actual dataset path
mrc_train_dataset = load_from_disk(mrc_train_dataset_path)

In [13]:
mrc_train_dataset

Dataset({
    features: ['title', 'context', 'question', 'id', 'answers', 'document_id', '__index_level_0__'],
    num_rows: 3952
})

In [14]:
from datasets import load_from_disk

# Load the MRC validation split from local disk.
mrc_validation_dataset_path = "/data/ephemeral/home/level2-mrc-nlp-15/data/train_dataset/validation"  # change to your actual dataset path
mrc_validation_dataset = load_from_disk(mrc_validation_dataset_path)

In [15]:
mrc_validation_dataset


Dataset({
    features: ['title', 'context', 'question', 'id', 'answers', 'document_id', '__index_level_0__'],
    num_rows: 240
})

In [18]:
from datasets import load_from_disk

# Load a dataset from local disk.
# NOTE(review): this path is the same ".../train_dataset/validation" directory that
# `mrc_validation_dataset_path` points to — confirm whether a different split was intended.
mrc_train_validation_path = "/data/ephemeral/home/level2-mrc-nlp-15/data/train_dataset/validation"  # change to your actual dataset path
mrc_train_validation = load_from_disk(mrc_train_validation_path)

In [16]:
# Reshape the MRC training split into the same five columns KorQuAD uses.
# The rows are materialised once, then one list per column is built from them.
_mrc_train_rows = [row for _, row in pd.DataFrame(mrc_train_dataset).iterrows()]

id_list = [row['id'] for row in _mrc_train_rows]
title_list = [str(row['title']) for row in _mrc_train_rows]
context_list = [str(row['context']) for row in _mrc_train_rows]
question_list = [str(row['question']) for row in _mrc_train_rows]
answers_list = [row['answers'] for row in _mrc_train_rows]

In [13]:
# Column-oriented dict with the KorQuAD column layout, built from the train lists.
# Note that this rebinds `mrc_train_dataset` from the loaded dataset to a plain dict.
mrc_train_dataset = dict(
    id=id_list,
    title=title_list,
    context=context_list,
    question=question_list,
    answers=answers_list,
)

In [19]:
from datasets import Dataset

# Turn the column dict into a `datasets.Dataset` and display it.
# NOTE(review): the name is rebound from a dict to a Dataset, so running this cell a second
# time passes a Dataset to `from_dict` — presumably the cause of the AttributeError recorded
# in this cell's output; confirm by re-running the cells in order.
mrc_train_dataset= Dataset.from_dict(mrc_train_dataset)
mrc_train_dataset

AttributeError: 'Dataset' object has no attribute 'items'

In [63]:
# Reshape the MRC validation split into the same five columns KorQuAD uses.
# The rows are materialised once, then one list per column is built from them.
_mrc_validation_rows = [row for _, row in pd.DataFrame(mrc_validation_dataset).iterrows()]

id_list2 = [row['id'] for row in _mrc_validation_rows]
title_list2 = [str(row['title']) for row in _mrc_validation_rows]
context_list2 = [str(row['context']) for row in _mrc_validation_rows]
question_list2 = [str(row['question']) for row in _mrc_validation_rows]
answers_list2 = [row['answers'] for row in _mrc_validation_rows]

In [64]:
# Column-oriented dict for the MRC validation split, in the KorQuAD column layout.
# Uses the *validation* lists (`*_list2`); the previous version reused the train
# lists (`id_list`, `title_list`, ...), which made this a copy of the train split.
mrc_validation_dataset = {
    "id" : id_list2,
    "title" : title_list2,
    "context" : context_list2,
    "question" : question_list2,
    "answers" : answers_list2,}

In [67]:
from datasets import Dataset

# Turn the column dict into a `datasets.Dataset` and display it.
# NOTE(review): the name is rebound from a dict to a Dataset, so running this cell a second
# time passes a Dataset to `from_dict` — presumably the cause of the AttributeError recorded
# in this cell's output; confirm by re-running the cells in order.
mrc_validation_dataset= Dataset.from_dict(mrc_validation_dataset)
mrc_validation_dataset

AttributeError: 'Dataset' object has no attribute 'items'

## korquad 데이터셋 filtering
- Korquad 데이터셋과 train 데이터셋의 context 길이 분포 맞춰주기
- train 데이터셋의 context 길이가 최대 2064이므로, korquad 데이터셋 중 context 길이가 2064를 초과하는 데이터들은 삭제해줍니다.

In [20]:
# Keep only the KorQuAD training examples whose `context` has length <= 2064.
filtered_dataset = dataset['train'].filter(lambda example: len(example['context']) <= 2064)


In [21]:
# Keep only the KorQuAD validation examples whose `context` has length <= 2064.
filtered_dataset_validation = dataset['validation'].filter(lambda example: len(example['context']) <= 2064)

In [22]:
# Extract the five KorQuAD columns from the filtered validation split.
# The rows are materialised once, then one list per column is built from them.
_korquad_validation_rows = [
    row for _, row in pd.DataFrame(filtered_dataset_validation).iterrows()
]

id_list3 = [row['id'] for row in _korquad_validation_rows]
title_list3 = [str(row['title']) for row in _korquad_validation_rows]
context_list3 = [str(row['context']) for row in _korquad_validation_rows]
question_list3 = [str(row['question']) for row in _korquad_validation_rows]
answers_list3 = [row['answers'] for row in _korquad_validation_rows]

In [23]:
# Column-oriented dict for the filtered KorQuAD validation split.
val_dataset = dict(
    id=id_list3,
    title=title_list3,
    context=context_list3,
    question=question_list3,
    answers=answers_list3,
)

In [24]:

# Turn the column dict into a `datasets.Dataset` and display it.
val_dataset= Dataset.from_dict(val_dataset)
val_dataset

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 5735
})

## Pre-trained 모델 불러오기


In [31]:
from transformers import (
    AutoConfig,
    #AutoModelForQuestionAnswering,
    AutoTokenizer
)
from CNN_layer_model import CNN_RobertaForQuestionAnswering

# Checkpoint that is fine-tuned in this notebook.
model_name = "klue/roberta-large"

# Configuration of the pretrained checkpoint.
config = AutoConfig.from_pretrained(
    model_name
)
# Fast tokenizer is requested; the feature-preparation functions in this notebook
# ask the tokenizer for offset mappings.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=True
)
# Project-local QA model (imported from CNN_layer_model) initialised from the same checkpoint.
# Per the warning recorded in the cell output, the `cnn_block*` weights are not in the
# checkpoint and are newly initialised.
model = CNN_RobertaForQuestionAnswering.from_pretrained(
    model_name,
    config=config)

Some weights of the model checkpoint at klue/roberta-large were not used when initializing CNN_RobertaForQuestionAnswering: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing CNN_RobertaForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CNN_RobertaForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CNN_RobertaForQuestionAnswering were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['cnn_block2.conv1.weight', 'cnn_block1.layer_norm.weight', 'cnn_block1.conv2.bias', 'cnn_block5.conv1.bias', 'cnn_b

## Korquad 데이터셋 전처리


In [32]:
max_seq_length = 512 # maximum length passed to the tokenizer as `max_length` (question + context + special tokens)
# NOTE(review): not referenced by the preparation functions in this notebook, which pass padding="max_length" directly.
pad_to_max_length = False
doc_stride = 128 # overlap passed to the tokenizer as `stride` when a long context is split into several windows
preprocessing_num_workers = None  # passed as `num_proc` to Dataset.map
batch_size = 16  # per-device batch size for both training and evaluation
num_train_epochs = 1
n_best_size = 20  # number of candidate answers kept during post-processing
max_answer_length = 30  # maximum answer span length allowed during post-processing

In [33]:
def prepare_train_features(examples):
    """Tokenize a batch of examples and attach start/end token labels.

    Each (question, context) pair is tokenized with truncation applied to the
    context only. A context longer than `max_seq_length` is split into several
    overlapping windows (overlap = `doc_stride`), so one example may yield
    several features. For every feature the answer's character span is
    converted to token indices; when the answer is not fully inside the
    window, both labels point at the CLS token.

    Args:
        examples: batch (dict of columns) with "question", "context" and
            "answers" entries.

    Returns:
        The tokenizer output with "start_positions" and "end_positions"
        added and the overflow/offset mappings removed.
    """
    encoded = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",  # truncate only the second element of the pair (the context)
        max_length=max_seq_length,
        stride=doc_stride,
        return_overflowing_tokens=True,  # return the extra windows created by truncation
        return_offsets_mapping=True,  # return (char_start, char_end) for every token
        padding="max_length", return_token_type_ids=False
    )

    # Maps each produced feature back to the example it came from.
    feature_to_example = encoded.pop("overflow_to_sample_mapping")
    # Character offsets of every token, per feature.
    all_offsets = encoded.pop("offset_mapping")

    start_positions = []
    end_positions = []

    for feature_idx, offsets in enumerate(all_offsets):
        token_ids = encoded["input_ids"][feature_idx]
        cls_position = token_ids.index(tokenizer.cls_token_id)

        # Per-token sequence id; the code below treats id 1 as the context.
        seq_ids = encoded.sequence_ids(feature_idx)

        # Answer of the example this feature belongs to (first answer only).
        answer = examples["answers"][feature_to_example[feature_idx]]
        answer_char_start = answer["answer_start"][0]
        answer_char_end = answer_char_start + len(answer["text"][0])

        # First context token of this window.
        left = 0
        while seq_ids[left] != 1:
            left += 1

        # Last context token of this window.
        right = len(token_ids) - 1
        while seq_ids[right] != 1:
            right -= 1

        # Answer not fully contained in this window -> label with CLS.
        answer_outside_window = (
            offsets[left][0] > answer_char_start
            or offsets[right][1] < answer_char_end
        )
        if answer_outside_window:
            start_positions.append(cls_position)
            end_positions.append(cls_position)
            continue

        # Walk `left` forward past the answer start, then step back one token.
        while left < len(offsets) and offsets[left][0] <= answer_char_start:
            left += 1
        start_positions.append(left - 1)

        # Walk `right` backward past the answer end, then step forward one token.
        while offsets[right][1] >= answer_char_end:
            right -= 1
        end_positions.append(right + 1)

    encoded["start_positions"] = start_positions
    encoded["end_positions"] = end_positions

    return encoded

In [34]:
# Original column names; they are dropped after tokenization so only model inputs/labels remain.
column_names = filtered_dataset.column_names
# Tokenize the filtered KorQuAD training split in batches.
train_dataset = filtered_dataset.map(
            prepare_train_features,
            batched=True,
            num_proc=preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=True,
        )

In [35]:
def prepare_validation_features(examples):
    """Tokenize a batch of evaluation examples for post-processing.

    The (question, context) pairs are tokenized exactly like in
    `prepare_train_features` (context-only truncation, overlapping windows,
    max-length padding, no token type ids). Instead of labels, every feature
    gets the id of its source example, and the offset mapping is kept with
    all non-context positions set to None so that post-processing can tell
    whether a token belongs to the context.

    Args:
        examples: batch (dict of columns) with "question", "context" and "id".

    Returns:
        The tokenizer output with an "example_id" column and a masked
        "offset_mapping" column.
    """
    tokenized_examples = tokenizer(
        examples['question'],
        examples['context'],
        truncation="only_second",
        max_length=max_seq_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
        # Kept consistent with prepare_train_features, so the model receives the same
        # inputs at evaluation time as during training.
        # NOTE(review): RoBERTa-style checkpoints usually have a single token-type
        # embedding, so segment id 1 could be out of range — confirm for this model.
        return_token_type_ids=False,
    )

    # Maps each produced feature back to the example it came from.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

    tokenized_examples["example_id"] = []

    # Sequence id that marks context tokens (the second element of the pair).
    context_index = 1

    for i in range(len(tokenized_examples["input_ids"])):
        sequence_ids = tokenized_examples.sequence_ids(i)

        sample_index = sample_mapping[i]
        tokenized_examples["example_id"].append(examples["id"][sample_index])

        # Keep offsets only for context tokens; everything else becomes None.
        tokenized_examples["offset_mapping"][i] = [
            (o if sequence_ids[k] == context_index else None)
            for k, o in enumerate(tokenized_examples["offset_mapping"][i])
        ]

    return tokenized_examples

In [36]:
# Tokenize the KorQuAD validation split in batches.
# `column_names` was taken from the training split, which has the same columns as `val_dataset`.
eval_dataset = val_dataset.map(
            prepare_validation_features,
            batched=True,
            num_proc=preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=True,
        )

Map: 100%|██████████| 5735/5735 [00:04<00:00, 1429.30 examples/s]


## Question Answering Class 정의


In [37]:
# default_data_collator: 여러개 example들을 collator해주는 역할,
# TrainingArguments : 한번에 training arguments들을 합쳐서 주는..!
from transformers import default_data_collator, TrainingArguments, EvalPrediction

In [38]:
# coding=utf-8
# Copyright 2020 The HuggingFace Team All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Question-Answering task와 관련된 'Trainer'의 subclass 코드 입니다.
"""

from transformers import Trainer, is_datasets_available, is_torch_tpu_available
from transformers.trainer_utils import PredictionOutput

if is_datasets_available():
    import datasets

# Trainer for question answering, built by subclassing the Hugging Face Trainer.
class QuestionAnsweringTrainer(Trainer):
    """`Trainer` subclass that post-processes raw QA logits before computing metrics.

    Args:
        eval_examples: un-tokenized evaluation examples; used by `evaluate` when no
            `eval_examples` argument is given.
        post_process_function: callable invoked as
            `post_process_function(examples, dataset, predictions, args)`.
        *args, **kwargs: forwarded unchanged to `Trainer.__init__`.
    """

    def __init__(self, *args, eval_examples=None, post_process_function=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.eval_examples = eval_examples
        self.post_process_function = post_process_function

    def evaluate(self, eval_dataset=None, eval_examples=None, ignore_keys=None):
        """Run the prediction loop on the evaluation set and return the metrics dict.

        Falls back to `self.eval_dataset` / `self.eval_examples` when the arguments are
        None. Metrics are computed (and logged) only when both `post_process_function`
        and `compute_metrics` are set; otherwise an empty dict is returned.
        """
        eval_dataset = self.eval_dataset if eval_dataset is None else eval_dataset
        eval_dataloader = self.get_eval_dataloader(eval_dataset)
        eval_examples = self.eval_examples if eval_examples is None else eval_examples

        # Temporarily disable metric computation inside the prediction loop; metrics are
        # computed below, after post-processing.
        compute_metrics = self.compute_metrics
        self.compute_metrics = None
        try:
            output = self.prediction_loop(
                eval_dataloader,
                description="Evaluation",
                # Without a metric function there is no reason to gather predictions, so only
                # the loss is requested; otherwise None is passed.
                # self.args.prediction_loss_only
                prediction_loss_only=True if compute_metrics is None else None,
                ignore_keys=ignore_keys,
            )
        finally:
            # Always restore the metric function, even if the loop raised.
            self.compute_metrics = compute_metrics

        # Re-expose every column of the dataset (in its current format type) before
        # handing it to post-processing.
        if isinstance(eval_dataset, datasets.Dataset):
            eval_dataset.set_format(
                type=eval_dataset.format["type"],
                columns=list(eval_dataset.features.keys()),
            )

        if self.post_process_function is not None and self.compute_metrics is not None:
            eval_preds = self.post_process_function(
                eval_examples, eval_dataset, output.predictions, self.args
            )
            metrics = self.compute_metrics(eval_preds)

            self.log(metrics)
        else:
            metrics = {}

        self.control = self.callback_handler.on_evaluate(
            self.args, self.state, self.control, metrics
        )
        return metrics

    def predict(self, test_dataset, test_examples, ignore_keys=None):
        """Run the prediction loop on `test_dataset` and post-process the result.

        Returns the raw prediction-loop output when `post_process_function` or
        `compute_metrics` is None; otherwise returns whatever `post_process_function`
        returns for `(test_examples, test_dataset, output.predictions, self.args)`.
        """
        test_dataloader = self.get_test_dataloader(test_dataset)

        # Temporarily disable metric computation inside the prediction loop.
        # Structured the same way as `evaluate`.
        compute_metrics = self.compute_metrics
        self.compute_metrics = None
        try:
            output = self.prediction_loop(
                test_dataloader,
                description="Evaluation",
                # Without a metric function there is no reason to gather predictions, so only
                # the loss is requested; otherwise None is passed.
                # self.args.prediction_loss_only
                prediction_loss_only=True if compute_metrics is None else None,
                ignore_keys=ignore_keys,
            )
        finally:
            # Always restore the metric function, even if the loop raised.
            self.compute_metrics = compute_metrics

        if self.post_process_function is None or self.compute_metrics is None:
            return output

        # Re-expose every column of the dataset (in its current format type) before
        # handing it to post-processing.
        if isinstance(test_dataset, datasets.Dataset):
            test_dataset.set_format(
                type=test_dataset.format["type"],
                columns=list(test_dataset.features.keys()),
            )

        predictions = self.post_process_function(
            test_examples, test_dataset, output.predictions, self.args
        )
        return predictions


## 후처리 클래스 정의


In [40]:
# coding=utf-8
# Copyright 2020 The HuggingFace Team All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the 'License');
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an 'AS IS' BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Pre-processing
Post-processing utilities for question answering.
"""
import collections
import json
import logging
import os
import random
from typing import Any, Optional, Tuple

import numpy as np
import torch
from arguments import DataTrainingArguments, ModelArguments
from datasets import DatasetDict
from tqdm.auto import tqdm
from transformers import PreTrainedTokenizerFast, TrainingArguments, is_torch_available
from transformers.trainer_utils import get_last_checkpoint

#from utils.datetime_helper import get_seoul_datetime_str

logger = logging.getLogger(__name__)


def set_seed(seed: int = 2024):
    """
    Fix the random seed for `random`, NumPy and, when available, PyTorch.

    For PyTorch the CPU and CUDA generators are seeded and cuDNN is switched
    to deterministic mode with benchmarking turned off.

    Args:
        seed (:obj:`int`): The seed to set.
    """
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)

    if not is_torch_available():
        return

    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # covers the multi-GPU case

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


def postprocess_qa_predictions(
    examples,
    features,
    predictions: Tuple[np.ndarray, np.ndarray],
    version_2_with_negative: bool = False,
    n_best_size: int = 20,
    max_answer_length: int = 30,
    null_score_diff_threshold: float = 0.0,
    output_dir: Optional[str] = None,
    prefix: Optional[str] = None,
    is_world_process_zero: bool = True,
):
    """
    Post-process the predictions of a QA model.

    The model returns start logits and end logits, so they have to be converted
    back into answer substrings of the original context.

    Args:
        examples: The non-preprocessed dataset (see the main script for more information).
        features: The preprocessed dataset (see the main script for more information).
        predictions (:obj:`Tuple[np.ndarray, np.ndarray]`):
            The model predictions: two arrays holding the start logits and the end logits.
            Their first dimension must match the number of elements in :obj:`features`.
        version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether the dataset contains examples with no answer.
        n_best_size (:obj:`int`, `optional`, defaults to 20):
            The total number of n-best predictions to generate when looking for an answer.
        max_answer_length (:obj:`int`, `optional`, defaults to 30):
            The maximum length of an answer that can be generated (compared against the
            number of tokens between the start and end index, inclusive).
        null_score_diff_threshold (:obj:`float`, `optional`, defaults to 0):
            The threshold used to select the null answer:
            if the best answer has a score that is less than the score of
            the null answer minus this threshold, the null answer is selected for this example (note that the score of
            the null answer for an example giving several features is the minimum of the scores for the null answer on
            each feature: all features must be aligned on the fact they `want` to predict a null answer).
            Only useful when :obj:`version_2_with_negative` is :obj:`True`.
        output_dir (:obj:`str`, `optional`):
            If provided, the directory where the following are saved:
            the dictionary of predictions, the dictionary of n-best predictions (with their scores and logits) and,
            if :obj:`version_2_with_negative=True`, the dictionary of score differences between best and null answers.
        prefix (:obj:`str`, `optional`):
            If provided, the dictionaries are saved with `prefix` added to their file names.
        is_world_process_zero (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether this process is the main process (used to choose the logger level).

    Returns:
        An ordered dict mapping each example id to its predicted answer text.
    """
    assert (
        len(predictions) == 2
    ), "`predictions` should be a tuple with two elements (start_logits, end_logits)."
    all_start_logits, all_end_logits = predictions

    assert len(predictions[0]) == len(
        features
    ), f"Got {len(predictions[0])} predictions and {len(features)} features."

    # Build the mapping from each example to the indices of its features.
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[example_id_to_index[feature["example_id"]]].append(i)

    # Ordered dicts that collect the final predictions and the n-best lists.
    all_predictions = collections.OrderedDict()
    all_nbest_json = collections.OrderedDict()
    if version_2_with_negative:
        scores_diff_json = collections.OrderedDict()

    # Logging.
    logger.setLevel(logging.INFO if is_world_process_zero else logging.WARN)
    logger.info(
        f"Post-processing {len(examples)} example predictions split into {len(features)} features."
    )

    # Main loop over all examples.
    for example_index, example in enumerate(tqdm(examples)):
        # Indices of the features that belong to the current example.
        feature_indices = features_per_example[example_index]

        min_null_prediction = None
        prelim_predictions = []

        # Loop over all features associated with the current example.
        for feature_index in feature_indices:
            # Model predictions for this feature.
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            # Offsets used to map token positions back to the original context.
            offset_mapping = features[feature_index]["offset_mapping"]
            # Optional: `token_is_max_context`; when present it is used to drop answers whose
            # start token does not have the maximum context available in this feature.
            token_is_max_context = features[feature_index].get(
                "token_is_max_context", None
            )

            # Update the minimum null prediction.
            feature_null_score = start_logits[0] + end_logits[0]
            if (
                min_null_prediction is None
                or min_null_prediction["score"] > feature_null_score
            ):
                min_null_prediction = {
                    "offsets": (0, 0),
                    "score": feature_null_score,
                    "start_logit": start_logits[0],
                    "end_logit": end_logits[0],
                }

            # Go through the `n_best_size` highest start and end logits.
            start_indexes = np.argsort(start_logits)[
                -1 : -n_best_size - 1 : -1
            ].tolist()

            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()

            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip out-of-scope answers (index beyond the offsets, or offset is None).
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                    ):
                        continue
                    # Skip answers with a length that is < 0 or > max_answer_length.
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue
                    # Skip answers whose start token does not have the maximum context.
                    if (
                        token_is_max_context is not None
                        and not token_is_max_context.get(str(start_index), False)
                    ):
                        continue
                    prelim_predictions.append(
                        {
                            "offsets": (
                                offset_mapping[start_index][0],
                                offset_mapping[end_index][1],
                            ),
                            "score": start_logits[start_index] + end_logits[end_index],
                            "start_logit": start_logits[start_index],
                            "end_logit": end_logits[end_index],
                        }
                    )

        if version_2_with_negative:
            # Add the minimum null prediction.
            prelim_predictions.append(min_null_prediction)
            null_score = min_null_prediction["score"]

        # Keep only the best `n_best_size` predictions.
        predictions = sorted(
            prelim_predictions, key=lambda x: x["score"], reverse=True
        )[:n_best_size]

        # Add the minimum null prediction back if it was removed because of its low score.
        if version_2_with_negative and not any(
            p["offsets"] == (0, 0) for p in predictions
        ):
            predictions.append(min_null_prediction)

        # Use the offsets to gather the answer text from the original context.
        context = example["context"]
        for pred in predictions:
            offsets = pred.pop("offsets")
            pred["text"] = context[offsets[0] : offsets[1]]

        # In the rare edge case where there is not a single non-null prediction, create a
        # fake prediction to avoid a failure.
        if len(predictions) == 0 or (
            len(predictions) == 1 and predictions[0]["text"] == ""
        ):

            predictions.insert(
                0, {"text": "empty", "start_logit": 0.0, "end_logit": 0.0, "score": 0.0}
            )

        # Compute the softmax of all scores (we do it with numpy to stay independent from torch/tf in this file, using the LogSumExp trick).
        scores = np.array([pred.pop("score") for pred in predictions])
        exp_scores = np.exp(scores - np.max(scores))
        probs = exp_scores / exp_scores.sum()

        # Include the probabilities in the predictions.
        for prob, pred in zip(probs, predictions):
            pred["probability"] = prob

        # Pick the best prediction.
        if not version_2_with_negative:
            all_predictions[example["id"]] = predictions[0]["text"]
        else:
            # Otherwise we first need to find the best non-empty prediction.
            i = 0
            while predictions[i]["text"] == "":
                i += 1
            best_non_null_pred = predictions[i]

            # Compare against the null prediction using the threshold.
            score_diff = (
                null_score
                - best_non_null_pred["start_logit"]
                - best_non_null_pred["end_logit"]
            )
            scores_diff_json[example["id"]] = float(score_diff)  # plain float so it is JSON-serializable
            if score_diff > null_score_diff_threshold:
                all_predictions[example["id"]] = ""
            else:
                all_predictions[example["id"]] = best_non_null_pred["text"]

        # Cast numpy floats back to float so that `predictions` is JSON-serializable.
        all_nbest_json[example["id"]] = [
            {
                k: (
                    float(v)
                    if isinstance(v, (np.float16, np.float32, np.float64))
                    else v
                )
                for k, v in pred.items()
            }
            for pred in predictions
        ]

    # If output_dir is given, save all the dicts.
    if output_dir is not None:
        assert os.path.isdir(output_dir), f"{output_dir} is not a directory."

        prediction_file = os.path.join(
            output_dir,
            "predictions.json" if prefix is None else f"predictions_{prefix}.json",
        )
        nbest_file = os.path.join(
            output_dir,
            "nbest_predictions.json"
            if prefix is None
            else f"nbest_predictions_{prefix}.json",
        )
        if version_2_with_negative:
            null_odds_file = os.path.join(
                output_dir,
                "null_odds.json" if prefix is None else f"null_odds_{prefix}.json",
            )

        logger.info(f"Saving predictions to {prediction_file}.")
        with open(prediction_file, "w", encoding="utf-8") as writer:
            writer.write(
                json.dumps(all_predictions, indent=4, ensure_ascii=False) + "\n"
            )
        logger.info(f"Saving nbest_preds to {nbest_file}.")
        with open(nbest_file, "w", encoding="utf-8") as writer:
            writer.write(
                json.dumps(all_nbest_json, indent=4, ensure_ascii=False) + "\n"
            )
        if version_2_with_negative:
            logger.info(f"Saving null_odds to {null_odds_file}.")
            with open(null_odds_file, "w", encoding="utf-8") as writer:
                writer.write(
                    json.dumps(scores_diff_json, indent=4, ensure_ascii=False) + "\n"
                )

    return all_predictions


def check_no_error(
    data_args: DataTrainingArguments,
    training_args: TrainingArguments,
    datasets: DatasetDict,
    tokenizer,
) -> Tuple[Any, int]:
    """Sanity-check the run configuration and derive resume/length settings.

    Args:
        data_args: data arguments; only `max_seq_length` is read.
        training_args: training arguments; `output_dir`, `do_train` and
            `overwrite_output_dir` are read.
        datasets: dataset dict that must contain a "validation" split.
        tokenizer: tokenizer that must be a `PreTrainedTokenizerFast`.

    Returns:
        A tuple `(last_checkpoint, max_seq_length)`: the checkpoint found in
        the output directory (or None) and the sequence length capped at
        `tokenizer.model_max_length`.

    Raises:
        ValueError: if the output directory is non-empty without a checkpoint
            (and overwriting is disabled), if the tokenizer is not a fast
            tokenizer, or if there is no "validation" split.
    """
    # Look for the last checkpoint.
    last_checkpoint = None
    if (
        os.path.isdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome."
            )
        elif last_checkpoint is not None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Tokenizer check: this script requires a fast tokenizer.
    if not isinstance(tokenizer, PreTrainedTokenizerFast):
        raise ValueError(
            "This example script only works for models that have a fast tokenizer. Checkout the big table of models "
            "at https://huggingface.co/transformers/index.html#bigtable to find the model types that meet this "
            "requirement"
        )

    if data_args.max_seq_length > tokenizer.model_max_length:
        # `Logger.warn` is a deprecated alias, so `warning` is used; the trailing space
        # keeps the two message parts from running together ("for themodel").
        logger.warning(
            f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the "
            f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
        )
    max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)

    # NOTE: the validation split is required unconditionally, not only when do_eval is set.
    if "validation" not in datasets:
        raise ValueError("--do_eval requires a validation dataset")
    return last_checkpoint, max_seq_length


### 후처리 함수 정의


In [41]:
# Convert model outputs (logits) into human-readable answers matched to their examples.
def post_processing_function(examples, features, predictions, args=None):
    """Turn start/end logits into answer texts and package them for the metric.

    Args:
        examples: un-tokenized evaluation examples (must provide "id",
            "context" and "answers").
        features: tokenized evaluation features produced from `examples`.
        predictions: tuple `(start_logits, end_logits)` from the model.
        args: optional training arguments. `QuestionAnsweringTrainer` passes
            its own `self.args` as a fourth positional argument; when omitted,
            the notebook-level `training_args` is used instead.

    Returns:
        An `EvalPrediction` whose `predictions` are
        `{"id", "prediction_text"}` dicts and whose `label_ids` are
        `{"id", "answers"}` reference dicts.
    """
    # Accept the 4-argument call made by the trainer while staying compatible with
    # 3-argument callers.
    effective_args = args if args is not None else training_args

    # Post-processing: we match the start logits and end logits to answers in the original context.
    predictions = postprocess_qa_predictions(
        examples=examples,
        features=features,
        predictions=predictions,
        version_2_with_negative=False,
        n_best_size=n_best_size,
        max_answer_length=max_answer_length,
        null_score_diff_threshold=0.0,
        output_dir=effective_args.output_dir,
        is_world_process_zero=trainer.is_world_process_zero(),
    )

    # Format the result to the format the metric expects.
    formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()]
    # References come from the examples that were actually evaluated. The previous
    # `datasets["validation"]` lookup subscripted the imported `datasets` module.
    references = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]
    return EvalPrediction(predictions=formatted_predictions, label_ids=references)

In [42]:
def compute_metrics(p: EvalPrediction):
    """Score formatted predictions against their references with the global `metric`."""
    formatted_predictions = p.predictions
    references = p.label_ids
    return metric.compute(predictions=formatted_predictions, references=references)

## Train !


In [43]:
import wandb

# Authenticate with Weights & Biases and start a run for experiment tracking.
wandb.login()
wandb.init(project='odqa', # name of the project that holds the experiment records
           entity='nlp15', # user name or team name
          )

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


In [44]:
import torch

# Display whether a CUDA device is visible to PyTorch.
#device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.cuda.is_available()

True

In [45]:
# Training configuration; batch_size and num_train_epochs are notebook-level variables.
training_args = TrainingArguments(
    output_dir="outputs",
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_train_epochs,
    save_total_limit=1,  # maximum number of checkpoints to keep
    # NOTE(review): the step-based evaluation/logging options below are disabled.
    #evaluation_strategy="steps",
    #eval_steps=500,  # evaluate every N steps
    #logging_steps=500,  # log every N steps
    #load_best_model_at_end=True
)

In [46]:
# Assemble the QA trainer from the notebook-level model, datasets and helper functions.
trainer = QuestionAnsweringTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        eval_examples=val_dataset,
        tokenizer=tokenizer,
        data_collator=default_data_collator, # usually the default collator
        post_process_function=post_processing_function, # a function is passed as the argument
        compute_metrics=compute_metrics
    )

In [47]:
# Run fine-tuning and keep the value returned by the trainer.
train_result = trainer.train()

***** Running training *****
  Num examples = 62641
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 3916
  Number of trainable parameters = 356600834
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss
500,0.7693
1000,0.5415
1500,0.5238
2000,0.4836
2500,0.4108
3000,0.3793
3500,0.3503


Saving model checkpoint to outputs/checkpoint-500
Configuration saved in outputs/checkpoint-500/config.json
Model weights saved in outputs/checkpoint-500/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-500/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-500/special_tokens_map.json
Deleting older checkpoint [outputs/checkpoint-3500] due to args.save_total_limit
Saving model checkpoint to outputs/checkpoint-1000
Configuration saved in outputs/checkpoint-1000/config.json
Model weights saved in outputs/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-1000/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-1000/special_tokens_map.json
Deleting older checkpoint [outputs/checkpoint-500] due to args.save_total_limit
Saving model checkpoint to outputs/checkpoint-1500
Configuration saved in outputs/checkpoint-1500/config.json
Model weights saved in outputs/checkpoint-1500/pytorch_model.bin
tokenizer c

In [48]:
# Close the active Weights & Biases run.
wandb.finish()

0,1
train/epoch,▁▂▃▄▅▆▇█
train/global_step,▁▂▃▄▅▆▇█
train/learning_rate,█▇▆▄▃▂▁
train/loss,█▄▄▃▂▁▁
train/total_flos,▁
train/train_loss,▁
train/train_runtime,▁
train/train_samples_per_second,▁
train/train_steps_per_second,▁

0,1
train/epoch,1.0
train/global_step,3916.0
train/learning_rate,1e-05
train/loss,0.3503
train/total_flos,6.221469142067405e+16
train/train_loss,0.47893
train/train_runtime,6235.4731
train/train_samples_per_second,10.046
train/train_steps_per_second,0.628


In [49]:
model

CNN_RobertaForQuestionAnswering(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(32000, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
  

## 허깅페이스에 모델 업로드

In [50]:
!sudo apt-get install git-lfs

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
/bin/bash: sudo: command not found


In [51]:
import os

from transformers import AutoModel
from transformers import AutoTokenizer



# Huggingface Access Token
# Read from the environment so the secret is never stored in the notebook.
# (The previous `ACCESS_TOKEN = # ...` line had no right-hand side and was a SyntaxError.)
ACCESS_TOKEN = os.environ.get("HF_TOKEN")
if not ACCESS_TOKEN:
    raise RuntimeError(
        "Set the HF_TOKEN environment variable to a Hugging Face access token before uploading."
    )

# Upload to Huggingface
model.push_to_hub('klue_roberta_large_finetuned_korquad_v1', use_temp_dir=True, use_auth_token=ACCESS_TOKEN)
tokenizer.push_to_hub('klue_roberta_large_finetuned_korquad_v1', use_temp_dir=True, use_auth_token=ACCESS_TOKEN)
 

Configuration saved in /tmp/tmp_r5qc71i/config.json
Model weights saved in /tmp/tmp_r5qc71i/pytorch_model.bin
Uploading the following files to ssunbear/klue_roberta_large_finetuned_korquad_v1: pytorch_model.bin,config.json
pytorch_model.bin: 100%|██████████| 1.43G/1.43G [00:48<00:00, 29.1MB/s]   
tokenizer config file saved in /tmp/tmpxrg7mu12/tokenizer_config.json
Special tokens file saved in /tmp/tmpxrg7mu12/special_tokens_map.json
Uploading the following files to ssunbear/klue_roberta_large_finetuned_korquad_v1: tokenizer_config.json,vocab.txt,tokenizer.json,special_tokens_map.json


CommitInfo(commit_url='https://huggingface.co/ssunbear/klue_roberta_large_finetuned_korquad_v1/commit/bd3c260ee793ce46ab444055515bf02be74f2a80', commit_message='Upload tokenizer', commit_description='', oid='bd3c260ee793ce46ab444055515bf02be74f2a80', pr_url=None, repo_url=RepoUrl('https://huggingface.co/ssunbear/klue_roberta_large_finetuned_korquad_v1', endpoint='https://huggingface.co', repo_type='model', repo_id='ssunbear/klue_roberta_large_finetuned_korquad_v1'), pr_revision=None, pr_num=None)

: 

### Method 2


- Method2. Method1에 mrc_train 데이터셋으로 한번 더 fine-tuning(모델 재호출) -> ssunbear/klue_roberta_large_finetuned_korquad_v2

## Pre-trained 모델 불러오기

In [52]:
from transformers import (
    AutoConfig,
    AutoModelForQuestionAnswering,
    AutoTokenizer
)

# Hub repository of the Method 1 checkpoint; config, tokenizer and model are all loaded from it.
model_name = "ssunbear/klue_roberta_large_finetuned_korquad_v1"

config = AutoConfig.from_pretrained(
    model_name
)
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=True
)
# NOTE(review): the model trained in Method 1 printed as CNN_RobertaForQuestionAnswering,
# while this loads through AutoModelForQuestionAnswering — confirm the custom-layer
# weights are not silently dropped when reloading.
model = AutoModelForQuestionAnswering.from_pretrained(
    model_name,
    config=config)

loading configuration file config.json from cache at /data/ephemeral/home/.cache/huggingface/hub/models--ssunbear--klue_roberta_large_finetuned_korquad_v1/snapshots/0ebea4e740f7702b26666664e61deaca4f7cb0dc/config.json
Model config RobertaConfig {
  "_name_or_path": "ssunbear/klue_roberta_large_finetuned_korquad_v1",
  "architectures": [
    "RobertaForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "tokenizer_class": "BertTokenizer",
  "torch_dtype": "float32",
  "transformers_version": "4.24.0",
  "type_vocab_size": 1,
  "use_ca

## Train 데이터셋 전처리

In [53]:
max_seq_length = 512 # maximum combined length of question, context and special tokens
pad_to_max_length = False  # NOTE(review): not read by the preprocessing functions in this notebook, which always pad to max_length
doc_stride = 128 # overlap between consecutive chunks when a long context is split into several features
preprocessing_num_workers = None
batch_size = 16
num_train_epochs = 4
n_best_size = 20
max_answer_length = 30


In [54]:
def prepare_train_features(examples):  # examples: a batch of dataset rows (column name -> list of values)
    """Tokenize question/context pairs and label every feature with answer token positions.

    A context longer than ``max_seq_length`` is split into several overlapping
    features (overlap = ``doc_stride``), so one example may yield multiple
    features.  Features whose span does not contain the answer, and examples
    without any answer, are labelled with the CLS token index.

    Reads the notebook-level ``tokenizer``, ``column_names``,
    ``max_seq_length`` and ``doc_stride``.

    Args:
        examples: Batched rows containing the question, context and answers columns.

    Returns:
        The tokenizer output extended with ``start_positions`` and
        ``end_positions`` lists (one entry per feature).
    """
    question_column_name = "question" if "question" in column_names else column_names[0]
    context_column_name = "context" if "context" in column_names else column_names[1]
    answer_column_name = "answers" if "answers" in column_names else column_names[2]

    tokenized_examples = tokenizer(
        examples[question_column_name],
        examples[context_column_name],
        truncation="only_second",  # truncate only the second part of the pair (the context)
        max_length=max_seq_length,
        stride=doc_stride,
        return_overflowing_tokens=True,  # return the chunks that overflow max_length
        return_offsets_mapping=True,  # return (char_start, char_end) for every token
        padding="max_length", return_token_type_ids=False
    )

    # One example can map to several features, so keep the feature -> example mapping.
    overflow_to_sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # Character span in the original text for every token of every feature.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    # Label lists to be filled below.
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Marks which tokens belong to the question (0) and the context (1).
        sequence_ids = tokenized_examples.sequence_ids(i)

        # Example this feature was cut from.
        example_index = overflow_to_sample_mapping[i]
        answers = examples[answer_column_name][example_index]

        # No annotated answer: label with CLS instead of failing on an empty list.
        if len(answers["answer_start"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
            continue

        # Character offsets of the answer in the context.
        answer_start_offset = answers["answer_start"][0]
        answer_end_offset = answer_start_offset + len(answers["text"][0])

        # First context token of this feature.
        token_start_index = 0
        while sequence_ids[token_start_index] != 1:
            token_start_index += 1

        # Last context token of this feature.
        token_end_index = len(input_ids) - 1
        while sequence_ids[token_end_index] != 1:
            token_end_index -= 1

        # Answer lies outside this feature's span: label with CLS.
        if not (offsets[token_start_index][0] <= answer_start_offset and offsets[token_end_index][1] >= answer_end_offset):
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Move both indices inward until they sit on the answer boundaries.
            while token_start_index < len(offsets) and offsets[token_start_index][0] <= answer_start_offset:
                token_start_index += 1
            tokenized_examples["start_positions"].append(token_start_index - 1)
            while offsets[token_end_index][1] >= answer_end_offset:
                token_end_index -= 1
            tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

In [55]:
# Tokenize the MRC training set in batches; the raw columns are dropped so only
# the features returned by prepare_train_features remain.
column_names = mrc_train_dataset.column_names
train_dataset = mrc_train_dataset.map(
            prepare_train_features,
            batched=True,
            num_proc=preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=True,
        )

Map: 100%|██████████| 3952/3952 [00:02<00:00, 1426.98 examples/s]


In [82]:
from datasets import load_from_disk

# Load the dataset
mrc_validation_dataset_path = "/data/ephemeral/home/level2-mrc-nlp-15/data/train_dataset/validation"  # change to the actual dataset path
mrc_validation_dataset = load_from_disk(mrc_validation_dataset_path)
mrc_validation_dataset

Dataset({
    features: ['title', 'context', 'question', 'id', 'answers', 'document_id', '__index_level_0__'],
    num_rows: 240
})

In [83]:
# Column buffers used to rebuild the validation set in the same format as the KorQuAD dataset.
id_list0 = []
title_list0 = []
context_list0 = []
question_list0 = []
answers_list0 = []

In [84]:
id_list0

[]

In [85]:

# Copy the KorQuAD-style fields of every validation row into the column buffers.
validation_frame = pd.DataFrame(mrc_validation_dataset)
text_targets = (
    (title_list0, 'title'),
    (context_list0, 'context'),
    (question_list0, 'question'),
)
for _, record in validation_frame.iterrows():
    id_list0.append(record['id'])
    for buffer, column in text_targets:
        buffer.append(str(record[column]))
    answers_list0.append(record['answers'])


In [88]:
# Rebind the name to a plain column dict holding only the KorQuAD-style fields.
mrc_validation_dataset = {
    "id" : id_list0,
    "title" : title_list0,
    "context" : context_list0,
    "question" : question_list0,
    "answers" : answers_list0,}

In [91]:
from datasets import Dataset

# Turn the column dict back into a Dataset and display it.
mrc_validation_dataset= Dataset.from_dict(mrc_validation_dataset)
mrc_validation_dataset

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 240
})

In [56]:
def prepare_validation_features(examples):
    """Tokenize question/context pairs for evaluation and keep what post-processing needs.

    Long contexts are split into overlapping features exactly as in training.
    Each feature records the id of the example it came from, and offsets of
    tokens outside the context are replaced by ``None``.

    Reads the notebook-level ``tokenizer``, ``max_seq_length`` and ``doc_stride``.

    Args:
        examples: Batched rows with ``"question"``, ``"context"`` and ``"id"`` columns.

    Returns:
        The tokenizer output with an added ``example_id`` list and a masked
        ``offset_mapping``.
    """
    tokenized_examples = tokenizer(
        examples['question'],
        examples['context'],
        truncation="only_second",
        max_length=max_seq_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
        # Kept consistent with prepare_train_features: the model has a single
        # token-type embedding, so segment ids must not be fed to it.
        return_token_type_ids=False,
    )

    # Feature index -> index of the example it was cut from.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

    # sequence_ids value that marks context tokens (the second element of the pair).
    context_index = 1

    tokenized_examples["example_id"] = []

    for i, sample_index in enumerate(sample_mapping):
        sequence_ids = tokenized_examples.sequence_ids(i)

        tokenized_examples["example_id"].append(examples["id"][sample_index])

        # Keep offsets only for context tokens; everything else becomes None.
        tokenized_examples["offset_mapping"][i] = [
            (o if sequence_ids[k] == context_index else None)
            for k, o in enumerate(tokenized_examples["offset_mapping"][i])
        ]

    return tokenized_examples

In [92]:
# Tokenize the validation set. The columns to drop are taken from the validation
# dataset itself: the notebook-level `column_names` comes from the training set,
# which has columns (`document_id`, `__index_level_0__`) that the rebuilt
# validation dataset no longer contains.
eval_dataset = mrc_validation_dataset.map(
            prepare_validation_features,
            batched=True,
            num_proc=preprocessing_num_workers,
            remove_columns=mrc_validation_dataset.column_names,
            load_from_cache_file=True,
        )

Map: 100%|██████████| 240/240 [00:00<00:00, 846.63 examples/s]


## Question Answering Class 정의

In [93]:
# default_data_collator: 여러개 example들을 collator해주는 역할,
# TrainingArguments : 한번에 training arguments들을 합쳐서 주는..!
from transformers import default_data_collator, TrainingArguments, EvalPrediction

In [94]:
# coding=utf-8
# Copyright 2020 The HuggingFace Team All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Question-Answering task와 관련된 'Trainer'의 subclass 코드 입니다.
"""

from transformers import Trainer, is_datasets_available, is_torch_tpu_available
from transformers.trainer_utils import PredictionOutput

if is_datasets_available():
    import datasets

# Trainer subclass for question answering, built on the Hugging Face Trainer.
class QuestionAnsweringTrainer(Trainer):
    """Trainer that post-processes QA predictions before computing metrics.

    Args:
        eval_examples: Un-tokenized examples matching the evaluation dataset.
        post_process_function: Callable invoked as
            ``fn(examples, dataset, predictions, args)``.
        *args, **kwargs: Forwarded unchanged to ``Trainer``.
    """

    def __init__(self, *args, eval_examples=None, post_process_function=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.post_process_function = post_process_function
        self.eval_examples = eval_examples

    def _loop_without_metrics(self, dataloader, ignore_keys):
        """Run the prediction loop with ``compute_metrics`` temporarily disabled."""
        saved_compute_metrics = self.compute_metrics
        self.compute_metrics = None
        try:
            return self.prediction_loop(
                dataloader,
                description="Evaluation",
                # Without a metric there is no reason to gather predictions,
                # so only the loss is requested in that case.
                prediction_loss_only=True if saved_compute_metrics is None else None,
                ignore_keys=ignore_keys,
            )
        finally:
            # Always restore the metric function, even if the loop raised.
            self.compute_metrics = saved_compute_metrics

    @staticmethod
    def _expose_all_columns(dataset):
        """Reset the dataset format so every feature column is returned again."""
        if not isinstance(dataset, datasets.Dataset):
            return
        dataset.set_format(
            type=dataset.format["type"],
            columns=list(dataset.features.keys()),
        )

    def evaluate(self, eval_dataset=None, eval_examples=None, ignore_keys=None):
        """Evaluate, post-process the predictions, log and return the metrics."""
        if eval_dataset is None:
            eval_dataset = self.eval_dataset
        eval_dataloader = self.get_eval_dataloader(eval_dataset)
        if eval_examples is None:
            eval_examples = self.eval_examples

        output = self._loop_without_metrics(eval_dataloader, ignore_keys)
        self._expose_all_columns(eval_dataset)

        can_score = self.post_process_function is not None and self.compute_metrics is not None
        if can_score:
            eval_preds = self.post_process_function(
                eval_examples, eval_dataset, output.predictions, self.args
            )
            metrics = self.compute_metrics(eval_preds)

            self.log(metrics)
        else:
            metrics = {}

        self.control = self.callback_handler.on_evaluate(
            self.args, self.state, self.control, metrics
        )
        return metrics

    def predict(self, test_dataset, test_examples, ignore_keys=None):
        """Predict on a test set; returns post-processed predictions when possible.

        When either ``post_process_function`` or ``compute_metrics`` is missing,
        the raw prediction-loop output is returned instead.
        """
        test_dataloader = self.get_test_dataloader(test_dataset)

        # Same procedure as in `evaluate`.
        output = self._loop_without_metrics(test_dataloader, ignore_keys)

        if self.post_process_function is None or self.compute_metrics is None:
            return output

        self._expose_all_columns(test_dataset)

        return self.post_process_function(
            test_examples, test_dataset, output.predictions, self.args
        )


## 후처리 유틸리티 함수 정의

In [95]:
# coding=utf-8
# Copyright 2020 The HuggingFace Team All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the 'License');
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an 'AS IS' BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Pre-processing
Post-processing utilities for question answering.
"""
import collections
import json
import logging
import os
import random
from typing import Any, Optional, Tuple

import numpy as np
import torch
from arguments import DataTrainingArguments, ModelArguments
from datasets import DatasetDict
from tqdm.auto import tqdm
from transformers import PreTrainedTokenizerFast, TrainingArguments, is_torch_available
from transformers.trainer_utils import get_last_checkpoint

#from utils.datetime_helper import get_seoul_datetime_str

logger = logging.getLogger(__name__)


def set_seed(seed: int = 2024):
    """
    Seed the Python, NumPy and (when available) PyTorch random number generators.

    Args:
        seed (:obj:`int`): The seed to set.
    """
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)

    if not is_torch_available():
        return

    # CPU generator, current CUDA device, then all CUDA devices (multi-GPU).
    for seed_fn in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


def postprocess_qa_predictions(
    examples,
    features,
    predictions: Tuple[np.ndarray, np.ndarray],
    version_2_with_negative: bool = False,
    n_best_size: int = 20,
    max_answer_length: int = 30,
    null_score_diff_threshold: float = 0.0,
    output_dir: Optional[str] = None,
    prefix: Optional[str] = None,
    is_world_process_zero: bool = True,
):
    """
    Post-process the predictions of a QA model.
    The model returns start and end logits, so they have to be converted back into
    substrings of the original context.

    Args:
        examples: The non-preprocessed dataset (see the main script for more information).
        features: The preprocessed dataset (see the main script for more information).
        predictions (:obj:`Tuple[np.ndarray, np.ndarray]`):
            The model predictions: two arrays holding the start logits and the end logits.
            Their first dimension must match the number of elements of :obj:`features`.
        version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether the dataset contains examples that have no answer.
        n_best_size (:obj:`int`, `optional`, defaults to 20):
            Total number of n-best predictions to generate when looking for an answer.
        max_answer_length (:obj:`int`, `optional`, defaults to 30):
            Maximum length (in tokens) of an answer that can be generated.
        null_score_diff_threshold (:obj:`float`, `optional`, defaults to 0):
            Threshold used to select the null answer:
            if the best answer has a score that is less than the score of
            the null answer minus this threshold, the null answer is selected for this example (note that the score of
            the null answer for an example giving several features is the minimum of the scores for the null answer on
            each feature: all features must be aligned on the fact they `want` to predict a null answer).
            Only useful when :obj:`version_2_with_negative` is :obj:`True`.
        output_dir (:obj:`str`, `optional`):
            Directory in which the result dictionaries are saved as JSON:
            the predictions, the n-best predictions (with their scores and logits) and,
            if :obj:`version_2_with_negative=True`, the score differences between best and null answers.
        prefix (:obj:`str`, `optional`):
            If given, it is included in the names of the saved files.
        is_world_process_zero (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether this process is the main process (used to choose the logger level).

    Returns:
        An ordered dict mapping each example id to its predicted answer text.
    """
    assert (
        len(predictions) == 2
    ), "`predictions` should be a tuple with two elements (start_logits, end_logits)."
    all_start_logits, all_end_logits = predictions

    assert len(predictions[0]) == len(
        features
    ), f"Got {len(predictions[0])} predictions and {len(features)} features."

    # Build the map from each example to the indices of its features.
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[example_id_to_index[feature["example_id"]]].append(i)

    # Ordered dicts that collect the predictions and the n-best lists.
    all_predictions = collections.OrderedDict()
    all_nbest_json = collections.OrderedDict()
    if version_2_with_negative:
        scores_diff_json = collections.OrderedDict()

    # Logging.
    logger.setLevel(logging.INFO if is_world_process_zero else logging.WARN)
    logger.info(
        f"Post-processing {len(examples)} example predictions split into {len(features)} features."
    )

    # Main loop over all examples.
    for example_index, example in enumerate(tqdm(examples)):
        # Indices of the features that belong to the current example.
        feature_indices = features_per_example[example_index]

        min_null_prediction = None
        prelim_predictions = []

        # Go through every feature of the current example.
        for feature_index in feature_indices:
            # Grab the model predictions for this feature.
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            # Maps token positions (logit indices) to character spans in the original context.
            offset_mapping = features[feature_index]["offset_mapping"]
            # Optional: `token_is_max_context`; when provided, answers whose start token does not
            # have maximum context in this feature are discarded.
            token_is_max_context = features[feature_index].get(
                "token_is_max_context", None
            )

            # Update the minimum null prediction.
            feature_null_score = start_logits[0] + end_logits[0]
            if (
                min_null_prediction is None
                or min_null_prediction["score"] > feature_null_score
            ):
                min_null_prediction = {
                    "offsets": (0, 0),
                    "score": feature_null_score,
                    "start_logit": start_logits[0],
                    "end_logit": end_logits[0],
                }

            # Take the `n_best_size` highest start and end logits.
            start_indexes = np.argsort(start_logits)[
                -1 : -n_best_size - 1 : -1
            ].tolist()

            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()

            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip out-of-scope answers (index beyond the offsets or not inside the context).
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                    ):
                        continue
                    # Skip answers with length < 0 or > max_answer_length.
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue
                    # Skip answers whose start token does not have maximum context.
                    if (
                        token_is_max_context is not None
                        and not token_is_max_context.get(str(start_index), False)
                    ):
                        continue
                    prelim_predictions.append(
                        {
                            "offsets": (
                                offset_mapping[start_index][0],
                                offset_mapping[end_index][1],
                            ),
                            "score": start_logits[start_index] + end_logits[end_index],
                            "start_logit": start_logits[start_index],
                            "end_logit": end_logits[end_index],
                        }
                    )

        if version_2_with_negative:
            # Add the minimum null prediction.
            prelim_predictions.append(min_null_prediction)
            null_score = min_null_prediction["score"]

        # Keep only the best `n_best_size` predictions.
        predictions = sorted(
            prelim_predictions, key=lambda x: x["score"], reverse=True
        )[:n_best_size]

        # Add the minimum null prediction back if it was dropped because of its low score.
        if version_2_with_negative and not any(
            p["offsets"] == (0, 0) for p in predictions
        ):
            predictions.append(min_null_prediction)

        # Use the offsets to gather the answer text from the original context.
        context = example["context"]
        for pred in predictions:
            offsets = pred.pop("offsets")
            pred["text"] = context[offsets[0] : offsets[1]]

        # In the rare edge case where there is no non-null prediction, insert a fake prediction to avoid failure.
        if len(predictions) == 0 or (
            len(predictions) == 1 and predictions[0]["text"] == ""
        ):

            predictions.insert(
                0, {"text": "empty", "start_logit": 0.0, "end_logit": 0.0, "score": 0.0}
            )

        # Softmax over all scores (done with numpy to stay independent from torch/tf in this file;
        # the max score is subtracted before exponentiating).
        scores = np.array([pred.pop("score") for pred in predictions])
        exp_scores = np.exp(scores - np.max(scores))
        probs = exp_scores / exp_scores.sum()

        # Attach the probabilities to the predictions.
        for prob, pred in zip(probs, predictions):
            pred["probability"] = prob

        # Pick the best prediction.
        if not version_2_with_negative:
            all_predictions[example["id"]] = predictions[0]["text"]
        else:
            # Otherwise, first find the best non-empty prediction.
            i = 0
            while predictions[i]["text"] == "":
                i += 1
            best_non_null_pred = predictions[i]

            # Compare against the null prediction using the threshold.
            score_diff = (
                null_score
                - best_non_null_pred["start_logit"]
                - best_non_null_pred["end_logit"]
            )
            scores_diff_json[example["id"]] = float(score_diff)  # to be JSON-serializable
            if score_diff > null_score_diff_threshold:
                all_predictions[example["id"]] = ""
            else:
                all_predictions[example["id"]] = best_non_null_pred["text"]

        # Cast numpy floats back to float so that `predictions` is JSON-serializable.
        all_nbest_json[example["id"]] = [
            {
                k: (
                    float(v)
                    if isinstance(v, (np.float16, np.float32, np.float64))
                    else v
                )
                for k, v in pred.items()
            }
            for pred in predictions
        ]

    # If an output_dir is given, save all the dicts.
    if output_dir is not None:
        assert os.path.isdir(output_dir), f"{output_dir} is not a directory."

        prediction_file = os.path.join(
            output_dir,
            "predictions.json" if prefix is None else f"predictions_{prefix}.json",
        )
        nbest_file = os.path.join(
            output_dir,
            "nbest_predictions.json"
            if prefix is None
            else f"nbest_predictions_{prefix}.json",
        )
        if version_2_with_negative:
            null_odds_file = os.path.join(
                output_dir,
                "null_odds.json" if prefix is None else f"null_odds_{prefix}.json",
            )

        logger.info(f"Saving predictions to {prediction_file}.")
        with open(prediction_file, "w", encoding="utf-8") as writer:
            writer.write(
                json.dumps(all_predictions, indent=4, ensure_ascii=False) + "\n"
            )
        logger.info(f"Saving nbest_preds to {nbest_file}.")
        with open(nbest_file, "w", encoding="utf-8") as writer:
            writer.write(
                json.dumps(all_nbest_json, indent=4, ensure_ascii=False) + "\n"
            )
        if version_2_with_negative:
            logger.info(f"Saving null_odds to {null_odds_file}.")
            with open(null_odds_file, "w", encoding="utf-8") as writer:
                writer.write(
                    json.dumps(scores_diff_json, indent=4, ensure_ascii=False) + "\n"
                )

    return all_predictions


def check_no_error(
    data_args: DataTrainingArguments,
    training_args: TrainingArguments,
    datasets: DatasetDict,
    tokenizer,
) -> Tuple[Any, int]:
    """Validate the run configuration before training/evaluation starts.

    Args:
        data_args: Data arguments; only ``max_seq_length`` is read.
        training_args: Training arguments; ``output_dir``, ``do_train`` and
            ``overwrite_output_dir`` are read.
        datasets: Dataset mapping that must contain a ``"validation"`` split.
        tokenizer: Tokenizer to use; must be a fast tokenizer.

    Returns:
        ``(last_checkpoint, max_seq_length)`` where ``last_checkpoint`` is the
        checkpoint found in ``output_dir`` (or ``None``) and ``max_seq_length``
        is capped at ``tokenizer.model_max_length``.

    Raises:
        ValueError: If the output directory is non-empty without a checkpoint,
            the tokenizer is not a fast tokenizer, or there is no
            ``"validation"`` split.
    """
    # Look for the last checkpoint.
    last_checkpoint = None
    if (
        os.path.isdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome."
            )
        elif last_checkpoint is not None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Tokenizer check: this script requires a fast tokenizer.
    if not isinstance(tokenizer, PreTrainedTokenizerFast):
        raise ValueError(
            "This example script only works for models that have a fast tokenizer. Checkout the big table of models "
            "at https://huggingface.co/transformers/index.html#bigtable to find the model types that meet this "
            "requirement"
        )

    if data_args.max_seq_length > tokenizer.model_max_length:
        # `Logger.warn` is deprecated in favour of `Logger.warning`; the first
        # message fragment now ends with a space so the sentence reads
        # "... for the model (...)" instead of "... for themodel (...)".
        logger.warning(
            f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the "
            f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
        )
    max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)

    # NOTE(review): raised regardless of `training_args.do_eval`, although the
    # message mentions --do_eval — confirm this is intended.
    if "validation" not in datasets:
        raise ValueError("--do_eval requires a validation dataset")
    return last_checkpoint, max_seq_length


### 후처리 함수 정의

In [96]:
# Map model outputs (start/end logits) back to human-readable answer text.
def post_processing_function(examples, features, predictions, args=None):
    """Turn raw QA logits into the structure the metric expects.

    Args:
        examples: Un-tokenized evaluation examples; each row is read for its
            ``"id"`` and ``"answers"`` fields.
        features: Tokenized features built from ``examples``.
        predictions: Model output, forwarded to ``postprocess_qa_predictions``.
        args: Optional arguments object providing ``output_dir``.
            ``QuestionAnsweringTrainer`` passes ``self.args`` as a fourth
            positional argument; when omitted, the notebook-level
            ``training_args`` is used instead.

    Returns:
        An ``EvalPrediction`` whose ``predictions`` is a list of
        ``{"id", "prediction_text"}`` dicts and whose ``label_ids`` is a list
        of ``{"id", "answers"}`` dicts.
    """
    run_args = training_args if args is None else args

    # Post-processing: we match the start logits and end logits to answers in the original context.
    predictions = postprocess_qa_predictions(
        examples=examples,
        features=features,
        predictions=predictions,
        version_2_with_negative=False,
        n_best_size=n_best_size,
        max_answer_length=max_answer_length,
        null_score_diff_threshold=0.0,
        output_dir=run_args.output_dir,
        is_world_process_zero=trainer.is_world_process_zero(),
    )

    # Format the result to the format the metric expects.
    formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()]
    # References come from the examples that were actually scored, so ids always
    # line up with the predictions regardless of which dataset is evaluated.
    references = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]
    return EvalPrediction(predictions=formatted_predictions, label_ids=references)

In [97]:
def compute_metrics(p: EvalPrediction):
    """Score an ``EvalPrediction`` with the module-level ``metric`` object.

    Args:
        p: Object exposing ``predictions`` and ``label_ids``; they are passed
            to ``metric.compute`` as ``predictions`` and ``references``.

    Returns:
        Whatever ``metric.compute`` returns.
    """
    scorer = metric.compute
    scores = scorer(predictions=p.predictions, references=p.label_ids)
    return scores

## Train 2차

In [98]:
import wandb

# Log in to Weights & Biases and start a run for the training that follows.
wandb.login()
wandb.init(project='odqa', # name of the project the experiment logs are kept under
           entity='nlp15', # user name or team name
          )

In [99]:
import torch

# Check whether CUDA is available. As the last expression of the cell, the
# boolean result is displayed as the cell output.
#device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.cuda.is_available()

True

In [100]:
# Training configuration for the second fine-tuning stage.
# `batch_size` and `num_train_epochs` are defined elsewhere in the notebook.
training_args = TrainingArguments(
    output_dir="outputs",
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_train_epochs,
    save_total_limit=1,  # maximum number of checkpoints to keep
    #evaluation_strategy="steps",
    #eval_steps=500,  # evaluate every N steps
    #logging_steps=500,  # log every N steps
    #load_best_model_at_end=True
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [101]:
# Build the QA trainer. `eval_dataset` and `eval_examples` are passed
# separately, and the post-processing and metric functions are handed over
# as callables.
trainer = QuestionAnsweringTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        eval_examples=val_dataset,
        tokenizer=tokenizer,
        data_collator=default_data_collator, # usually the default collator
        post_process_function=post_processing_function, # takes the function itself as input
        compute_metrics=compute_metrics
    )

In [102]:
# Run the second-stage fine-tuning and keep the object returned by `train()`.
train_result = trainer.train()

***** Running training *****
  Num examples = 5769
  Num Epochs = 4
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1444
  Number of trainable parameters = 335608834
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss
500,0.7901
1000,0.2584


Saving model checkpoint to outputs/checkpoint-500
Configuration saved in outputs/checkpoint-500/config.json
Model weights saved in outputs/checkpoint-500/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-500/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-500/special_tokens_map.json
Deleting older checkpoint [outputs/checkpoint-3500] due to args.save_total_limit
Saving model checkpoint to outputs/checkpoint-1000
Configuration saved in outputs/checkpoint-1000/config.json
Model weights saved in outputs/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-1000/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-1000/special_tokens_map.json
Deleting older checkpoint [outputs/checkpoint-500] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)




In [103]:
# Close the Weights & Biases run started with `wandb.init` before training.
wandb.finish()

0,1
train/epoch,▁▅█
train/global_step,▁▅█
train/learning_rate,█▁
train/loss,█▁
train/total_flos,▁
train/train_loss,▁
train/train_runtime,▁
train/train_samples_per_second,▁
train/train_steps_per_second,▁

0,1
train/epoch,4.0
train/global_step,1444.0
train/learning_rate,2e-05
train/loss,0.2584
train/total_flos,2.143084255034573e+16
train/train_loss,0.38528
train/train_runtime,2161.033
train/train_samples_per_second,10.678
train/train_steps_per_second,0.668


In [104]:
# Display the fine-tuned model's module structure as the cell output.
model

RobertaForQuestionAnswering(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(32000, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
      

## 허깅페이스에 모델 업로드

In [105]:
!sudo apt-get install git-lfs

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
/bin/bash: sudo: command not found


In [106]:
import os
from getpass import getpass

from transformers import AutoModel
from transformers import AutoTokenizer


# Hugging Face access token.
# Do not paste the token into the notebook. It is read from the HF_TOKEN
# environment variable, with a hidden interactive prompt as the fallback, so
# the cell is valid Python as written and the secret never lands in the file.
ACCESS_TOKEN = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")

# Upload the fine-tuned model and its tokenizer to the same Hub repository.
HUB_REPO_NAME = 'klue_roberta_large_finetuned_korquad_v2'
model.push_to_hub(HUB_REPO_NAME, use_temp_dir=True, use_auth_token=ACCESS_TOKEN)
tokenizer.push_to_hub(HUB_REPO_NAME, use_temp_dir=True, use_auth_token=ACCESS_TOKEN)
 

Configuration saved in /tmp/tmpvxgja89o/config.json
Model weights saved in /tmp/tmpvxgja89o/pytorch_model.bin
Uploading the following files to ssunbear/klue_roberta_large_finetuned_korquad_v2: pytorch_model.bin,config.json
pytorch_model.bin: 100%|██████████| 1.34G/1.34G [00:56<00:00, 23.7MB/s]   
tokenizer config file saved in /tmp/tmp1q6nsung/tokenizer_config.json
Special tokens file saved in /tmp/tmp1q6nsung/special_tokens_map.json
Uploading the following files to ssunbear/klue_roberta_large_finetuned_korquad_v2: tokenizer_config.json,vocab.txt,tokenizer.json,special_tokens_map.json


CommitInfo(commit_url='https://huggingface.co/ssunbear/klue_roberta_large_finetuned_korquad_v2/commit/32b49af3b113769f65b0ab2392cfb81d4c159962', commit_message='Upload tokenizer', commit_description='', oid='32b49af3b113769f65b0ab2392cfb81d4c159962', pr_url=None, repo_url=RepoUrl('https://huggingface.co/ssunbear/klue_roberta_large_finetuned_korquad_v2', endpoint='https://huggingface.co', repo_type='model', repo_id='ssunbear/klue_roberta_large_finetuned_korquad_v2'), pr_revision=None, pr_num=None)

In [None]:
import logging
import os
import sys
import wandb

from datasets import DatasetDict
import evaluate
import argparse
from trainer_qa import QuestionAnsweringTrainer
from transformers import (
    AutoConfig,
    AutoModelForQuestionAnswering,
    AutoTokenizer,
    DataCollatorWithPadding,
    EvalPrediction,
    TrainingArguments,
)
from utils_qa import set_seed, check_no_error, postprocess_qa_predictions
from omegaconf import OmegaConf
from omegaconf import DictConfig
from utils.naming import wandb_naming
from prepare_dataset import prepare_dataset

## FINISH


In [1]:
from CNN_layer_model import CNN_RobertaForQuestionAnswering
from transformers import (
    AutoConfig,
    AutoModelForQuestionAnswering,
    AutoTokenizer
)

# Hub checkpoint that the config, tokenizer and model are all loaded from.
model_name = "CurtisJeon/klue-roberta-large-korquad_v1_qa"

config = AutoConfig.from_pretrained(
    model_name
)
# Request the fast tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=True
)
# Load the checkpoint into the custom CNN-augmented QA model class,
# passing the config loaded above explicitly.
model = CNN_RobertaForQuestionAnswering.from_pretrained(
    model_name,
    config=config)

  from .autonotebook import tqdm as notebook_tqdm


In [23]:
from transformers import (
    AutoConfig,
    AutoTokenizer
)
from custom_model_copy import CNN_RobertaForQuestionAnswering


In [24]:
# Hub checkpoint that the config, tokenizer and model are all loaded from.
model_name = "CurtisJeon/klue-roberta-large-korquad_v1_qa"

# NOTE(review): `config` is loaded here but is not passed to
# `from_pretrained` below -- confirm whether it is still needed.
config = AutoConfig.from_pretrained(
    model_name
)
# Request the fast tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=True
)
# Load the checkpoint into the custom CNN-augmented QA model class. The
# `cnn_block*` weights are not in the checkpoint and are newly initialized
# (see the warning printed after this cell).
model = CNN_RobertaForQuestionAnswering.from_pretrained(
    model_name)

Some weights of CNN_RobertaForQuestionAnswering were not initialized from the model checkpoint at CurtisJeon/klue-roberta-large-korquad_v1_qa and are newly initialized: ['cnn_block2.layer_norm.bias', 'cnn_block4.conv2.bias', 'cnn_block5.conv2.weight', 'cnn_block2.conv1.weight', 'cnn_block2.conv1.bias', 'cnn_block4.conv2.weight', 'cnn_block4.layer_norm.bias', 'cnn_block1.layer_norm.bias', 'cnn_block2.layer_norm.weight', 'cnn_block4.layer_norm.weight', 'cnn_block4.conv1.weight', 'cnn_block5.layer_norm.bias', 'cnn_block3.layer_norm.bias', 'cnn_block4.conv1.bias', 'cnn_block1.conv2.weight', 'cnn_block1.layer_norm.weight', 'cnn_block5.layer_norm.weight', 'cnn_block3.conv2.weight', 'cnn_block3.conv2.bias', 'cnn_block5.conv1.bias', 'cnn_block3.conv1.weight', 'cnn_block2.conv2.bias', 'cnn_block3.layer_norm.weight', 'cnn_block1.conv1.weight', 'cnn_block1.conv1.bias', 'cnn_block2.conv2.weight', 'cnn_block5.conv1.weight', 'cnn_block3.conv1.bias', 'cnn_block1.conv2.bias', 'cnn_block5.conv2.bias']


In [25]:
# Display the CNN-augmented model's module structure as the cell output.
model

CNN_RobertaForQuestionAnswering(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(32000, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
  