In [2]:
# Log every W&B run started from this notebook under the "PII" project.
%env WANDB_PROJECT=PII

env: WANDB_PROJECT=PII


In [1]:
# W&B run group for this experiment; the name mirrors the stride / gamma
# settings configured below, so update it whenever those change.
%env WANDB_RUN_GROUP=base-stride-filter-original-stride=384-gamma=0-v2

env: WANDB_RUN_GROUP=base-stride-filter-original-stride=384-gamma=0-v2


In [3]:
!pip install -q seqeval evaluate

[0m

In [4]:
import json
import copy
import gc
import os
import re
from functools import reduce
from collections import defaultdict
from pathlib import Path

import torch
from torch import Tensor, nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
from spacy.lang.en import English
from transformers.tokenization_utils import PreTrainedTokenizerBase
from transformers.models.deberta_v2 import DebertaV2ForTokenClassification, DebertaV2TokenizerFast
from transformers.trainer import Trainer
from transformers.training_args import TrainingArguments
from transformers.trainer_utils import EvalPrediction, PredictionOutput
from transformers.data.data_collator import DataCollatorForTokenClassification
from sklearn.model_selection import KFold
from datasets import Dataset, DatasetDict, concatenate_datasets
from seqeval.metrics import recall_score, precision_score
import wandb

In [5]:
# Never hardcode the W&B API key in the notebook: read it from the environment
# (export WANDB_API_KEY=... before starting the kernel). With key=None wandb
# falls back to its own credential lookup. The key that used to be committed
# on this line must be considered leaked and should be rotated.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

## Config & Parameters

In [14]:
# Directory holding train.json and the optional extra-data JSON files.
DATA_DIR = Path("../dataset/")
# Root folder for per-fold checkpoints / eval results and the full-data model.
OUTPUT_DIR = "output2"
# Create the output root up front; a no-op when it already exists.
Path(OUTPUT_DIR).mkdir(exist_ok=True)

In [15]:
# Checkpoint to fine-tune (used for both the tokenizer and the model).
# Alternatives are kept commented out for quick switching.
# TRAINING_MODEL_PATH = "hf-internal-testing/tiny-random-deberta-v2" 
TRAINING_MODEL_PATH = "microsoft/deberta-v3-base"
# TRAINING_MODEL_PATH = "Gladiator/microsoft-deberta-v3-large_ner_conll2003"
# Window length in sub-word tokens, special tokens included (shorter for the tiny debug checkpoint).
MAX_LENGTH = 1024  if "tiny-random" not in TRAINING_MODEL_PATH else 512
# Passed as the tokenizer `stride` when windowing training documents.
TRAIN_STRIDE = 384
# Overlap (in sub-word tokens) between consecutive windows during evaluation.
EVAL_STRIDE = 384
# Threshold applied to the "O" score when post-processing predictions for the metric.
CONF_THRESH = 0.9
LR = 2.5e-5  # 1.5e-5 ~ 3e-5 for base # 5e-6 ~ 1e-5 for large
LR_SCHEDULER_TYPE = "linear"
# A fractional epoch is enough for the tiny debug checkpoint.
NUM_EPOCHS = 5 if "tiny-random" not in TRAINING_MODEL_PATH else 0.1
BATCH_SIZE = 16
# Evaluation feeds one whole document at a time; the sliding-window prediction step reads row 0 only.
EVAL_BATCH_SIZE = 1
GRAD_ACCUMULATION_STEPS = 1
WARMUP_RATIO = 0.1
WEIGHT_DECAY = 0.01
# Mixed precision (forwarded to TrainingArguments.fp16).
AMP = True
# Freeze the embedding layer / the first FREEZE_LAYERS encoder layers.
FREEZE_EMBEDDING = False
FREEZE_LAYERS = 0
# Focal-loss exponent; with 0 the focal factor is 1, i.e. plain cross-entropy.
GAMMA = 0
# Per-token probability of replacing an input id with the mask token during training (0 disables it).
MASK_P = 0
# training data
# Number of CV folds (documents are assigned by document id modulo N_SPLITS).
N_SPLITS = 4
# Drop the last two thirds of the all-"O" documents from the training rows.
FILTER_ORIGINAL = True
# Switches for appending the optional external datasets to the training data.
MOTH = False
PJMATHMATICIAN = False
NICHOLAS = False
MPWARE = False

In [16]:
# Shared Trainer configuration; run_name / output_dir (and, for the final
# full-data run, the evaluation / save strategies) are overridden later.
args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    fp16=AMP,  # mixed-precision training
    learning_rate=LR,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUMULATION_STEPS,
    report_to="wandb",
    evaluation_strategy="steps",
    eval_steps=50,  # evaluate every 50 optimizer steps ...
    save_strategy="steps",
    save_steps=50,  # ... and checkpoint on the same schedule
    save_total_limit=1,
    logging_steps=10,
    metric_for_best_model="f5",  # key produced by the metrics callable
    greater_is_better=True,
    # NOTE(review): the best-f5 checkpoint is not reloaded, so the evaluate()
    # call issued after train() scores the final weights.
    load_best_model_at_end=False,
    overwrite_output_dir=True,
    lr_scheduler_type=LR_SCHEDULER_TYPE,
    warmup_ratio=WARMUP_RATIO,
    weight_decay=WEIGHT_DECAY,
)

## Dataset Preparation

In [17]:
# Competition training set: a list of documents read from train.json.
with DATA_DIR.joinpath("train.json").open("r") as f:
    original_data = json.load(f)

# Optional external datasets, appended according to the config switches.
extra_data = []  #

if MOTH:
    with DATA_DIR.joinpath("pii_dataset_fixed.json").open("r") as f:
        moth = json.load(f)
    print("Moth datapoints: ", len(moth))
    extra_data.extend(moth)

if PJMATHMATICIAN:
    with DATA_DIR.joinpath("moredata_dataset_fixed.json").open("r") as f:
        pjmathmatician = json.load(f)
    print("PJMathmatician datapoints: ", len(pjmathmatician))
    extra_data.extend(pjmathmatician)

if NICHOLAS:
    with DATA_DIR.joinpath("mixtral-8x7b-v1.json").open("r") as f:
        nicholas = json.load(f)
    print("Nicholas datapoints: ", len(nicholas))
    extra_data.extend(nicholas)
    
# NOTE(review): this branch opens the same file as the NICHOLAS branch above
# ("mixtral-8x7b-v1.json"). Enabling both flags would add that data twice;
# this looks like a copy-paste slip -- confirm the intended MPWARE file name.
if MPWARE:
    with DATA_DIR.joinpath("mixtral-8x7b-v1.json").open("r") as f:
        mpware = json.load(f)
    print("MPWARE datapoints: ", len(mpware))
    extra_data.extend(mpware)
    
print(f"len(extra_data): {len(extra_data)}")

len(extra_data): 0


In [18]:
# Tag vocabulary for token classification. The outside tag "O" is deliberately
# last, so slicing scores up to its index selects exactly the PII tags.
all_labels = [
    'B-EMAIL',
    'B-ID_NUM',
    'B-NAME_STUDENT',
    'B-PHONE_NUM',
    'B-STREET_ADDRESS',
    'B-URL_PERSONAL',
    'B-USERNAME',
    'I-ID_NUM',
    'I-NAME_STUDENT',
    'I-PHONE_NUM',
    'I-STREET_ADDRESS',
    'I-URL_PERSONAL',
    'O',
]
# Tag <-> index lookups (index = position in all_labels).
label2id = {tag: idx for idx, tag in enumerate(all_labels)}
id2label = {idx: tag for tag, idx in label2id.items()}
# Every tag except the outside tag.
target = [tag for tag in all_labels if tag != "O"]

## Tokenization

In [19]:
class CustomTokenizer:
    """Encode word-level PII examples into sub-word inputs with aligned label ids.

    An example is a dict holding three parallel lists: ``tokens`` (word
    strings), ``provided_labels`` (one tag per word) and
    ``trailing_whitespace`` (whether a space follows the word). The document
    text is rebuilt from these lists, tokenized with the wrapped tokenizer,
    and every sub-word receives the tag of the character it starts on.

    Args:
        tokenizer: callable tokenizer returning ``offset_mapping`` (a fast
            Hugging Face tokenizer in this notebook).
        label2id: mapping from tag string to integer class id; must contain "O".
        max_length: window length used when encoding training examples.
        stride: ``stride`` forwarded to the tokenizer for training windows.
    """

    def __init__(
        self,
        tokenizer: PreTrainedTokenizerBase,
        label2id: dict,
        max_length: int,
        stride: int,
    ) -> None:
        self.tokenizer = tokenizer
        self.label2id = label2id
        self.max_length = max_length
        self.stride = stride

    def encode_for_train(self, batch: dict) -> dict:
        """Batched map function: encode every example and concatenate the windows.

        ``batch`` is a dict of columns (lists). One input document may yield
        several windows, so the returned columns can be longer than the input.
        """
        examples = [{k: v[i] for k, v in batch.items()} for i in range(len(batch["tokens"]))]
        tokenized = [self._encode_train_example(example) for example in examples]
        return {k: reduce(lambda i, j: i + j, [t[k] for t in tokenized]) for k in tokenized[0].keys()}

    def _encode_train_example(self, example: dict) -> dict:
        """Tokenize one document into overlapping windows and attach ``labels``."""
        text, char_labels, _ = self._rebuild_text(example)

        # Use the instance's tokenizer (not a notebook global) so the encoder
        # behaves the same regardless of what else is defined in the kernel.
        tokenized = self.tokenizer(
            text,
            max_length=self.max_length,
            return_overflowing_tokens=True,
            stride=self.stride,
            return_offsets_mapping=True,
            truncation=True,
        )

        # One label list per window (long documents are split into several).
        tokenized["labels"] = [
            self._align_labels(text, char_labels, window_offsets)
            for window_offsets in tokenized.offset_mapping
        ]
        return tokenized

    def encode_for_eval(self, example: dict) -> dict:
        """Tokenize one whole document (no truncation) for evaluation.

        Returns the tokenizer fields plus ``labels`` (per sub-word label ids),
        ``length`` (number of input ids) and ``token_map`` (for every character
        of the rebuilt text, the index of the word it belongs to, or -1 for an
        inserted trailing space).
        """
        text, char_labels, token_map = self._rebuild_text(example)

        tokenized = self.tokenizer(
            text,
            return_offsets_mapping=True,
            truncation=False,
        )

        token_labels = self._align_labels(text, char_labels, tokenized.offset_mapping)
        length = len(tokenized.input_ids)

        return {**tokenized, "labels": token_labels, "length": length, "token_map": token_map}

    @staticmethod
    def _rebuild_text(example: dict) -> tuple:
        """Rebuild the text from words; return (text, per-char labels, per-char word index)."""
        pieces, char_labels, token_map = [], [], []

        for idx, (word, label, has_space) in enumerate(
            zip(example["tokens"], example["provided_labels"], example["trailing_whitespace"])
        ):
            pieces.append(word)
            char_labels.extend([label] * len(word))
            token_map.extend([idx] * len(word))

            if has_space:
                # The separating space belongs to no word and is always "O".
                pieces.append(" ")
                char_labels.append("O")
                token_map.append(-1)

        return "".join(pieces), char_labels, token_map

    def _align_labels(self, text: str, char_labels: list, offsets) -> list:
        """Map (start, end) character offsets of sub-words to label ids."""
        aligned = []
        for start_idx, end_idx in offsets:
            # A (0, 0) offset is how the original code recognises special
            # tokens such as [CLS]; they are labelled "O".
            if start_idx == 0 and end_idx == 0:
                aligned.append(self.label2id["O"])
                continue
            # A sub-word may begin with the preceding whitespace; its label is
            # taken from the first character after it.
            if text[start_idx].isspace():
                start_idx += 1
            aligned.append(self.label2id[char_labels[start_idx]])
        return aligned

## Augmentation

In [20]:
class RandomCutOut:
    """On-the-fly augmentation that swaps random input ids for the mask token.

    Each position of each sequence (no position is exempt) is replaced
    independently with probability ``mask_p``; only ``input_ids`` is modified.
    """

    def __init__(self, mask_p: float, mask_token_id: int) -> None:
        self.mask_token_id = mask_token_id
        self.mask_p = mask_p

    def __call__(self, batch: dict) -> dict:
        """Return ``batch`` with ``input_ids`` masked; untouched when ``mask_p`` is 0."""
        if self.mask_p != 0:
            batch["input_ids"] = [self._cut_out(ids) for ids in batch["input_ids"]]
        return batch

    def _cut_out(self, token_ids) -> list:
        """Mask one sequence, drawing a single Bernoulli vector of its length."""
        drop = np.random.binomial(1, p=self.mask_p, size=(len(token_ids),))
        return np.where(drop, self.mask_token_id, token_ids).tolist()

## Instantiate the dataset

In [21]:
# Tokenizer of the checkpoint being fine-tuned.
tokenizer = DebertaV2TokenizerFast.from_pretrained(TRAINING_MODEL_PATH)

# Training-time encoder (windowing) and on-the-fly masking augmentation.
encoder = CustomTokenizer(
    tokenizer=tokenizer,
    label2id=label2id,
    max_length=MAX_LENGTH,
    stride=TRAIN_STRIDE,
)
train_augmentation = RandomCutOut(
    mask_p=MASK_P,
    mask_token_id=tokenizer.mask_token_id,
)

# Two splits with identical columns: the competition data and the optional
# external data. Document ids are stored as strings; the word-level tags are
# kept under "provided_labels".
ds = DatasetDict()

for key, data in (("original", original_data), ("extra", extra_data)):
    ds[key] = Dataset.from_dict(
        {
            "full_text": [doc["full_text"] for doc in data],
            "document": [str(doc["document"]) for doc in data],
            "tokens": [doc["tokens"] for doc in data],
            "trailing_whitespace": [doc["trailing_whitespace"] for doc in data],
            "provided_labels": [doc["labels"] for doc in data],
        }
    )

Downloading tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## Metrics

In [22]:
class MetricsComputer:
    """seqeval-based precision / recall / F-beta for ``Trainer(compute_metrics=...)``.

    Args:
        all_labels: tag strings indexed by class id.
        beta: weight of recall relative to precision in the F-beta score.
    """

    def __init__(self, all_labels: list[str], beta: float = 5.0) -> None:
        self.all_labels = all_labels
        self.beta = beta

    def __call__(self, preds: EvalPrediction) -> dict[str, float]:
        """Score ``(predictions, labels)``; positions labelled -100 are ignored.

        Returns a dict with ``recall``, ``precision`` and ``f5`` (the F-beta
        score; the key name is kept for the best-model metric lookup).
        """
        predictions, labels = preds
        predictions = np.argmax(predictions, axis=2)

        # Keep only positions with a real label, converting ids to tag strings.
        true_predictions, true_labels = [], []
        for prediction, label in zip(predictions, labels):
            kept = [(p, l) for p, l in zip(prediction, label) if l != -100]
            true_predictions.append([self.all_labels[p] for p, _ in kept])
            true_labels.append([self.all_labels[l] for _, l in kept])

        recall = recall_score(true_labels, true_predictions)
        precision = precision_score(true_labels, true_predictions)

        # Early in training nothing may be predicted correctly, making both
        # precision and recall 0; report 0 instead of dividing by zero.
        beta_sq = self.beta ** 2
        denominator = beta_sq * precision + recall
        f5_score = (1 + beta_sq) * recall * precision / denominator if denominator > 0 else 0.0

        return {
            'recall': recall,
            'precision': precision,
            'f5': f5_score,
        }

def find_span(target: list[str], document: list[str]) -> list[list[int]]:
    """Locate every non-overlapping occurrence of ``target`` inside ``document``.

    The document is scanned left to right; after a match the scan resumes
    right behind it. A failed partial match only advances the scan by one
    token, so a match that begins inside the failed attempt is still found
    (e.g. ``["a", "b"]`` in ``["a", "a", "b"]``).

    Args:
        target: token sequence to look for.
        document: token sequence to search in.

    Returns:
        One list of consecutive document indices per occurrence; an empty
        list when ``target`` is empty or does not occur.
    """
    wanted = list(target)
    width = len(wanted)
    spans: list[list[int]] = []
    if width == 0:
        return spans

    start = 0
    last_start = len(document) - width
    while start <= last_start:
        if list(document[start:start + width]) == wanted:
            spans.append(list(range(start, start + width)))
            start += width
        else:
            start += 1

    return spans


class PRFScore:
    """Tally of true positives, false positives and false negatives with derived scores.

    All ratios add a tiny epsilon (1e-100) to the denominator, so an empty
    tally yields 0.0 instead of raising.
    """

    def __init__(self, *, tp: int = 0, fp: int = 0, fn: int = 0) -> None:
        self.tp, self.fp, self.fn = tp, fp, fn

    def __len__(self) -> int:
        """Total number of counted events."""
        return self.tp + self.fp + self.fn

    def __iadd__(self, other):  # in-place add
        for field in ("tp", "fp", "fn"):
            setattr(self, field, getattr(self, field) + getattr(other, field))
        return self

    def __add__(self, other):
        combined = PRFScore(tp=self.tp, fp=self.fp, fn=self.fn)
        combined += other
        return combined

    def score_set(self, cand: set, gold: set) -> None:
        """Accumulate counts from a candidate set compared against a gold set."""
        hits = cand.intersection(gold)
        spurious = cand - gold
        missed = gold - cand
        self.tp += len(hits)
        self.fp += len(spurious)
        self.fn += len(missed)

    @property
    def precision(self) -> float:
        denominator = self.tp + self.fp + 1e-100
        return self.tp / denominator

    @property
    def recall(self) -> float:
        denominator = self.tp + self.fn + 1e-100
        return self.tp / denominator

    @property
    def f1(self) -> float:
        p, r = self.precision, self.recall
        return 2 * ((p * r) / (p + r + 1e-100))

    @property
    def f5(self) -> float:
        """F-beta with beta = 5 (recall weighted far above precision)."""
        beta_sq = 5 ** 2
        p, r = self.precision, self.recall
        return (1 + beta_sq) * p * r / (beta_sq * p + r + 1e-100)

    def to_dict(self) -> dict[str, float]:
        return dict(p=self.precision, r=self.recall, f5=self.f5)


class MetricsComputerV2:
    """Word-level precision / recall / F5 (the "LB metric") for whole documents.

    Model predictions are made per sub-word; this class maps them back to the
    original word tokens of each evaluation document and scores the resulting
    (document, token, label) triples against the provided labels. Email and
    phone-number predictions do not come from the model: they are found once,
    at construction time, with regular expressions.

    Args:
        eval_ds: evaluation dataset produced by ``CustomTokenizer.encode_for_eval``
            (needs ``tokens``, ``provided_labels``, ``full_text``, ``document``,
            ``token_map``, ``offset_mapping`` and a ``labels`` column, which is
            dropped and replaced by ``provided_labels``).
        label2id: mapping from tag string to class id; must contain "O".
        conf_thresh: threshold on the "O" score below which the best non-"O"
            class is predicted instead.
    """

    # spaCy English pipeline; only its tokenizer is used (phone-number matching).
    nlp = English()

    def __init__(self, eval_ds: Dataset, label2id: dict, conf_thresh: float = 0.9) -> None:
        self.ds = eval_ds.remove_columns("labels").rename_columns({"provided_labels": "labels"})
        self.gt_df = self.create_gt_df(self.ds)
        self.label2id = label2id
        # Derived here so the class does not depend on a notebook-level global.
        self.id2label = {idx: label for label, idx in label2id.items()}
        self.confth = conf_thresh
        self._search_gt()

    def __call__(self, eval_preds: EvalPrediction) -> dict:
        pred_df = self.create_pred_df(eval_preds.predictions)
        return self.compute_metrics_from_df(self.gt_df, pred_df)

    def _search_gt(self) -> None:
        """Collect regex-based email and phone-number predictions for every document."""
        email_regex = re.compile(r'[\w.+-]+@[\w-]+\.[\w.-]+')
        phone_num_regex = re.compile(r"(\(\d{3}\)\d{3}\-\d{4}\w*|\d{3}\.\d{3}\.\d{4})\s")
        self.emails = []
        self.phone_nums = []

        for _data in self.ds:
            # email: a word token that is an email address as a whole
            for token_idx, token in enumerate(_data["tokens"]):
                if email_regex.fullmatch(token) is not None:
                    self.emails.append(
                        {"document": _data["document"], "token": token_idx, "label": "B-EMAIL", "token_str": token}
                    )

            # phone number: matched on the raw text, then located in the word
            # tokens. Every distinct match is processed (previously only the
            # spans of the last match in a document were kept).
            for match in dict.fromkeys(phone_num_regex.findall(_data["full_text"])):
                target = [t.text for t in self.nlp.tokenizer(match)]
                for matched_span in find_span(target, _data["tokens"]):
                    for intermediate, token_idx in enumerate(matched_span):
                        prefix = "I" if intermediate else "B"
                        self.phone_nums.append(
                            {
                                "document": _data["document"],
                                "token": token_idx,
                                "label": f"{prefix}-PHONE_NUM",
                                "token_str": _data["tokens"][token_idx],
                            }
                        )

    @staticmethod
    def create_gt_df(ds: Dataset):
        """Build the ground-truth frame: one row per word whose label is not "O"."""
        gt = []
        for row in ds:
            for token_idx, (token, label) in enumerate(zip(row["tokens"], row["labels"])):
                if label == "O":
                    continue
                gt.append(
                    {"document": row["document"], "token": token_idx, "label": label, "token_str": token}
                )
        gt_df = pd.DataFrame(gt)
        gt_df["row_id"] = gt_df.index

        return gt_df

    def create_pred_df(self, prediction: np.ndarray) -> pd.DataFrame:
        """Turn per-sub-word scores into a frame of word-level predictions.

        ``prediction`` is indexed as (document, position, class); the slice
        up to the "O" index is treated as "all non-O classes", which relies on
        "O" being the last class.
        """
        o_index = self.label2id["O"]
        preds = prediction.argmax(-1)
        preds_without_o = prediction[:, :, :o_index].argmax(-1)
        o_preds = prediction[:, :, o_index]
        # NOTE(review): the prediction step in this notebook returns raw
        # logits, so this compares a logit (not a probability) with the
        # confidence threshold -- confirm whether a softmax is intended first.
        preds_final = np.where(o_preds < self.confth, preds_without_o, preds)

        pairs = set()
        processed = []

        # Iterate over documents
        for p_doc, token_map, offsets, tokens, doc in zip(
            preds_final, self.ds["token_map"], self.ds["offset_mapping"], self.ds["tokens"], self.ds["document"]
        ):
            # Iterate over the sub-words of the document
            for p_token, (start_idx, end_idx) in zip(p_doc, offsets):
                label_pred = self.id2label[p_token]

                if start_idx + end_idx == 0:
                    # [CLS] token i.e. BOS
                    continue

                # sub-word starting on an inserted space: move to the next character
                if token_map[start_idx] == -1:
                    start_idx += 1

                # ignore "\n\n"
                while start_idx < len(token_map) and tokens[token_map[start_idx]].isspace():
                    start_idx += 1

                if start_idx >= len(token_map):
                    break

                token_id = token_map[start_idx]
                pair = (doc, token_id)

                # Drop "O" as well as the model's email and phone-number
                # predictions (both B- and I-PHONE_NUM): those two types are
                # supplied by the regex matches instead.
                if label_pred in ("O", "B-EMAIL", "B-PHONE_NUM", "I-PHONE_NUM") or token_id == -1:
                    continue

                # keep only the first prediction for each word
                if pair in pairs:
                    continue

                processed.append(
                    {"document": doc, "token": token_id, "label": label_pred, "token_str": tokens[token_id]}
                )
                pairs.add(pair)

        pred_df = pd.DataFrame(processed + self.emails + self.phone_nums)
        pred_df["row_id"] = list(range(len(pred_df)))

        return pred_df

    def compute_metrics_from_df(self, gt_df, pred_df):
        """
        Compute the LB metric (lb) and other auxiliary metrics

        Returns overall ``precision`` / ``recall`` / ``f5`` plus ``p-``,
        ``r-`` and ``f5-`` entries for every entity type in ``label2id``.
        """

        references = {(row.document, row.token, row.label) for row in gt_df.itertuples()}
        predictions = {(row.document, row.token, row.label) for row in pred_df.itertuples()}

        score_per_type = defaultdict(PRFScore)

        for ex in predictions:
            pred_type = ex[-1]  # (document, token, label)
            if pred_type != 'O':
                pred_type = pred_type[2:]  # avoid B- and I- prefix

            if ex in references:
                score_per_type[pred_type].tp += 1
                references.remove(ex)
            else:
                score_per_type[pred_type].fp += 1

        # whatever is left in the references was never predicted
        for doc, tok, ref_type in references:
            if ref_type != 'O':
                ref_type = ref_type[2:]  # avoid B- and I- prefix
            score_per_type[ref_type].fn += 1

        totals = PRFScore()

        for prf in score_per_type.values():
            totals += prf

        # sorted so the reported columns come out in a stable order
        entity_types = sorted({l[2:] for l in self.label2id.keys() if l != 'O'})

        return {
            "precision": totals.precision,
            "recall": totals.recall,
            "f5": totals.f5,
            **{
                f"{v_k}-{k}": v_v
                for k in entity_types
                for v_k, v_v in score_per_type[k].to_dict().items()
            },
        }

## Model

In [23]:
class ModelInit:
    """Callable for ``Trainer(model_init=...)`` that hands out a freshly reset model.

    The checkpoint is loaded once; a deep copy of its initial state dict is
    kept so that every call can restore those weights. Each call returns the
    same model object.
    """

    def __init__(
        self,
        checkpoint: str,
        id2label: dict,
        label2id: dict,
        freeze_embedding: bool,
        freeze_layers: int,
    ) -> None:
        model = DebertaV2ForTokenClassification.from_pretrained(
            checkpoint,
            num_labels=len(id2label),
            id2label=id2label,
            label2id=label2id,
            ignore_mismatched_sizes=True,
        )

        # Embeddings are trainable unless freezing was requested.
        train_embeddings = not freeze_embedding
        for param in model.deberta.embeddings.parameters():
            param.requires_grad = train_embeddings

        # Freeze the first `freeze_layers` encoder layers.
        frozen_params = [
            param
            for layer in model.deberta.encoder.layer[:freeze_layers]
            for param in layer.parameters()
        ]
        for param in frozen_params:
            param.requires_grad = False

        self.model = model
        # Snapshot of the initial weights, restored on every call.
        self.weight = copy.deepcopy(model.state_dict())

    def __call__(self) -> DebertaV2ForTokenClassification:
        """Restore the snapshot into the model and return it."""
        self.model.load_state_dict(self.weight)
        return self.model

# Single factory shared by every Trainer created below.
model_init = ModelInit(
    checkpoint=TRAINING_MODEL_PATH,
    id2label=id2label,
    label2id=label2id,
    freeze_embedding=FREEZE_EMBEDDING,
    freeze_layers=FREEZE_LAYERS,
)

Downloading pytorch_model.bin:   0%|          | 0.00/354M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2ForTokenClassification: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.dense.bias', 'deberta.embeddings.position_embeddings.weight', 'mask_predictions.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2ForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a Be

## Split

In [24]:
# Split by document id: a row belongs to fold (document id mod N_SPLITS).
_fold_of_row = [int(doc_id) % N_SPLITS for doc_id in ds["original"]["document"]]

# One (train row indices, eval row indices) pair per held-out fold.
folds = []
for held_out in range(N_SPLITS):
    train_rows = np.array([row for row, fold in enumerate(_fold_of_row) if fold != held_out])
    eval_rows = np.array([row for row, fold in enumerate(_fold_of_row) if fold == held_out])
    folds.append((train_rows, eval_rows))

# Rows removed from the training data: when filtering is on, only the first
# third of the documents without any PII label is kept.
exclude_indices = []
if FILTER_ORIGINAL:
    negative_idxs = [
        row
        for row, doc_labels in enumerate(ds["original"]["provided_labels"])
        if not any(np.array(doc_labels) != "O")
    ]
    first_excluded = len(negative_idxs) // 3
    exclude_indices = negative_idxs[first_excluded:]

## Trainer with custom loss and stride evaluation

In [25]:
class FocalLoss(nn.Module):
    """Focal loss for token classification, built on per-token cross-entropy.

    ``forward`` takes logits indexed (batch, sequence, class) and integer
    targets indexed (batch, sequence). Targets equal to ``ignore_index``
    contribute nothing. ``reduction`` is "mean" or "sum" over the valid
    tokens; any other value returns the per-token loss with ignored
    positions set to 0.
    """

    def __init__(
        self, weight = None, gamma = 2., reduction = "mean", ignore_index: int = -100
    ) -> None:
        super().__init__()
        self.gamma = gamma
        self.reduction = reduction
        self.ignore_index = ignore_index
        # Unreduced cross-entropy; the focal factor is applied per token.
        self.ce = nn.CrossEntropyLoss(weight=weight, reduction="none", ignore_index=ignore_index)

    def forward(self, logits, target):
        # CrossEntropyLoss wants the class axis second: b, seq, c -> b, c, seq
        per_token_ce = self.ce(logits.permute(0, 2, 1), target)
        # exp(-CE) is used as the probability of the true class
        p_true = torch.exp(-per_token_ce)
        focal = (1 - p_true) ** self.gamma * per_token_ce

        valid = target != self.ignore_index
        if self.reduction == "mean":
            return torch.mean(focal[valid])
        if self.reduction == "sum":
            return torch.sum(focal[valid])
        return torch.where(valid, focal, 0)

In [26]:
class SlidingSplitter:
    """reproduce return_overflowing_tokens behavior given input_ids and labels

    A single tokenized document (row 0 of the batch) is cut into windows of at
    most ``max_length`` ids, special tokens included; consecutive windows
    overlap by ``stride`` content tokens.

    Args:
        tokenizer: provides ``bos_token_id`` and ``eos_token_id``.
        max_length: maximum window length including the two special tokens.
        stride: number of content tokens shared by consecutive windows.
        ignore_index: label value placed on the two special-token positions.

    Raises:
        ValueError: if ``max_length - stride - 2`` is not positive, in which
            case the window could never advance.
    """
    def __init__(
        self,
        tokenizer: PreTrainedTokenizerBase,
        max_length: int,
        stride: int,
        ignore_index: int = -100,
    ) -> None:
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.stride = stride
        # how far the window start moves between consecutive chunks
        self.step = self.max_length - self.stride - 2
        if self.step <= 0:
            raise ValueError(
                f"max_length - stride - 2 must be positive, got {self.step} "
                f"(max_length={max_length}, stride={stride})"
            )
        self.ignore_index = ignore_index

    def __call__(self, inputs: dict) -> list[dict]:
        """Split ``inputs`` into a list of single-row model inputs.

        Only row 0 of ``input_ids`` (and ``labels``, when present) is read,
        and it is expected to contain exactly one EOS id. Everything between
        the leading special token and that EOS id is windowed; each chunk is
        re-wrapped with BOS/EOS and gets all-zero ``token_type_ids`` and an
        all-one ``attention_mask``.
        """
        input_ids = inputs["input_ids"]
        labels = inputs.get("labels")
        eos_idx = (input_ids[0] == self.tokenizer.eos_token_id).nonzero().item()
        # strip the leading special token, the EOS token and any padding after it
        input_ids = input_ids[0, 1:eos_idx]
        if labels is not None:
            labels = labels[0, 1:eos_idx]

        window = self.max_length - 2  # content tokens per chunk
        start_idx = 0
        chunked = []
        while True:
            _input = dict()
            _input_ids = self.pad_input_ids(input_ids[start_idx:start_idx + window]).unsqueeze(0)
            _input["input_ids"] = _input_ids
            _input["token_type_ids"] = torch.zeros_like(_input_ids)
            _input["attention_mask"] = torch.ones_like(_input_ids)
            if labels is not None:
                _labels = self.pad_labels(labels[start_idx:start_idx + window])
                _input["labels"] = _labels.unsqueeze(0)
            chunked.append(_input)
            # stop once the current window reaches the end of the document
            if start_idx >= len(input_ids) - window:
                break
            start_idx += self.step

        return chunked

    def pad_input_ids(self, input_ids: Tensor) -> Tensor:
        """pad first and last with [BOS] & [SEP]"""
        bos = torch.tensor([self.tokenizer.bos_token_id], dtype=torch.long, device=input_ids.device)
        eos = torch.tensor([self.tokenizer.eos_token_id], dtype=torch.long, device=input_ids.device)
        return torch.concat([bos, input_ids, eos], dim=0)

    def pad_labels(self, labels: Tensor) -> Tensor:
        """pad first and last with self.ignore_index"""
        ignore_index = torch.tensor([self.ignore_index], dtype=torch.long, device=labels.device)
        return torch.concat([ignore_index, labels, ignore_index], dim=0)

In [27]:
class CustomTrainer(Trainer):
    """Trainer with a focal training loss and sliding-window document evaluation."""

    def compute_loss(self, model, inputs, return_outputs=False):
        """Replace the model's own loss with focal loss (exponent ``GAMMA``).

        The parent implementation is called only for its forward pass; the
        loss it computes is discarded and overwritten in ``outputs``.
        """
        _, outputs = super().compute_loss(model, inputs, True)
        labels = inputs["labels"]
        loss = FocalLoss(gamma=GAMMA)(logits=outputs["logits"], target=labels)
        outputs["loss"] = loss
        return (loss, outputs) if return_outputs else loss

    def prediction_step(self, model, inputs, prediction_loss_only, ignore_keys=None):
        """Predict one whole document by stitching overlapping windows together.

        The batch is expected to hold a single document (the splitter reads
        row 0 only). The document is cut into windows of ``MAX_LENGTH`` with
        ``EVAL_STRIDE`` tokens of overlap; logits in an overlap are the mean
        of the two windows covering it. The stitched logits are zero-padded
        back to the length of the incoming ``input_ids``.

        Returns:
            ``(loss, logits, labels)`` where ``loss`` is the mean of the
            per-window losses (``None`` without labels) and ``logits`` has a
            leading batch axis of size 1.
        """
        inputs = self._prepare_inputs(inputs)
        has_labels = "labels" in inputs
        splitter = SlidingSplitter(self.tokenizer, max_length=MAX_LENGTH, stride=EVAL_STRIDE)
        chunked_inputs = splitter(inputs)
        last = len(chunked_inputs) - 1
        loss, logits = [], []
        prev_tail = None
        for i, _inputs in enumerate(chunked_inputs):
            if has_labels:
                with self.compute_loss_context_manager(), torch.no_grad():
                    _loss, out = self.compute_loss(model, _inputs, return_outputs=True)
            else:
                _loss = None
                # Both context managers are needed here as well (this used to
                # be a typo that raised AttributeError on unlabeled data).
                with self.compute_loss_context_manager(), torch.no_grad():
                    out = model(**_inputs)

            _logits = out.logits[0, 1:-1]  # remove [CLS] & [SEP]
            # head: overlap with the previous window; tail: overlap with the
            # next one. The last window has no tail, everything after the
            # head is emitted as-is.
            tail_start = _logits.size(0) - splitter.stride if i != last else _logits.size(0)
            head, middle, tail = torch.tensor_split(_logits, (splitter.stride, tail_start))
            if prev_tail is not None:
                # overlap process logic: average the two views of the same tokens
                head = torch.stack([prev_tail, head], dim=0).mean(dim=0)
            logits.append(head)
            logits.append(middle)
            prev_tail = tail
            loss.append(_loss)
        loss = torch.stack(loss, dim=0).mean(0) if has_labels else None
        logits = torch.vstack(logits)
        # Re-insert rows for the two stripped special tokens and for the
        # padding positions so the logits line up with the incoming input_ids.
        pad_len = int((inputs["input_ids"] == self.tokenizer.pad_token_id).sum())
        logits = torch.nn.functional.pad(logits, (0, 0, 1, 1 + pad_len), mode="constant", value=0)
        logits = logits.unsqueeze(0)
        return loss, logits, inputs.get("labels")

## Train

#### CV

In [None]:
# Set for O(1) membership tests while filtering training rows.
_excluded_rows = set(exclude_indices)

for fold_idx, (train_idx, eval_idx) in enumerate(folds):
    args.run_name = f"fold-{fold_idx}"
    args.output_dir = os.path.join(OUTPUT_DIR, f"fold_{fold_idx}")
    # Resume support: a fold with a saved eval result is already finished.
    if Path(args.output_dir).joinpath("eval_result.json").exists():
        continue

    # Training rows of this fold minus the filtered-out documents, plus the extra data.
    original_ds = ds["original"].select([i for i in train_idx if i not in _excluded_rows])
    train_ds = concatenate_datasets([original_ds, ds["extra"]])
    train_ds = train_ds.map(
        encoder.encode_for_train, num_proc=os.cpu_count(), batched=True, remove_columns=train_ds.column_names
    )
    train_ds.set_transform(train_augmentation)

    # Held-out documents are encoded whole (no truncation) for evaluation.
    eval_ds = ds["original"].select(eval_idx)
    eval_ds = eval_ds.map(encoder.encode_for_eval, num_proc=os.cpu_count())

    trainer = CustomTrainer(
        args=args,
        model_init=model_init,
        train_dataset=train_ds,
        eval_dataset=eval_ds,
        tokenizer=tokenizer,
        # Pass the configured threshold explicitly instead of silently
        # relying on the class default.
        compute_metrics=MetricsComputerV2(eval_ds=eval_ds, label2id=label2id, conf_thresh=CONF_THRESH),
        data_collator=DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16),
    )
    trainer.train()
    eval_res = trainer.evaluate(eval_dataset=eval_ds)
    with open(os.path.join(args.output_dir, "eval_result.json"), "w") as f:
        json.dump(eval_res, f)

    # Free memory and close the W&B run before the next fold.
    del trainer
    gc.collect()
    torch.cuda.empty_cache()
    wandb.finish()

         

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

         

#0:   0%|          | 0/213 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/213 [00:00<?, ?ex/s]

 

#2:   0%|          | 0/212 [00:00<?, ?ex/s]

 

#3:   0%|          | 0/212 [00:00<?, ?ex/s]

 

#4:   0%|          | 0/212 [00:00<?, ?ex/s]

 

#5:   0%|          | 0/212 [00:00<?, ?ex/s]

 

#6:   0%|          | 0/212 [00:00<?, ?ex/s]

 

#7:   0%|          | 0/212 [00:00<?, ?ex/s]

Using cuda_amp half precision backend
The following columns in the training set don't have a corresponding argument in `DebertaV2ForTokenClassification.forward` and have been ignored: offset_mapping, overflow_to_sample_mapping. If offset_mapping, overflow_to_sample_mapping are not expected by `DebertaV2ForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 2494
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 780
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33memiz6413[0m. Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016669082160418233, max=1.0…

Step,Training Loss,Validation Loss,Precision,Recall,F5,P-username,R-username,F5-username,P-id Num,R-id Num,F5-id Num,P-url Personal,R-url Personal,F5-url Personal,P-email,R-email,F5-email,P-street Address,R-street Address,F5-street Address,P-name Student,R-name Student,F5-name Student,P-phone Num,R-phone Num,F5-phone Num
50,0.0261,0.012056,0.115942,0.012158,0.012592,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100,0.017,0.005948,1.0,0.012158,0.012638,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
150,0.0116,0.004733,0.31769,0.668693,0.641435,0.0,0.0,0.0,1.0,0.076923,0.079755,0.15528,1.0,0.826972,1.0,1.0,1.0,0.0,0.0,0.0,0.333608,0.677258,0.651448,0.0,0.0,0.0
200,0.0051,0.002388,0.60387,0.901216,0.884465,0.0,0.0,0.0,0.947368,0.692308,0.699552,0.3125,1.0,0.921986,1.0,1.0,1.0,0.0,0.0,0.0,0.619429,0.906355,0.89049,0.0,0.0,0.0
250,0.0039,0.001652,0.603647,0.955927,0.934942,0.0,0.0,0.0,0.793103,0.884615,0.880707,0.735294,1.0,0.986343,1.0,1.0,1.0,0.0,0.0,0.0,0.590113,0.958194,0.935745,0.0,0.0,0.0
300,0.0013,0.001174,0.801351,0.901216,0.896917,0.0,0.0,0.0,0.954545,0.807692,0.8125,0.952381,0.8,0.804954,1.0,1.0,1.0,0.0,0.0,0.0,0.78955,0.909699,0.904406,0.0,0.0,0.0


The following columns in the evaluation set don't have a corresponding argument in `DebertaV2ForTokenClassification.forward` and have been ignored: tokens, full_text, length, token_map, provided_labels, trailing_whitespace, document, offset_mapping. If tokens, full_text, length, token_map, provided_labels, trailing_whitespace, document, offset_mapping are not expected by `DebertaV2ForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1698
  Batch size = 1
Saving model checkpoint to output2/fold_0/checkpoint-50
Configuration saved in output2/fold_0/checkpoint-50/config.json
Model weights saved in output2/fold_0/checkpoint-50/pytorch_model.bin
tokenizer config file saved in output2/fold_0/checkpoint-50/tokenizer_config.json
Special tokens file saved in output2/fold_0/checkpoint-50/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `DebertaV2ForTokenClassification.forwar

#### Train with full data

In [None]:
# All original rows minus the filtered-out documents, plus the extra data.
_excluded_rows = set(exclude_indices)  # set: O(1) membership tests
original_ds = ds["original"].select([i for i in range(len(ds["original"])) if i not in _excluded_rows])
train_ds = concatenate_datasets([original_ds, ds["extra"]])
train_ds = train_ds.map(
    encoder.encode_for_train, num_proc=os.cpu_count(), batched=True, remove_columns=train_ds.column_names
)
train_ds.set_transform(train_augmentation)

# No held-out data in this run: switch evaluation and checkpointing off.
# NOTE: this mutates the shared `args`; re-create it before re-running the CV cell.
args.evaluation_strategy = "no"
args.save_strategy = "no"
args.run_name = "all_data"
trainer = CustomTrainer(
    args=args,
    model_init=model_init,
    train_dataset=train_ds,
    tokenizer=tokenizer,
    data_collator=DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16),
)
trainer.train()
trainer.save_model(os.path.join(OUTPUT_DIR, "all"))

# log cv result
results = dict()
# sorted: folds are reported in a stable order
for res_json_path in sorted(Path(OUTPUT_DIR).glob("fold*/eval_result.json")):
    fold = res_json_path.parent.name.split("_")[-1]
    with open(res_json_path, "r") as f:
        res = json.load(f)
    results[fold] = {k.replace("eval_", ""): v for k, v in res.items()}

# Only build the table when at least one fold produced a result; the metric
# names are taken from the first available fold rather than assuming fold "0".
if results:
    metric_names = list(next(iter(results.values())).keys())
    cv_mean = {key: np.mean([r[key] for r in results.values()]) for key in metric_names}
    results["cv"] = cv_mean
    table = wandb.Table(columns=["fold"] + metric_names)
    for fold_name, fold_res in results.items():
        table.add_data(fold_name, *[fold_res[c] for c in table.columns if c != "fold"])
    wandb.log({"eval_result": table})

wandb.finish()