In [2]:
!pip install -q seqeval evaluate

[0m

In [1]:
%env WANDB_PROJECT=PII

env: WANDB_PROJECT=PII


In [4]:
import json
import copy
import gc
import os
import re
from collections import defaultdict
from pathlib import Path

import torch
import numpy as np
import pandas as pd
from spacy.lang.en import English
from transformers.tokenization_utils import PreTrainedTokenizerBase
from transformers.models.deberta_v2 import DebertaV2ForTokenClassification, DebertaV2TokenizerFast
from transformers.trainer import Trainer
from transformers.training_args import TrainingArguments
from transformers.trainer_utils import EvalPrediction, PredictionOutput
from transformers.data.data_collator import DataCollatorForTokenClassification
from sklearn.model_selection import KFold
from datasets import Dataset, DatasetDict, concatenate_datasets
from seqeval.metrics import recall_score, precision_score
import wandb

In [5]:
wandb.login(key="eff994fe72307679c21248b6e7859e26960b8db7")

[34m[1mwandb[0m: Currently logged in as: [33memiz6413[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

## Config & Parameters

In [6]:
DATA_DIR = Path("../dataset/")
OUTPUT_DIR = "output2"
Path(OUTPUT_DIR).mkdir(exist_ok=True)

In [7]:
TRAINING_MODEL_PATH = "Gladiator/microsoft-deberta-v3-large_ner_conll2003"
TRAIN_MAX_LENGTH = 1024
EVAL_MAX_LENGTH = 3072
STRIDE = 256
#EVAL_MAX_LENGTH = 3072
CONF_THRESH = 0.9
LR = 2e-5  # 1.5e-5 ~ 3e-5 for base # 5e-6 ~ 1e-5 for large
LR_SCHEDULER_TYPE = "cosine_with_restarts"
NUM_EPOCHS = 2
BATCH_SIZE = 4
GRAD_ACCUMULATION_STEPS = 8  # total_effective_batch_size = 32
WARMUP_RATIO = 0.1
WEIGHT_DECAY = 0.01
AMP = True
FREEZE_EMBEDDING = False
FREEZE_LAYERS = 0
# training data
N_SPLITS = 4
FILTER_ORIGINAL = False
EXTRA_1 = False
EXTRA_2 = False
EXTRA_3 = False

In [8]:
args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    fp16=AMP,
    learning_rate=LR,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUMULATION_STEPS,
    report_to="wandb",
    evaluation_strategy="steps",
    eval_steps=50,
    eval_delay=100,
    save_strategy="steps",
    save_steps=50,
    save_total_limit=1,
    logging_steps=10,
    metric_for_best_model="f5",
    greater_is_better=True,
    load_best_model_at_end=True,
    lr_scheduler_type=LR_SCHEDULER_TYPE,
    warmup_ratio=WARMUP_RATIO,
    weight_decay=WEIGHT_DECAY,
)

## Dataset Preparation

In [9]:
with DATA_DIR.joinpath("train.json").open("r") as f:
    original_data = json.load(f)

# downsampling of negative examples
p=[] # positive samples (contain relevant labels)
n=[] # negative samples (presumably contain entities that are possibly wrongly classified as entity)

for d in original_data:
    if any(np.array(d["labels"]) != "O"):
        p.append(d)
    else:
        n.append(d)

print("original datapoints: ", len(original_data))
print("positive datapoints:", len(p))
print("negative datapoints", len(n))
if FILTER_ORIGINAL:
    original_data = p + n[:len(n)//3]

extra_data = []  #

# Moth
if EXTRA_1:
    with DATA_DIR.joinpath("pii_dataset_fixed.json").open("r") as f:
        external_1 = json.load(f)
    print("external_1 datapoints: ", len(external_1))
    extra_data.extend(external_1)

# PJMathmatician
if EXTRA_2:
    with DATA_DIR.joinpath("moredata_dataset_fixed.json").open("r") as f:
        external_2 = json.load(f)
    print("external_2 datapoints: ", len(external_2))
    extra_data.extend(external_2)

# Nicholas
if EXTRA_3:
    with DATA_DIR.joinpath("mixtral-8x7b-v1.json").open("r") as f:
        external_3 = json.load(f)
    print("external_3 datapoints: ", len(external_3))
    extra_data.extend(external_3)
    
print(f"len(extra_data): {len(extra_data)}")

original datapoints:  6807
positive datapoints: 945
negative datapoints 5862
len(extra_data): 0


In [10]:
all_labels = [
    'B-EMAIL', 'B-ID_NUM', 'B-NAME_STUDENT', 'B-PHONE_NUM', 'B-STREET_ADDRESS', 'B-URL_PERSONAL', 'B-USERNAME', 'I-ID_NUM', 'I-NAME_STUDENT', 'I-PHONE_NUM', 'I-STREET_ADDRESS', 'I-URL_PERSONAL', 'O'
]
id2label = {i: l for i, l in enumerate(all_labels)}
label2id = {v: k for k, v in id2label.items()}
target = [l for l in all_labels if l != "O"]

## Tokenization

In [14]:
class CustomTokenizer:
    def __init__(
        self, 
        tokenizer: PreTrainedTokenizerBase, 
        label2id: dict, 
        max_length: int,
        stride: int,
    ) -> None:
        self.tokenizer = tokenizer
        self.label2id = label2id
        self.max_length = max_length
        self.stride = stride
        
    def encode_for_train(self, example: dict) -> dict:
        tokens, labels = [], []
        for token, label, t_ws in zip(
            example["tokens"], example["provided_labels"], example["trailing_whitespace"]
        ):
            tokens.append(token)
            labels.extend([label] * len(token))
            if t_ws: # trainling whitespace
                tokens.append(" ") # add space in tokens list
                labels.append("O") # add 'O' label in labels list
        text = "".join(tokens) # merge all the tokens to a text (string)

        # tokenize the text with max_length and stride to retain the data
        tokenized = tokenizer(
            text, 
            max_length=self.max_length,
            return_overflowing_tokens=True, 
            stride=self.stride, 
            return_offsets_mapping=True,
            truncation=True,
        )    

        token_labels = [] # a list of list due to long sentences split up
        for sequence_offset_mapping in tokenized.offset_mapping:
            sequence_token_labels = []
            for start_idx, end_idx in sequence_offset_mapping:
                if start_idx == 0 and end_idx == 0: # CLS token
                    sequence_token_labels.append(label2id["O"]) 
                    continue
                if text[start_idx].isspace(): # text start with a whitespace
                    start_idx += 1
                label = labels[start_idx]
                sequence_token_labels.append(label2id[label])
            token_labels.append(sequence_token_labels)
        tokenized['labels'] = token_labels

        return {**tokenized}

    def encode_for_eval(self, example: dict) -> dict:
        # rebuild text from tokens
        text, labels, token_map = [], [], []

        for idx, (t, l, ws) in enumerate(
            zip(example["tokens"], example["provided_labels"], example["trailing_whitespace"])
        ):
            text.append(t)
            labels.extend([l] * len(t))
            token_map.extend([idx]*len(t))

            if ws:
                text.append(" ")
                labels.append("O")
                token_map.append(-1)

        text = "".join(text)
        labels = np.array(labels)

        # actual tokenization
        tokenized = self.tokenizer(
            "".join(text),
            return_offsets_mapping=True,
            truncation=True,
            max_length=self.max_length
        )

        token_labels = []

        for start_idx, end_idx in tokenized.offset_mapping:
            # CLS token
            if start_idx == 0 and end_idx == 0:
                token_labels.append(self.label2id["O"])
                continue

            # case when token starts with whitespace
            if text[start_idx].isspace():
                start_idx += 1

            token_labels.append(self.label2id[labels[start_idx]])

        length = len(tokenized.input_ids)

        return {**tokenized, "labels": token_labels, "length": length, "token_map": token_map}

### Sliding window expansion

In [10]:
def rebuild_text(tokens, trailing_whitespace):
    text = ""
    for token, ws in zip(tokens, trailing_whitespace):
        ws = " " if ws else ""
        text += token + ws
    return text

def stride_rows(data, max_length, stride):
    expanded = []
    for row in data:
        tokens = row['tokens']
        if len(tokens) > max_length:
            start = 0
            while start < len(tokens):
                remaining_tokens = len(tokens) - start
                if remaining_tokens < max_length and start != 0:
                    # Adjust start for the last window to ensure it has max_length tokens
                    start = max(0, len(tokens) - max_length)
                end = min(start + max_length, len(tokens))
                new_row = {}
                new_row['document'] = row['document']
                new_row['tokens'] = tokens[start:end]
                new_row['trailing_whitespace'] = row['trailing_whitespace'][start:end]
                new_row['labels'] = row['labels'][start:end]
                new_row['token_indices'] = list(range(start, end))
                new_row['full_text'] = rebuild_text(new_row['tokens'], new_row['trailing_whitespace'])
                expanded.append(new_row)
                if remaining_tokens >= max_length:
                    start += stride
                else:
                    # Break the loop if we've adjusted for the last window
                    break
        else:
            new_row = {
                'document': row['document'], 
                'tokens': row['tokens'], 
                'trailing_whitespace': row['trailing_whitespace'], 
                'labels': row['labels'], 
                'token_indices': list(range(len(row["tokens"]))), 
                'full_text': row['full_text']
            }
            expanded.append(new_row)
    return expanded

## Tokenizer

In [None]:
tokenizer = DebertaV2TokenizerFast.from_pretrained(TRAINING_MODEL_PATH)
train_encoder = CustomTokenizer(tokenizer=tokenizer, label2id=label2id, max_length=TRAINING_MAX_LENGTH)
eval_encoder = CustomTokenizer(tokenizer=tokenizer, label2id=label2id, max_length=EVAL_MAX_LENGTH)

NameError: name 'MAX_LENGTH' is not defined

## Metrics

In [12]:
class MetricsComputer:
    def __init__(self, all_labels: list[str], beta: float = 5.0) -> None:
        self.all_labels = all_labels
        self.beta = beta
        
    def __call__(self, preds: EvalPrediction) -> dict[str, float]:
        predictions, labels = preds
        predictions = np.argmax(predictions, axis=2)

        # Remove ignored index (special tokens)
        true_predictions = [
            [self.all_labels[p] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]
        true_labels = [
            [self.all_labels[l] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]

        recall = recall_score(true_labels, true_predictions)
        precision = precision_score(true_labels, true_predictions)
        f5_score = (1 + self.beta ** 2) * recall * precision / ((self.beta ** 2) * precision + recall)

        results = {
            'recall': recall,
            'precision': precision,
            'f5': f5_score
        }
        return results

# compute_metrics = MetricsComputer(all_labels=all_labels)

In [13]:
def find_span(target: list[str], document: list[str]) -> list[list[int]]:
    idx = 0
    spans = []
    span = []

    for i, token in enumerate(document):
        if token != target[idx]:
            idx = 0
            span = []
            continue
        span.append(i)
        idx += 1
        if idx == len(target):
            spans.append(span)
            span = []
            idx = 0
            continue
    
    return spans

In [14]:
class PRFScore:
    """A precision / recall / F score."""

    def __init__(
        self,
        *,
        tp: int = 0,
        fp: int = 0,
        fn: int = 0,
    ) -> None:
        self.tp = tp
        self.fp = fp
        self.fn = fn

    def __len__(self) -> int:
        return self.tp + self.fp + self.fn

    def __iadd__(self, other):  # in-place add
        self.tp += other.tp
        self.fp += other.fp
        self.fn += other.fn
        return self

    def __add__(self, other):
        return PRFScore(
            tp=self.tp + other.tp, fp=self.fp + other.fp, fn=self.fn + other.fn
        )

    def score_set(self, cand: set, gold: set) -> None:
        self.tp += len(cand.intersection(gold))
        self.fp += len(cand - gold)
        self.fn += len(gold - cand)

    @property
    def precision(self) -> float:
        return self.tp / (self.tp + self.fp + 1e-100)

    @property
    def recall(self) -> float:
        return self.tp / (self.tp + self.fn + 1e-100)

    @property
    def f1(self) -> float:
        p = self.precision
        r = self.recall
        return 2 * ((p * r) / (p + r + 1e-100))

    @property
    def f5(self) -> float:
        beta = 5
        p = self.precision
        r = self.recall

        fbeta = (1+(beta**2))*p*r / ((beta**2)*p + r + 1e-100)
        return fbeta

    def to_dict(self) -> dict[str, float]:
        return {"p": self.precision, "r": self.recall, "f5": self.f5}


class MetricsComputerV2:
    nlp = English()

    def __init__(self, eval_ds: Dataset, label2id: dict, conf_thresh: float = 0.9) -> None:
        self.ds = eval_ds.remove_columns("labels").rename_columns({"provided_labels": "labels"})
        self.gt_df = self.create_gt_df(self.ds)
        self.label2id = label2id
        self.confth = conf_thresh
        
    def __call__(self, eval_preds: EvalPrediction) -> dict:
        pred_df = self.create_pred_df(eval_preds.predictions)
        return self.compute_metrics_from_df(self.gt_df, pred_df)
        
    @staticmethod
    def create_gt_df(ds: Dataset):
        gt = []
        for row in ds:
            for token_idx, (token, label) in enumerate(zip(row["tokens"], row["labels"])):
                if label == "O":
                    continue
                gt.append(
                    {"document": row["document"], "token": token_idx, "label": label, "token_str": token}
                )
        gt_df = pd.DataFrame(gt)
        gt_df["row_id"] = gt_df.index
        
        return gt_df
    
    def create_pred_df(self, prediction: np.ndarray) -> pd.DataFrame:
        ### construct prediction df
        o_index = self.label2id["O"]
        preds = prediction.argmax(-1)
        preds_without_o = prediction[:,:,:o_index].argmax(-1)
        o_preds = prediction[:,:,o_index]
        preds_final = np.where(o_preds < self.confth, preds_without_o , preds)

        triplets = set()
        processed = []

        # Iterate over document
        for p_doc, token_map, offsets, tokens, doc in zip(
            preds_final, self.ds["token_map"], self.ds["offset_mapping"], self.ds["tokens"], self.ds["document"]
        ):
            # Iterate over sequence
            for p_token, (start_idx, end_idx) in zip(p_doc, offsets):
                label_pred = id2label[p_token]

                if start_idx + end_idx == 0:
                    # [CLS] token i.e. BOS
                    continue

                if token_map[start_idx] == -1:
                    start_idx += 1

                # ignore "\n\n"
                while start_idx < len(token_map) and tokens[token_map[start_idx]].isspace():
                    start_idx += 1

                if start_idx >= len(token_map): 
                    break

                token_id = token_map[start_idx]
                triplet = (label_pred, token_id, tokens[token_id])

                # ignore "O", preds, phone number and  email
                if label_pred in ("O", "B-EMAIL", "B-PHONE_NUM", "B-PHONE_NUM") or token_id == -1:
                    continue   

                if triplet in triplets:
                    continue

                processed.append(
                    {"document": doc, "token": token_id, "label": label_pred, "token_str": tokens[token_id]}
                )
                triplets.add(triplet)

        email_regex = re.compile(r'[\w.+-]+@[\w-]+\.[\w.-]+')
        phone_num_regex = re.compile(r"(\(\d{3}\)\d{3}\-\d{4}\w*|\d{3}\.\d{3}\.\d{4})\s")
        emails = []
        phone_nums = []

        for _data in self.ds:
            # email
            for token_idx, token in enumerate(_data["tokens"]):
                if re.fullmatch(email_regex, token) is not None:
                    emails.append(
                        {"document": _data["document"], "token": token_idx, "label": "B-EMAIL", "token_str": token}
                    )
            # phone number
            matches = phone_num_regex.findall(_data["full_text"])
            if not matches:
                continue
            for match in matches:
                target = [t.text for t in self.nlp.tokenizer(match)]
                matched_spans = find_span(target, _data["tokens"])
            for matched_span in matched_spans:
                for intermediate, token_idx in enumerate(matched_span):
                    prefix = "I" if intermediate else "B"
                    phone_nums.append(
                        {"document": _data["document"], "token": token_idx, "label": f"{prefix}-PHONE_NUM", "token_str": _data["tokens"][token_idx]}
                    )

        pred_df = pd.DataFrame(processed + emails + phone_nums)
        pred_df["row_id"] = list(range(len(pred_df)))

        return pred_df
        
    def compute_metrics_from_df(self, gt_df, pred_df):
        """
        Compute the LB metric (lb) and other auxiliary metrics
        """

        references = {(row.document, row.token, row.label) for row in gt_df.itertuples()}
        predictions = {(row.document, row.token, row.label) for row in pred_df.itertuples()}

        score_per_type = defaultdict(PRFScore)
        references = set(references)

        for ex in predictions:
            pred_type = ex[-1] # (document, token, label)
            if pred_type != 'O':
                pred_type = pred_type[2:] # avoid B- and I- prefix

            if pred_type not in score_per_type:
                score_per_type[pred_type] = PRFScore()

            if ex in references:
                score_per_type[pred_type].tp += 1
                references.remove(ex)
            else:
                score_per_type[pred_type].fp += 1

        for doc, tok, ref_type in references:
            if ref_type != 'O':
                ref_type = ref_type[2:] # avoid B- and I- prefix

            if ref_type not in score_per_type:
                score_per_type[ref_type] = PRFScore()
            score_per_type[ref_type].fn += 1

        totals = PRFScore()

        for prf in score_per_type.values():
            totals += prf

        return {
            "precision": totals.precision,
            "recall": totals.recall,
            "f5": totals.f5,
            **{
                f"{v_k}-{k}": v_v 
                for k in set([l[2:] for l in self.label2id.keys() if l!= 'O'])
                for v_k, v_v in score_per_type[k].to_dict().items()
            },
        }

## Model

In [15]:
class ModelInit:
    def __init__(
        self, 
        checkpoint: str, 
        id2label: dict, 
        label2id: dict,
        freeze_embedding: bool,
        freeze_layers: int,
    ) -> None:
        self.model = DebertaV2ForTokenClassification.from_pretrained(
            checkpoint,
            num_labels=len(id2label),
            id2label=id2label,
            label2id=label2id,
            ignore_mismatched_sizes=True
        )
        for param in self.model.deberta.embeddings.parameters():
            param.requires_grad = False if freeze_embedding else True
        for layer in self.model.deberta.encoder.layer[:freeze_layers]:
            for param in layer.parameters():
                param.requires_grad = False
        self.weight = copy.deepcopy(self.model.state_dict())

    def __call__(self) -> DebertaV2ForTokenClassification:
        self.model.load_state_dict(self.weight)
        return self.model

model_init = ModelInit(
    TRAINING_MODEL_PATH, 
    id2label=id2label, 
    label2id=label2id, 
    freeze_embedding=FREEZE_EMBEDDING, 
    freeze_layers=FREEZE_LAYERS,
)

Downloading config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.62G [00:00<?, ?B/s]

Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at Gladiator/microsoft-deberta-v3-large_ner_conll2003 and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([9, 1024]) in the checkpoint and torch.Size([13, 1024]) in the model instantiated
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([13]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Split

In [16]:
# split according to document id
folds = [
    (
        np.array([i for i, d in enumerate(original_data) if int(d["document"]) % N_SPLITS != s]),
        np.array([i for i, d in enumerate(original_data) if int(d["document"]) % N_SPLITS == s])
    )
    for s in range(N_SPLITS)
]

## Train

#### CV

In [None]:
for fold_idx, (train_idx, eval_idx) in enumerate(folds):
    args.run_name = f"fold-{fold_idx}"
    args.output_dir = os.path.join(OUTPUT_DIR, f"fold_{fold_idx}")
    train_data = [d for d in original_data if int(d["document"]) % N_SPLITS != fold_idx]
    train_data = train_data + extra_data
    eval_data = [d for d in original_data if int(d["document"]) & N_SPLITS == fold_idx]
    train_data = stride_rows(train_data, max_length=TRAIN_MAX_LENGTH, stride=STRIDE)
    train_ds = Dataset.from_pandas(pd.DataFrame(train_data).rename(columns={"labels": "provided_labels"})) 
    eval_ds = Dataset.from_pandas(pd.DataFrame(eval_data).rename(columns={"labels": "provided_labels"}))
    train_ds = train_ds.map(train_encoder, num_proc=os.cpu_count())
    eval_ds = eval_ds.map(eval_encoder, num_proc=os.cpu_count())
    trainer = Trainer(
        args=args,
        model_init=model_init,
        train_dataset=train_ds,
        eval_dataset=eval_ds,
        tokenizer=tokenizer,
        compute_metrics=MetricsComputerV2(eval_ds=eval_ds, label2id=label2id),
        data_collator=DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16),
    )
    trainer.train()
    eval_res = trainer.evaluate(eval_dataset=eval_ds)
    with open(os.path.join(args.output_dir, "eval_result.json"), "w") as f:
        json.dump(eval_res, f)
    del trainer
    gc.collect()
    torch.cuda.empty_cache()
    wandb.finish()
    break
    
# keys = set(cv_res[0].keys())
# for key in keys:
#     wandb.log({f"cv_{key.split('_')[-1]}": np.nanmean([r.get(key, np.nan) for r in cv_res])})

         

#0:   0%|          | 0/788 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/788 [00:00<?, ?ex/s]

 

#2:   0%|          | 0/788 [00:00<?, ?ex/s]

 

#3:   0%|          | 0/788 [00:00<?, ?ex/s]

 

#4:   0%|          | 0/787 [00:00<?, ?ex/s]

 

#5:   0%|          | 0/787 [00:00<?, ?ex/s]

 

#6:   0%|          | 0/787 [00:00<?, ?ex/s]

 

#7:   0%|          | 0/787 [00:00<?, ?ex/s]

         

#0:   0%|          | 0/427 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/427 [00:00<?, ?ex/s]

 

#2:   0%|          | 0/427 [00:00<?, ?ex/s]

 

#3:   0%|          | 0/427 [00:00<?, ?ex/s]

 

#4:   0%|          | 0/427 [00:00<?, ?ex/s]

 

#5:   0%|          | 0/427 [00:00<?, ?ex/s]

 

#6:   0%|          | 0/427 [00:00<?, ?ex/s]

 

#7:   0%|          | 0/426 [00:00<?, ?ex/s]

Using cuda_amp half precision backend
The following columns in the training set don't have a corresponding argument in `DebertaV2ForTokenClassification.forward` and have been ignored: token_map, length, trailing_whitespace, full_text, document, tokens, provided_labels, offset_mapping, token_indices. If token_map, length, trailing_whitespace, full_text, document, tokens, provided_labels, offset_mapping, token_indices are not expected by `DebertaV2ForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 6300
  Num Epochs = 2
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 8
  Total optimization steps = 392
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss,Precision,Recall,F5,P-email,R-email,F5-email,P-name Student,R-name Student,F5-name Student,P-url Personal,R-url Personal,F5-url Personal,P-phone Num,R-phone Num,F5-phone Num,P-street Address,R-street Address,F5-street Address,P-username,R-username,F5-username,P-id Num,R-id Num,F5-id Num
100,0.004,0.003595,0.976744,0.029936,0.031095,1.0,1.0,1.0,0.0,0.0,0.0,0.8,0.060606,0.06284,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
150,0.0027,0.002527,0.719309,0.831076,0.826139,1.0,1.0,1.0,0.709801,0.889246,0.880682,0.909091,0.30303,0.311005,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
200,0.0024,0.001399,0.818774,0.895225,0.892021,1.0,1.0,1.0,0.826149,0.922953,0.918813,0.64,0.969697,0.950857,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.1,0.103586
250,0.0019,0.000984,0.838535,0.880969,0.879258,1.0,1.0,1.0,0.8434,0.907705,0.905051,0.682353,0.878788,0.869164,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.9,0.225,0.231683
300,0.0007,0.000948,0.81548,0.938703,0.933279,1.0,1.0,1.0,0.825752,0.94703,0.941711,0.656566,0.984848,0.966266,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.693878,0.85,0.842707
350,0.0011,0.000787,0.878498,0.91732,0.915764,1.0,1.0,1.0,0.889404,0.922953,0.921616,0.752941,0.969697,0.959078,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.714286,0.875,0.867493


Saving model checkpoint to output2/fold_0/checkpoint-50
Configuration saved in output2/fold_0/checkpoint-50/config.json
Model weights saved in output2/fold_0/checkpoint-50/pytorch_model.bin
tokenizer config file saved in output2/fold_0/checkpoint-50/tokenizer_config.json
Special tokens file saved in output2/fold_0/checkpoint-50/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `DebertaV2ForTokenClassification.forward` and have been ignored: token_map, length, trailing_whitespace, full_text, document, tokens, provided_labels, offset_mapping. If token_map, length, trailing_whitespace, full_text, document, tokens, provided_labels, offset_mapping are not expected by `DebertaV2ForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3415
  Batch size = 4
Saving model checkpoint to output2/fold_0/checkpoint-100
Configuration saved in output2/fold_0/checkpoint-100/config.json


#### Train with full data

In [None]:
train_ds = concatenate_datasets([ds["original"], ds["extra"]])
train_ds = train_ds.map(train_encoder, num_proc=os.cpu_count())
args.evaluation_strategy = "no"
args.run_name = f"all_data"
trainer = Trainer(
    args=args,
    model_init=model_init,
    train_dataset=train_ds,
    tokenizer=tokenizer,
    data_collator=DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16),
)
trainer.train()
trainer.save_model(OUTPUT_DIR)
wandb.finish()