In [1]:
%env WANDB_PROJECT=PII

env: WANDB_PROJECT=PII


In [2]:
%env WANDB_RUN_GROUP=mlm-small-filter+M+P-gamma=0

env: WANDB_RUN_GROUP=mlm-small-filter+M+P-gamma=0


In [3]:
!pip install -q -U seqeval evaluate transformers[torch]

[0m

In [4]:
import json
import copy
import gc
import os
import re
from collections import defaultdict
from pathlib import Path

import torch
from torch import Tensor, nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
from spacy.lang.en import English
from transformers.tokenization_utils import PreTrainedTokenizerBase
from transformers.models.deberta_v2 import DebertaV2ForTokenClassification, DebertaV2TokenizerFast
from transformers.trainer import Trainer
from transformers.training_args import TrainingArguments
from transformers.trainer_utils import EvalPrediction, PredictionOutput
from transformers.data.data_collator import DataCollatorForTokenClassification
from sklearn.model_selection import KFold
from datasets import Dataset, DatasetDict, concatenate_datasets
from seqeval.metrics import recall_score, precision_score
import wandb

In [5]:
wandb.login(key=WANB_KEY)

[34m[1mwandb[0m: Currently logged in as: [33memiz6413[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

## Config & Parameters

In [6]:
DATA_DIR = Path("../dataset/")
OUTPUT_DIR = "output"
Path(OUTPUT_DIR).mkdir(exist_ok=True)

In [7]:
# TRAINING_MODEL_PATH = "hf-internal-testing/tiny-random-deberta-v2"
TOKENIZER_CHECKPOINT = "microsoft/deberta-v3-small"
MODEL_CHECKPOINT_SUFFIX = "emiz6413/deberta-v3-small_mlm_original-only_"
TRAINING_MAX_LENGTH = 1024 if "tiny-random" not in TOKENIZER_CHECKPOINT else 512
EVAL_MAX_LENGTH = 3072 if "tiny-random" not in TOKENIZER_CHECKPOINT else 512
CONF_THRESH = 0.9
LR = 2e-5  # 1.5e-5 ~ 3e-5 for base # 5e-6 ~ 1e-5 for large
LR_SCHEDULER_TYPE = "cosine_with_restarts"
NUM_EPOCHS = 2 if "tiny-random" not in TOKENIZER_CHECKPOINT else 0.1
BATCH_SIZE = 32
EVAL_BATCH_SIZE = 16
GRAD_ACCUMULATION_STEPS = 1
WARMUP_RATIO = 0.1
WEIGHT_DECAY = 0.01
AMP = True
FREEZE_EMBEDDING = False
FREEZE_LAYERS = 0
GAMMA = 0
MASK_P = 0
# training data
N_SPLITS = 4
FILTER_ORIGINAL = True
EXTRA_1 = True
EXTRA_2 = True
EXTRA_3 = False

In [8]:
args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    fp16=AMP,
    learning_rate=LR,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUMULATION_STEPS,
    report_to="wandb",
    evaluation_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=50,
    save_total_limit=1,
    logging_steps=10,
    metric_for_best_model="f5",
    greater_is_better=True,
    load_best_model_at_end=False,
    overwrite_output_dir=True,
    lr_scheduler_type=LR_SCHEDULER_TYPE,
    warmup_ratio=WARMUP_RATIO,
    weight_decay=WEIGHT_DECAY,
)

## Dataset Preparation

In [9]:
with DATA_DIR.joinpath("train.json").open("r") as f:
    original_data = json.load(f)

extra_data = []  #

# Moth
if EXTRA_1:
    with DATA_DIR.joinpath("pii_dataset_fixed.json").open("r") as f:
        external_1 = json.load(f)
    print("external_1 datapoints: ", len(external_1))
    extra_data.extend(external_1)

# PJMathmatician
if EXTRA_2:
    with DATA_DIR.joinpath("moredata_dataset_fixed.json").open("r") as f:
        external_2 = json.load(f)
    print("external_2 datapoints: ", len(external_2))
    extra_data.extend(external_2)

# Nicholas
if EXTRA_3:
    with DATA_DIR.joinpath("mixtral-8x7b-v1.json").open("r") as f:
        external_3 = json.load(f)
    print("external_3 datapoints: ", len(external_3))
    extra_data.extend(external_3)

print("len(extra_data): {len(extra_data)}")

external_1 datapoints:  4434
external_2 datapoints:  2000
len(extra_data): {len(extra_data)}


In [10]:
all_labels = [
    'B-EMAIL', 'B-ID_NUM', 'B-NAME_STUDENT', 'B-PHONE_NUM', 'B-STREET_ADDRESS', 'B-URL_PERSONAL', 'B-USERNAME', 'I-ID_NUM', 'I-NAME_STUDENT', 'I-PHONE_NUM', 'I-STREET_ADDRESS', 'I-URL_PERSONAL', 'O'
]
id2label = {i: l for i, l in enumerate(all_labels)}
label2id = {v: k for k, v in id2label.items()}
target = [l for l in all_labels if l != "O"]

## Tokenization

In [11]:
class CustomTokenizer:
    def __init__(self, tokenizer: PreTrainedTokenizerBase, label2id: dict, max_length: int) -> None:
        self.tokenizer = tokenizer
        self.label2id = label2id
        self.max_length = max_length

    def __call__(self, example: dict) -> dict:
        # rebuild text from tokens
        text, labels, token_map = [], [], []

        for idx, (t, l, ws) in enumerate(
            zip(example["tokens"], example["provided_labels"], example["trailing_whitespace"])
        ):
            text.append(t)
            labels.extend([l] * len(t))
            token_map.extend([idx]*len(t))

            if ws:
                text.append(" ")
                labels.append("O")
                token_map.append(-1)

        text = "".join(text)
        labels = np.array(labels)

        # actual tokenization
        tokenized = self.tokenizer(
            "".join(text),
            return_offsets_mapping=True,
            truncation=True,
            max_length=self.max_length
        )

        token_labels = []

        for start_idx, end_idx in tokenized.offset_mapping:
            # CLS token
            if start_idx == 0 and end_idx == 0:
                token_labels.append(self.label2id["O"])
                continue

            # case when token starts with whitespace
            if text[start_idx].isspace():
                start_idx += 1

            token_labels.append(self.label2id[labels[start_idx]])

        length = len(tokenized.input_ids)

        return {**tokenized, "labels": token_labels, "length": length, "token_map": token_map}

## Augmentation

In [12]:
class RandomCutOut:
    def __init__(self, mask_p: float, mask_token_id: int) -> None:
        self.mask_p = mask_p
        self.mask_token_id = mask_token_id

    def __call__(self, batch: dict) -> dict:
        if self.mask_p == 0:
            return batch
        new_input_ids_list = []
        for input_ids in batch["input_ids"]:
            mask = np.random.binomial(1, p=self.mask_p, size=(len(input_ids),))
            new_input_ids = np.where(mask, self.mask_token_id, input_ids)
            new_input_ids_list.append(new_input_ids.tolist())
        batch["input_ids"] = new_input_ids_list
        return batch

## Instantiate the dataset

In [13]:
tokenizer = DebertaV2TokenizerFast.from_pretrained(TOKENIZER_CHECKPOINT)
train_encoder = CustomTokenizer(tokenizer=tokenizer, label2id=label2id, max_length=TRAINING_MAX_LENGTH)
eval_encoder = CustomTokenizer(tokenizer=tokenizer, label2id=label2id, max_length=EVAL_MAX_LENGTH)
train_augmentation = RandomCutOut(mask_p=MASK_P, mask_token_id=tokenizer.mask_token_id)

ds = DatasetDict()

for key, data in zip(["original", "extra"], [original_data, extra_data]):
    ds[key] = Dataset.from_dict({
        "full_text": [x["full_text"] for x in data],
        "document": [str(x["document"]) for x in data],
        "tokens": [x["tokens"] for x in data],
        "trailing_whitespace": [x["trailing_whitespace"] for x in data],
        "provided_labels": [x["labels"] for x in data],
    })

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]



## Metrics

In [14]:
class MetricsComputer:
    def __init__(self, all_labels: list[str], beta: float = 5.0) -> None:
        self.all_labels = all_labels
        self.beta = beta

    def __call__(self, preds: EvalPrediction) -> dict[str, float]:
        predictions, labels = preds
        predictions = np.argmax(predictions, axis=2)

        # Remove ignored index (special tokens)
        true_predictions = [
            [self.all_labels[p] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]
        true_labels = [
            [self.all_labels[l] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]

        recall = recall_score(true_labels, true_predictions)
        precision = precision_score(true_labels, true_predictions)
        f5_score = (1 + self.beta ** 2) * recall * precision / ((self.beta ** 2) * precision + recall)

        results = {
            'recall': recall,
            'precision': precision,
            'f5': f5_score
        }
        return results

# compute_metrics = MetricsComputer(all_labels=all_labels)

In [15]:
def find_span(target: list[str], document: list[str]) -> list[list[int]]:
    idx = 0
    spans = []
    span = []

    for i, token in enumerate(document):
        if token != target[idx]:
            idx = 0
            span = []
            continue
        span.append(i)
        idx += 1
        if idx == len(target):
            spans.append(span)
            span = []
            idx = 0
            continue

    return spans

In [16]:
class PRFScore:
    """A precision / recall / F score."""

    def __init__(
        self,
        *,
        tp: int = 0,
        fp: int = 0,
        fn: int = 0,
    ) -> None:
        self.tp = tp
        self.fp = fp
        self.fn = fn

    def __len__(self) -> int:
        return self.tp + self.fp + self.fn

    def __iadd__(self, other):  # in-place add
        self.tp += other.tp
        self.fp += other.fp
        self.fn += other.fn
        return self

    def __add__(self, other):
        return PRFScore(
            tp=self.tp + other.tp, fp=self.fp + other.fp, fn=self.fn + other.fn
        )

    def score_set(self, cand: set, gold: set) -> None:
        self.tp += len(cand.intersection(gold))
        self.fp += len(cand - gold)
        self.fn += len(gold - cand)

    @property
    def precision(self) -> float:
        return self.tp / (self.tp + self.fp + 1e-100)

    @property
    def recall(self) -> float:
        return self.tp / (self.tp + self.fn + 1e-100)

    @property
    def f1(self) -> float:
        p = self.precision
        r = self.recall
        return 2 * ((p * r) / (p + r + 1e-100))

    @property
    def f5(self) -> float:
        beta = 5
        p = self.precision
        r = self.recall

        fbeta = (1+(beta**2))*p*r / ((beta**2)*p + r + 1e-100)
        return fbeta

    def to_dict(self) -> dict[str, float]:
        return {"p": self.precision, "r": self.recall, "f5": self.f5}


class MetricsComputerV2:
    nlp = English()

    def __init__(self, eval_ds: Dataset, label2id: dict, conf_thresh: float = 0.9) -> None:
        self.ds = eval_ds.remove_columns("labels").rename_columns({"provided_labels": "labels"})
        self.gt_df = self.create_gt_df(self.ds)
        self.label2id = label2id
        self.confth = conf_thresh
        self._search_gt()

    def __call__(self, eval_preds: EvalPrediction) -> dict:
        pred_df = self.create_pred_df(eval_preds.predictions)
        return self.compute_metrics_from_df(self.gt_df, pred_df)

    def _search_gt(self) -> None:
        email_regex = re.compile(r'[\w.+-]+@[\w-]+\.[\w.-]+')
        phone_num_regex = re.compile(r"(\(\d{3}\)\d{3}\-\d{4}\w*|\d{3}\.\d{3}\.\d{4})\s")
        self.emails = []
        self.phone_nums = []

        for _data in self.ds:
            # email
            for token_idx, token in enumerate(_data["tokens"]):
                if re.fullmatch(email_regex, token) is not None:
                    self.emails.append(
                        {"document": _data["document"], "token": token_idx, "label": "B-EMAIL", "token_str": token}
                    )
            # phone number
            matches = phone_num_regex.findall(_data["full_text"])
            if not matches:
                continue
            for match in matches:
                target = [t.text for t in self.nlp.tokenizer(match)]
                matched_spans = find_span(target, _data["tokens"])
            for matched_span in matched_spans:
                for intermediate, token_idx in enumerate(matched_span):
                    prefix = "I" if intermediate else "B"
                    self.phone_nums.append(
                        {"document": _data["document"], "token": token_idx, "label": f"{prefix}-PHONE_NUM", "token_str": _data["tokens"][token_idx]}
                    )

    @staticmethod
    def create_gt_df(ds: Dataset):
        gt = []
        for row in ds:
            for token_idx, (token, label) in enumerate(zip(row["tokens"], row["labels"])):
                if label == "O":
                    continue
                gt.append(
                    {"document": row["document"], "token": token_idx, "label": label, "token_str": token}
                )
        gt_df = pd.DataFrame(gt)
        gt_df["row_id"] = gt_df.index

        return gt_df

    def create_pred_df(self, prediction: np.ndarray) -> pd.DataFrame:
        ### construct prediction df
        o_index = self.label2id["O"]
        preds = prediction.argmax(-1)
        preds_without_o = prediction[:,:,:o_index].argmax(-1)
        o_preds = prediction[:,:,o_index]
        preds_final = np.where(o_preds < self.confth, preds_without_o , preds)

        triplets = set()
        processed = []

        # Iterate over document
        for p_doc, token_map, offsets, tokens, doc in zip(
            preds_final, self.ds["token_map"], self.ds["offset_mapping"], self.ds["tokens"], self.ds["document"]
        ):
            # Iterate over sequence
            for p_token, (start_idx, end_idx) in zip(p_doc, offsets):
                label_pred = id2label[p_token]

                if start_idx + end_idx == 0:
                    # [CLS] token i.e. BOS
                    continue

                if token_map[start_idx] == -1:
                    start_idx += 1

                # ignore "\n\n"
                while start_idx < len(token_map) and tokens[token_map[start_idx]].isspace():
                    start_idx += 1

                if start_idx >= len(token_map):
                    break

                token_id = token_map[start_idx]
                triplet = (label_pred, token_id, tokens[token_id])

                # ignore "O", preds, phone number and  email
                if label_pred in ("O", "B-EMAIL", "B-PHONE_NUM", "B-PHONE_NUM") or token_id == -1:
                    continue

                if triplet in triplets:
                    continue

                processed.append(
                    {"document": doc, "token": token_id, "label": label_pred, "token_str": tokens[token_id]}
                )
                triplets.add(triplet)

        pred_df = pd.DataFrame(processed + self.emails + self.phone_nums)
        pred_df["row_id"] = list(range(len(pred_df)))

        return pred_df

    def compute_metrics_from_df(self, gt_df, pred_df):
        """
        Compute the LB metric (lb) and other auxiliary metrics
        """

        references = {(row.document, row.token, row.label) for row in gt_df.itertuples()}
        predictions = {(row.document, row.token, row.label) for row in pred_df.itertuples()}

        score_per_type = defaultdict(PRFScore)
        references = set(references)

        for ex in predictions:
            pred_type = ex[-1] # (document, token, label)
            if pred_type != 'O':
                pred_type = pred_type[2:] # avoid B- and I- prefix

            if pred_type not in score_per_type:
                score_per_type[pred_type] = PRFScore()

            if ex in references:
                score_per_type[pred_type].tp += 1
                references.remove(ex)
            else:
                score_per_type[pred_type].fp += 1

        for doc, tok, ref_type in references:
            if ref_type != 'O':
                ref_type = ref_type[2:] # avoid B- and I- prefix

            if ref_type not in score_per_type:
                score_per_type[ref_type] = PRFScore()
            score_per_type[ref_type].fn += 1

        totals = PRFScore()

        for prf in score_per_type.values():
            totals += prf

        return {
            "precision": totals.precision,
            "recall": totals.recall,
            "f5": totals.f5,
            **{
                f"{v_k}-{k}": v_v
                for k in set([l[2:] for l in self.label2id.keys() if l!= 'O'])
                for v_k, v_v in score_per_type[k].to_dict().items()
            },
        }

## Model

In [22]:
class ModelInit:
    def __init__(
        self,
        checkpoint: str,
        id2label: dict,
        label2id: dict,
        freeze_embedding: bool,
        freeze_layers: int,
    ) -> None:
        self.model = DebertaV2ForTokenClassification.from_pretrained(
            checkpoint,
            num_labels=len(id2label),
            id2label=id2label,
            label2id=label2id,
            ignore_mismatched_sizes=True,
            use_safetensors=True,
            token="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"
        )
        for param in self.model.deberta.embeddings.parameters():
            param.requires_grad = False if freeze_embedding else True
        for layer in self.model.deberta.encoder.layer[:freeze_layers]:
            for param in layer.parameters():
                param.requires_grad = False
        self.weight = copy.deepcopy(self.model.state_dict())

    def __call__(self) -> DebertaV2ForTokenClassification:
        self.model.load_state_dict(self.weight)
        return self.model

## Split

In [18]:
# split according to document id
folds = [
    (
        np.array([i for i, d in enumerate(ds["original"]["document"]) if int(d) % N_SPLITS != s]),
        np.array([i for i, d in enumerate(ds["original"]["document"]) if int(d) % N_SPLITS == s])
    )
    for s in range(N_SPLITS)
]

exclude_indices = []
if FILTER_ORIGINAL:
    negative_idxs = [i for i, labels in enumerate(ds["original"]["provided_labels"]) if not any(np.array(labels) != "O")]
    exclude_indices = negative_idxs[len(negative_idxs)//3:]

## Trainer with custom loss

In [19]:
class FocalLoss(nn.Module):
    def __init__(
        self, weight = None, gamma = 2., reduction = "mean", ignore_index: int = -100
    ) -> None:
        super().__init__()
        self.ce = nn.CrossEntropyLoss(weight=weight, reduction="none", ignore_index=ignore_index)
        self.ignore_index = ignore_index
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, logits, target):
        logits = logits.permute(0, 2, 1)  # b, seq, c -> b, c, seq
        mask = target != self.ignore_index
        ce_loss = self.ce(logits, target)
        pt = torch.exp(-ce_loss)
        f_loss = (1 - pt) ** self.gamma * ce_loss

        if self.reduction == "mean":
            f_loss = torch.mean(torch.masked_select(f_loss, mask))
        elif self.reduction == "sum":
            f_loss = torch.sum(torch.masked_select(f_loss, mask))
        else:
            f_loss = torch.where(mask, f_loss, 0)

        return f_loss

In [20]:
loss_fn = FocalLoss(gamma=GAMMA)

class FocalLossTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        _, outputs = super().compute_loss(model, inputs, True)
        labels = inputs["labels"]
        loss = loss_fn(logits=outputs["logits"], target=labels)
        outputs["loss"] = loss
        return (loss, outputs) if return_outputs else loss

## Train

#### CV

In [None]:
for fold_idx, (train_idx, eval_idx) in enumerate(folds):
    args.run_name = f"fold-{fold_idx}"
    args.output_dir = os.path.join(OUTPUT_DIR, f"fold_{fold_idx}")
    if Path(args.output_dir).joinpath("eval_result.json").exists():
        continue
    model_init = ModelInit(
        MODEL_CHECKPOINT_SUFFIX + f"fold-{fold_idx}",
        id2label=id2label,
        label2id=label2id,
        freeze_embedding=FREEZE_EMBEDDING,
        freeze_layers=FREEZE_LAYERS,
    )
    original_ds = ds["original"].select([i for i in train_idx if i not in exclude_indices])
    train_ds = concatenate_datasets([original_ds, ds["extra"]])
    train_ds = train_ds.map(train_encoder, num_proc=os.cpu_count())
    train_ds.set_transform(train_augmentation)
    eval_ds = ds["original"].select(eval_idx)
    eval_ds = eval_ds.map(eval_encoder, num_proc=os.cpu_count())
    trainer = FocalLossTrainer(
        args=args,
        model_init=model_init,
        train_dataset=train_ds,
        eval_dataset=eval_ds,
        tokenizer=tokenizer,
        compute_metrics=MetricsComputerV2(eval_ds=eval_ds, label2id=label2id),
        data_collator=DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16),
    )
    trainer.train()
    eval_res = trainer.evaluate(eval_dataset=eval_ds)
    with open(os.path.join(args.output_dir, "eval_result.json"), "w") as f:
        json.dump(eval_res, f)
    del trainer, model_init
    gc.collect()
    torch.cuda.empty_cache()
    wandb.finish()

config.json:   0%|          | 0.00/866 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/568M [00:00<?, ?B/s]

Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at emiz6413/deberta-v3-small_mlm_original-only_fold-1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


         

#0:   0%|          | 0/1076 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/1075 [00:00<?, ?ex/s]

 

#2:   0%|          | 0/1075 [00:00<?, ?ex/s]

 

#3:   0%|          | 0/1075 [00:00<?, ?ex/s]

 

#4:   0%|          | 0/1075 [00:00<?, ?ex/s]

 

#5:   0%|          | 0/1075 [00:00<?, ?ex/s]

 

#6:   0%|          | 0/1075 [00:00<?, ?ex/s]

 

#7:   0%|          | 0/1075 [00:00<?, ?ex/s]

         

#0:   0%|          | 0/215 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/215 [00:00<?, ?ex/s]

 

#2:   0%|          | 0/214 [00:00<?, ?ex/s]

 

#3:   0%|          | 0/214 [00:00<?, ?ex/s]

 

#4:   0%|          | 0/214 [00:00<?, ?ex/s]

 

#5:   0%|          | 0/214 [00:00<?, ?ex/s]

 

#6:   0%|          | 0/214 [00:00<?, ?ex/s]

 

#7:   0%|          | 0/214 [00:00<?, ?ex/s]

Step,Training Loss,Validation Loss,Precision,Recall,F5,P-email,R-email,F5-email,P-username,R-username,F5-username,P-street Address,R-street Address,F5-street Address,P-url Personal,R-url Personal,F5-url Personal,P-name Student,R-name Student,F5-name Student,P-phone Num,R-phone Num,F5-phone Num,P-id Num,R-id Num,F5-id Num
50,0.2245,0.025557,0.956522,0.03022,0.031389,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
100,0.0652,0.008823,0.341253,0.651099,0.629129,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.276316,0.636364,0.605993,0.354733,0.667183,0.645321,1.0,1.0,1.0,0.0,0.0,0.0
150,0.0291,0.004456,0.534423,0.853022,0.833901,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.266129,1.0,0.90411,0.565482,0.862229,0.845171,0.8,1.0,0.990476,0.818182,0.333333,0.341108
200,0.0172,0.003158,0.580734,0.869505,0.853188,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.405405,0.909091,0.867631,0.595767,0.871517,0.856274,0.8,1.0,0.990476,0.642857,0.666667,0.665718
250,0.0146,0.002527,0.660638,0.853022,0.843574,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.909091,0.881356,0.671856,0.868421,0.858758,0.8,1.0,0.990476,1.0,0.296296,0.304539
300,0.0108,0.002265,0.666317,0.872253,0.862006,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.566038,0.909091,0.888383,0.671819,0.874613,0.864575,0.8,1.0,0.990476,0.75,0.666667,0.669528
350,0.0091,0.003057,0.581786,0.903846,0.885003,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.478261,1.0,0.959732,0.588057,0.899381,0.881433,0.8,1.0,0.990476,0.611111,0.814815,0.804501
400,0.0109,0.00251,0.611578,0.899725,0.883711,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.545455,0.909091,0.886364,0.619251,0.896285,0.881124,1.0,1.0,1.0,0.648649,0.888889,0.876404
450,0.0117,0.002862,0.656344,0.902473,0.889641,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.471429,1.0,0.958659,0.675613,0.896285,0.885165,0.8,1.0,0.990476,0.621622,0.851852,0.839888
500,0.0073,0.002685,0.600182,0.90522,0.887864,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.561404,0.969697,0.943311,0.601442,0.904025,0.886864,0.8,1.0,0.990476,0.6,0.777778,0.769014


0,1
eval/f5,▁▆███████████████
eval/f5-EMAIL,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/f5-ID_NUM,▁▁▄▆▃▆▇██▇▇▇▇▇███
eval/f5-NAME_STUDENT,▁▆███████████████
eval/f5-PHONE_NUM,██▁▁▁▁▁█▁▁▁▁▁▁▁▁▁
eval/f5-STREET_ADDRESS,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/f5-URL_PERSONAL,▁▅█▇▇▇█▇███▇█████
eval/f5-USERNAME,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/loss,█▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/p-EMAIL,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
eval/f5,0.89319
eval/f5-EMAIL,1.0
eval/f5-ID_NUM,0.83989
eval/f5-NAME_STUDENT,0.88986
eval/f5-PHONE_NUM,0.99048
eval/f5-STREET_ADDRESS,0.0
eval/f5-URL_PERSONAL,0.94118
eval/f5-USERNAME,0.0
eval/loss,0.00235
eval/p-EMAIL,1.0


config.json:   0%|          | 0.00/866 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/568M [00:00<?, ?B/s]

Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at emiz6413/deberta-v3-small_mlm_original-only_fold-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


         

#0:   0%|          | 0/1078 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/1078 [00:00<?, ?ex/s]

 

#2:   0%|          | 0/1078 [00:00<?, ?ex/s]

 

#3:   0%|          | 0/1078 [00:00<?, ?ex/s]

 

#4:   0%|          | 0/1078 [00:00<?, ?ex/s]

 

#5:   0%|          | 0/1078 [00:00<?, ?ex/s]

 

#6:   0%|          | 0/1078 [00:00<?, ?ex/s]

 

#7:   0%|          | 0/1078 [00:00<?, ?ex/s]

         

#0:   0%|          | 0/212 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/211 [00:00<?, ?ex/s]

 

#2:   0%|          | 0/211 [00:00<?, ?ex/s]

 

#3:   0%|          | 0/211 [00:00<?, ?ex/s]

 

#4:   0%|          | 0/211 [00:00<?, ?ex/s]

 

#5:   0%|          | 0/211 [00:00<?, ?ex/s]

 

#6:   0%|          | 0/211 [00:00<?, ?ex/s]

 

#7:   0%|          | 0/211 [00:00<?, ?ex/s]

Step,Training Loss,Validation Loss,Precision,Recall,F5,P-email,R-email,F5-email,P-username,R-username,F5-username,P-street Address,R-street Address,F5-street Address,P-url Personal,R-url Personal,F5-url Personal,P-name Student,R-name Student,F5-name Student,P-phone Num,R-phone Num,F5-phone Num,P-id Num,R-id Num,F5-id Num
50,0.2352,0.025898,0.9,0.012195,0.012676,0.888889,1.0,0.995215,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
100,0.0585,0.007646,0.499353,0.523035,0.522083,0.888889,1.0,0.995215,0.0,0.0,0.0,0.19403,0.590909,0.547812,0.338983,0.714286,0.685112,0.540881,0.520424,0.521182,0.5,1.0,0.962963,0.0,0.0,0.0
150,0.0253,0.002577,0.632231,0.829268,0.819446,0.888889,1.0,0.995215,0.6,0.75,0.742857,0.714286,0.681818,0.683012,0.469388,0.821429,0.798398,0.633448,0.836611,0.826417,0.5,1.0,0.962963,1.0,0.642857,0.651811
200,0.0155,0.003042,0.52117,0.917344,0.891286,0.888889,1.0,0.995215,0.272727,0.75,0.702703,0.714286,0.681818,0.683012,0.356164,0.928571,0.874515,0.521814,0.922844,0.896349,1.0,1.0,1.0,0.933333,1.0,0.99726
250,0.011,0.002354,0.577257,0.901084,0.882053,0.888889,1.0,0.995215,0.444444,1.0,0.954128,0.789474,0.681818,0.685413,0.511111,0.821429,0.802685,0.569668,0.909228,0.888851,1.0,1.0,1.0,0.928571,0.928571,0.928571
300,0.0105,0.002727,0.599628,0.872629,0.857611,0.888889,1.0,0.995215,0.5,1.0,0.962963,0.833333,0.681818,0.68662,0.9,0.642857,0.65,0.583085,0.886536,0.869139,1.0,1.0,1.0,0.923077,0.857143,0.859504
350,0.009,0.002569,0.607209,0.890244,0.874565,0.888889,1.0,0.995215,0.285714,0.5,0.485981,0.882353,0.681818,0.687831,0.512821,0.714286,0.703654,0.601005,0.90469,0.887443,1.0,1.0,1.0,0.928571,0.928571,0.928571
400,0.0092,0.002211,0.714286,0.860434,0.853715,0.888889,1.0,0.995215,0.4,1.0,0.945455,0.833333,0.681818,0.68662,0.428571,0.964286,0.920052,0.733161,0.856278,0.850783,1.0,1.0,1.0,0.875,1.0,0.994536
450,0.0065,0.002163,0.620148,0.909214,0.893201,0.888889,1.0,0.995215,0.4,1.0,0.945455,0.789474,0.681818,0.685413,0.473684,0.964286,0.927345,0.621259,0.910741,0.894707,1.0,1.0,1.0,0.823529,1.0,0.991826
500,0.0084,0.001992,0.633776,0.905149,0.890484,0.888889,1.0,0.995215,0.444444,1.0,0.954128,0.833333,0.681818,0.68662,0.586957,0.964286,0.941019,0.627883,0.906203,0.891012,1.0,1.0,1.0,0.823529,1.0,0.991826


0,1
eval/f5,▁▅▇██████████████
eval/f5-EMAIL,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/f5-ID_NUM,▁▁▆██▇███████████
eval/f5-NAME_STUDENT,▁▅▇██████████████
eval/f5-PHONE_NUM,█▁▁██████████████
eval/f5-STREET_ADDRESS,▁▇███████████████
eval/f5-URL_PERSONAL,▁▆▇█▇▆▆██████▇▇▇▇
eval/f5-USERNAME,▁▁▆▆██▅██████████
eval/loss,█▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/p-EMAIL,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
eval/f5,0.88833
eval/f5-EMAIL,0.99522
eval/f5-ID_NUM,0.99726
eval/f5-NAME_STUDENT,0.89155
eval/f5-PHONE_NUM,1.0
eval/f5-STREET_ADDRESS,0.68783
eval/f5-URL_PERSONAL,0.86898
eval/f5-USERNAME,0.95413
eval/loss,0.00175
eval/p-EMAIL,0.88889


config.json:   0%|          | 0.00/866 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/568M [00:00<?, ?B/s]

Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at emiz6413/deberta-v3-small_mlm_original-only_fold-3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


         

#0:   0%|          | 0/1077 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/1076 [00:00<?, ?ex/s]

 

#2:   0%|          | 0/1076 [00:00<?, ?ex/s]

 

#3:   0%|          | 0/1076 [00:00<?, ?ex/s]

 

#4:   0%|          | 0/1076 [00:00<?, ?ex/s]

 

#5:   0%|          | 0/1076 [00:00<?, ?ex/s]

 

#6:   0%|          | 0/1076 [00:00<?, ?ex/s]

 

#7:   0%|          | 0/1076 [00:00<?, ?ex/s]

         

#0:   0%|          | 0/214 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/214 [00:00<?, ?ex/s]

 

#2:   0%|          | 0/213 [00:00<?, ?ex/s]

 

#3:   0%|          | 0/213 [00:00<?, ?ex/s]

 

#4:   0%|          | 0/213 [00:00<?, ?ex/s]

 

#5:   0%|          | 0/213 [00:00<?, ?ex/s]

 

#6:   0%|          | 0/213 [00:00<?, ?ex/s]

 

#7:   0%|          | 0/213 [00:00<?, ?ex/s]

#### Train with full data

In [None]:
original_ds = ds["original"].select([i for i in range(len(ds["original"])) if i not in exclude_indices])
train_ds = concatenate_datasets([original_ds, ds["extra"]])
train_ds = train_ds.map(train_encoder, num_proc=os.cpu_count())
train_ds.set_transform(train_augmentation)
args.evaluation_strategy = "no"
args.save_strategy = "no"
args.run_name = f"all_data"
model_init = ModelInit(
        MODEL_CHECKPOINT_SUFFIX + "all",
        id2label=id2label,
        label2id=label2id,
        freeze_embedding=FREEZE_EMBEDDING,
        freeze_layers=FREEZE_LAYERS,
    )
trainer = FocalLossTrainer(
    args=args,
    model_init=model_init,
    train_dataset=train_ds,
    tokenizer=tokenizer,
    data_collator=DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16),
)
trainer.train()
trainer.save_model(os.path.join(OUTPUT_DIR, "all"))

# log cv result
results = dict()
for res_json_path in Path(OUTPUT_DIR).glob("fold*/eval_result.json"):
    fold = res_json_path.parent.name.split("_")[-1]
    with open(res_json_path, "r") as f:
        res = json.load(f)
        results[fold] = {k.replace("eval_", ""): v for k, v in res.items()}
results["cv"] = {key: np.mean([r[key] for r in results.values()]) for key in results["0"].keys()}
table = wandb.Table(columns=["fold"] + list(results["0"].keys()))
for f, res in results.items():
    table.add_data(f, *[res[c] for c in table.columns if c != "fold"])
wandb.log({"eval_result": table})

wandb.finish()