In [1]:
# Set the Weights & Biases project name and run group as environment variables
# for this kernel session (the Trainer below is configured with report_to="wandb").
%env WANDB_PROJECT=PII
%env WANDB_RUN_GROUP=bigbird-base-3072-filter+T-CE

env: WANDB_PROJECT=PII
env: WANDB_RUN_GROUP=bigbird-base-3072-filter+T-CE


In [2]:
!pip install -q seqeval evaluate

[0m

In [3]:
import json
import copy
import gc
import os
import re
from collections import defaultdict
from pathlib import Path

import torch
from torch import Tensor, nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
from spacy.lang.en import English
from transformers.tokenization_utils import PreTrainedTokenizerBase
from transformers.models.big_bird import BigBirdForTokenClassification, BigBirdTokenizerFast
from transformers.trainer import Trainer
from transformers.training_args import TrainingArguments
from transformers.trainer_utils import EvalPrediction, PredictionOutput
from transformers.data.data_collator import DataCollatorForTokenClassification
from sklearn.model_selection import KFold
from datasets import Dataset, DatasetDict, concatenate_datasets
from seqeval.metrics import recall_score, precision_score
import wandb

In [4]:
# Never commit credentials to a notebook: take the API key from the environment.
# When WANDB_API_KEY is unset, `key=None` lets wandb fall back to its own lookup
# (existing netrc entry or interactive prompt).
# NOTE(review): the key that used to be hardcoded here must be considered leaked and rotated.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

[34m[1mwandb[0m: Currently logged in as: [33memiz6413[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [5]:
# Directory holding the input JSON files (relative to the notebook's working directory).
DATA_DIR = Path("../dataset/")
# Root directory for checkpoints and per-fold evaluation results.
OUTPUT_DIR = "output"
# Create the output directory up front; no error if it already exists.
Path(OUTPUT_DIR).mkdir(exist_ok=True)

In [6]:
# Backbone checkpoint. A path containing "tiny-random" switches the length and
# epoch settings below to a much smaller configuration.
TRAINING_MODEL_PATH = "google/bigbird-roberta-base"
# Truncation length (in tokens) used by the training / evaluation encoders.
TRAINING_MAX_LENGTH = 3072 if "tiny-random" not in TRAINING_MODEL_PATH else 512
EVAL_MAX_LENGTH = 3072 if "tiny-random" not in TRAINING_MODEL_PATH else 512
# Presumably the threshold applied to the "O" class score when decoding predictions
# -- TODO confirm where this constant is consumed.
CONF_THRESH = 0.9
LR = 2.5e-5  # 1.5e-5 ~ 3e-5 for base # 5e-6 ~ 1e-5 for large
LR_SCHEDULER_TYPE = "linear"
NUM_EPOCHS = 3 if "tiny-random" not in TRAINING_MODEL_PATH else 0.1
# Per-device batch sizes for training and evaluation.
BATCH_SIZE = 8
EVAL_BATCH_SIZE = 8
# Accumulate gradients so that BATCH_SIZE * GRAD_ACCUMULATION_STEPS == 16 per device.
GRAD_ACCUMULATION_STEPS = 16 // BATCH_SIZE
WARMUP_RATIO = 0.1
WEIGHT_DECAY = 0.01
# Passed to ModelInit: whether to freeze the embedding parameters, and how many
# leading encoder layers to freeze.
FREEZE_EMBEDDING = False
FREEZE_LAYERS = 0
# NOTE(review): GAMMA and MASK_P are not referenced in the visible part of the
# notebook; their purpose cannot be determined from here.
GAMMA = 0
MASK_P = 0
# training data
# Number of cross-validation folds (documents are split by document id modulo N_SPLITS).
N_SPLITS = 4
# When True, part of the original documents that contain only "O" labels is
# excluded from training.
FILTER_ORIGINAL = True
# Switches selecting which external datasets are appended to the training data.
MOTH = False
PJMATHMATICIAN = False
NICHOLAS = False
MPWARE = False
TONYAROBERTSON = True

In [7]:
# Trainer configuration shared by all folds. The training loop overrides
# `run_name` and `output_dir` on this object for each fold.
args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    fp16=torch.cuda.is_available(),  # mixed precision only when CUDA is available
    learning_rate=LR,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUMULATION_STEPS,
    report_to="wandb",
    # Evaluate and checkpoint on the same 50-step cadence.
    evaluation_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=50,
    save_total_limit=1,
    logging_steps=10,
    # Model selection is driven by the "f5" value (higher is better); the best
    # checkpoint is reloaded when training ends.
    metric_for_best_model="f5",
    greater_is_better=True,
    load_best_model_at_end=True,
    overwrite_output_dir=True,
    lr_scheduler_type=LR_SCHEDULER_TYPE,
    warmup_ratio=WARMUP_RATIO,
    weight_decay=WEIGHT_DECAY,
)

## Load dataset

In [8]:
def _load_json(path: Path):
    """Read a UTF-8 encoded JSON file and return the decoded object."""
    # Explicit encoding so the result does not depend on the platform default.
    with path.open("r", encoding="utf-8") as f:
        return json.load(f)


# Competition training data.
original_data = _load_json(DATA_DIR.joinpath("train.json"))

# Optional external datasets: (enabled flag, file name under DATA_DIR, owner label for the log line).
EXTRA_SOURCES = [
    (MOTH, "pii_dataset_fixed.json", "Moth's"),
    (PJMATHMATICIAN, "moredata_dataset_fixed.json", "PJMathmatician's"),
    (NICHOLAS, "mixtral-8x7b-v1.json", "Nicholas'"),
    (MPWARE, "mpware_mixtral8x7b_v1.1.json", "MPWARE's"),
    (TONYAROBERTSON, "Fake_data_1850_218.json", "tonyarobertson's"),
]

# Records from every enabled external source, concatenated in the order listed above.
extra_data = []

for enabled, file_name, owner in EXTRA_SOURCES:
    if not enabled:
        continue
    external = _load_json(DATA_DIR.joinpath(file_name))
    print(f"{owner} datapoints: ", len(external))
    extra_data.extend(external)

print(f"len(extra_data): {len(extra_data)}")

tonyarobertson's datapoints:  1850
len(extra_data): 1850


In [9]:
# Full BIO label inventory; list position is the class id used by the model.
all_labels = [
    'B-EMAIL',
    'B-ID_NUM',
    'B-NAME_STUDENT',
    'B-PHONE_NUM',
    'B-STREET_ADDRESS',
    'B-URL_PERSONAL',
    'B-USERNAME',
    'I-ID_NUM',
    'I-NAME_STUDENT',
    'I-PHONE_NUM',
    'I-STREET_ADDRESS',
    'I-URL_PERSONAL',
    'O',
]
# class id -> label name, and the inverse mapping.
id2label = dict(enumerate(all_labels))
label2id = dict(zip(id2label.values(), id2label.keys()))
# Every label except the outside tag "O".
target = list(filter(lambda name: name != "O", all_labels))

## Tokenization

In [10]:
class CustomTokenizer:
    """Callable that encodes one pre-tokenized document and aligns labels to sub-word tokens.

    The document text is rebuilt from ``tokens`` and ``trailing_whitespace``, a
    per-character label list is derived from ``provided_labels``, the text is
    tokenized with offsets, and every produced token receives the label of the
    character it starts on.

    Args:
        tokenizer: Callable invoked as ``tokenizer(text, return_offsets_mapping=True,
            truncation=True, max_length=...)``; its result must expose
            ``offset_mapping`` and ``input_ids`` and be unpackable with ``**``.
        label2id: Mapping from label name to class id. Must contain ``"O"``.
        max_length: Truncation length forwarded to the tokenizer.
    """

    def __init__(self, tokenizer: PreTrainedTokenizerBase, label2id: dict, max_length: int) -> None:
        self.tokenizer = tokenizer
        self.label2id = label2id
        self.max_length = max_length

    def __call__(self, example: dict) -> dict:
        """Encode ``example`` and return the tokenizer output plus alignment fields.

        Args:
            example: Dict with ``tokens``, ``provided_labels`` and
                ``trailing_whitespace`` (parallel sequences).

        Returns:
            The tokenizer output extended with:
            ``labels`` (one class id per produced token, same length as ``input_ids``),
            ``length`` (number of produced tokens) and
            ``token_map`` (for each character of the rebuilt text, the index of the
            source token, or ``-1`` for an inserted whitespace character).
        """
        # Rebuild the text and keep, per character, its label and source token index.
        text_parts, char_labels, token_map = [], [], []

        for idx, (t, l, ws) in enumerate(
            zip(example["tokens"], example["provided_labels"], example["trailing_whitespace"])
        ):
            text_parts.append(t)
            char_labels.extend([l] * len(t))
            token_map.extend([idx] * len(t))

            if ws:
                text_parts.append(" ")
                char_labels.append("O")
                token_map.append(-1)

        text = "".join(text_parts)

        # Actual tokenization.
        tokenized = self.tokenizer(
            text,
            return_offsets_mapping=True,
            truncation=True,
            max_length=self.max_length
        )

        outside_id = self.label2id["O"]
        token_labels = []

        for start_idx, end_idx in tokenized.offset_mapping:
            # Tokens with a (0, 0) offset (e.g. CLS) carry no text.
            if start_idx == 0 and end_idx == 0:
                token_labels.append(outside_id)
                continue

            # Case when the token starts with whitespace: label it by the next character.
            if text[start_idx].isspace():
                start_idx += 1

            # Always emit exactly one label per token. Skipping a token here (as a
            # swallowed exception used to do) would shift every following label and
            # leave `labels` shorter than `input_ids`.
            if start_idx < len(char_labels):
                # Labels missing from label2id are treated as outside.
                token_labels.append(self.label2id.get(char_labels[start_idx], outside_id))
            else:
                # The whitespace skip moved past the end of the text.
                token_labels.append(outside_id)

        length = len(tokenized.input_ids)

        return {**tokenized, "labels": token_labels, "length": length, "token_map": token_map}

## Instantiate the dataset

In [11]:
tokenizer = BigBirdTokenizerFast.from_pretrained(TRAINING_MODEL_PATH)
# Both encoders share the tokenizer and label map; only the truncation length differs.
train_encoder = CustomTokenizer(tokenizer=tokenizer, label2id=label2id, max_length=TRAINING_MAX_LENGTH)
eval_encoder = CustomTokenizer(tokenizer=tokenizer, label2id=label2id, max_length=EVAL_MAX_LENGTH)

# One Dataset per raw source, stored column-wise.
ds = DatasetDict()
raw_splits = {"original": original_data, "extra": extra_data}

for key, data in raw_splits.items():
    columns = {
        "full_text": [],
        "document": [],
        "tokens": [],
        "trailing_whitespace": [],
        "provided_labels": [],
    }
    for x in data:
        columns["full_text"].append(x["full_text"])
        # Document ids are stored as strings.
        columns["document"].append(str(x["document"]))
        columns["tokens"].append(x["tokens"])
        columns["trailing_whitespace"].append(x["trailing_whitespace"])
        # The raw "labels" field is kept under "provided_labels".
        columns["provided_labels"].append(x["labels"])
    ds[key] = Dataset.from_dict(columns)

## Metrics

In [12]:
def find_span(target: list[str], document: list[str]) -> list[list[int]]:
    """Find every non-overlapping occurrence of ``target`` as a contiguous run in ``document``.

    The document is scanned left to right. When a window matches, its indices are
    recorded and scanning resumes right after it; otherwise the window advances by
    one position, so an occurrence that begins inside a failed partial match is
    still found (e.g. ``["a", "b"]`` in ``["a", "a", "b"]``).

    Args:
        target: Token sequence to look for.
        document: Token sequence to search in.

    Returns:
        A list of spans; each span is the list of ``document`` indices covered by
        one occurrence. Empty when ``target`` is empty or does not occur.
    """
    needle = list(target)
    width = len(needle)
    spans = []

    # An empty target matches nothing (and must not index into `target`).
    if width == 0:
        return spans

    i = 0
    last_start = len(document) - width
    while i <= last_start:
        if list(document[i:i + width]) == needle:
            spans.append(list(range(i, i + width)))
            # Occurrences do not overlap: continue after the matched window.
            i += width
        else:
            i += 1

    return spans


class PRFScore:
    """Running tally of true positives, false positives and false negatives.

    Precision, recall, F1 and F5 are derived on demand; a tiny epsilon in each
    denominator makes them evaluate to 0.0 when no counts have been recorded.
    """

    def __init__(self, *, tp: int = 0, fp: int = 0, fn: int = 0) -> None:
        self.tp, self.fp, self.fn = tp, fp, fn

    def __len__(self) -> int:
        """Total number of counted items (tp + fp + fn)."""
        total = self.tp + self.fp + self.fn
        return total

    def __iadd__(self, other):
        """Add the counts of ``other`` to this instance in place."""
        for field in ("tp", "fp", "fn"):
            setattr(self, field, getattr(self, field) + getattr(other, field))
        return self

    def __add__(self, other):
        """Return a new score holding the summed counts of both operands."""
        combined = PRFScore(tp=self.tp, fp=self.fp, fn=self.fn)
        combined += other
        return combined

    def score_set(self, cand: set, gold: set) -> None:
        """Update the counts by comparing a candidate set against a gold set."""
        hits = cand.intersection(gold)
        spurious = cand - gold
        missed = gold - cand
        self.tp += len(hits)
        self.fp += len(spurious)
        self.fn += len(missed)

    @property
    def precision(self) -> float:
        denominator = self.tp + self.fp + 1e-100
        return self.tp / denominator

    @property
    def recall(self) -> float:
        denominator = self.tp + self.fn + 1e-100
        return self.tp / denominator

    @property
    def f1(self) -> float:
        p, r = self.precision, self.recall
        half = (p * r) / (p + r + 1e-100)
        return 2 * half

    @property
    def f5(self) -> float:
        """F-beta score with beta = 5 (recall weighted far above precision)."""
        beta_sq = 5 ** 2
        p, r = self.precision, self.recall
        return (1 + beta_sq) * p * r / (beta_sq * p + r + 1e-100)

    def to_dict(self) -> dict[str, float]:
        """Return precision, recall and F5 under the keys ``p``, ``r`` and ``f5``."""
        return dict(p=self.precision, r=self.recall, f5=self.f5)


class MetricsComputerV2:
    """``compute_metrics`` callable that scores token-level PII predictions.

    At construction time it builds the ground-truth table from the evaluation
    dataset and pre-computes regex-based EMAIL and PHONE_NUM predictions. When
    called with model outputs it maps sub-word predictions back to source tokens,
    merges them with the regex predictions and returns micro precision / recall /
    F5 plus the same three values per entity type.

    Args:
        eval_ds: Encoded evaluation dataset. It must contain the columns
            ``labels`` (dropped), ``provided_labels`` (renamed to ``labels``),
            ``document``, ``tokens``, ``full_text``, ``token_map`` and
            ``offset_mapping``.
        label2id: Mapping from label name to class id.
        conf_thresh: Threshold compared against the "O" class score; below it the
            best non-"O" class is chosen.
    """

    # Shared spaCy English pipeline; only its tokenizer is used.
    nlp = English()

    def __init__(self, eval_ds: Dataset, label2id: dict, conf_thresh: float = 0.9) -> None:
        self.ds = eval_ds.remove_columns("labels").rename_columns({"provided_labels": "labels"})
        self.gt_df = self.create_gt_df(self.ds)
        self.label2id = label2id
        # Inverse mapping derived from the instance's own label map instead of a module global.
        self.id2label = {idx: name for name, idx in label2id.items()}
        self.confth = conf_thresh
        self._search_gt()

    def __call__(self, eval_preds: EvalPrediction) -> dict:
        """Turn the model outputs into a prediction table and score it against the ground truth."""
        pred_df = self.create_pred_df(eval_preds.predictions)
        return self.compute_metrics_from_df(self.gt_df, pred_df)

    def _search_gt(self) -> None:
        """Collect regex-based EMAIL and PHONE_NUM predictions for every evaluation document.

        Results are stored in ``self.emails`` and ``self.phone_nums`` as lists of
        row dicts with the keys ``document``, ``token``, ``label`` and ``token_str``.
        """
        email_regex = re.compile(r'[\w.+-]+@[\w-]+\.[\w.-]+')
        phone_num_regex = re.compile(r"(\(\d{3}\)\d{3}\-\d{4}\w*|\d{3}\.\d{3}\.\d{4})\s")
        self.emails = []
        self.phone_nums = []

        for _data in self.ds:
            document, tokens = _data["document"], _data["tokens"]

            # email: a token that is entirely an e-mail address
            for token_idx, token in enumerate(tokens):
                if email_regex.fullmatch(token) is not None:
                    self.emails.append(
                        {"document": document, "token": token_idx, "label": "B-EMAIL", "token_str": token}
                    )

            # phone number: every regex match is re-tokenized and located in the
            # token list. The span loop is nested inside the match loop so that
            # all matches of a document contribute, not only the last one.
            seen_tokens = set()
            for match in phone_num_regex.findall(_data["full_text"]):
                target = [t.text for t in self.nlp.tokenizer(match)]
                for matched_span in find_span(target, tokens):
                    for intermediate, token_idx in enumerate(matched_span):
                        # The same number may be matched several times in the text;
                        # emit each token only once.
                        if token_idx in seen_tokens:
                            continue
                        seen_tokens.add(token_idx)
                        prefix = "I" if intermediate else "B"
                        self.phone_nums.append(
                            {"document": document, "token": token_idx, "label": f"{prefix}-PHONE_NUM", "token_str": tokens[token_idx]}
                        )

    @staticmethod
    def create_gt_df(ds: Dataset):
        """Build the ground-truth table: one row per token whose label is not "O"."""
        gt = []
        for row in ds:
            for token_idx, (token, label) in enumerate(zip(row["tokens"], row["labels"])):
                if label == "O":
                    continue
                gt.append(
                    {"document": row["document"], "token": token_idx, "label": label, "token_str": token}
                )
        gt_df = pd.DataFrame(gt)
        gt_df["row_id"] = gt_df.index

        return gt_df

    def create_pred_df(self, prediction: np.ndarray) -> pd.DataFrame:
        """Convert per-token class scores into a table of token-level predictions.

        Args:
            prediction: Array indexed as ``[document, position, class]``.

        Returns:
            DataFrame with the model predictions (excluding "O", EMAIL and
            PHONE_NUM labels) followed by the regex-based e-mail and phone rows,
            plus a ``row_id`` column.
        """
        o_index = self.label2id["O"]
        preds = prediction.argmax(-1)
        # The slice [:o_index] only covers "every class but O" when "O" has the highest id.
        preds_without_o = prediction[:, :, :o_index].argmax(-1)
        o_preds = prediction[:, :, o_index]
        # NOTE(review): if `prediction` holds raw logits rather than probabilities,
        # the threshold is not a probability cut-off -- confirm this is intended.
        preds_final = np.where(o_preds < self.confth, preds_without_o, preds)

        # "O" is not a prediction; e-mail and phone labels come from the regexes instead.
        skipped_labels = ("O", "B-EMAIL", "B-PHONE_NUM", "I-PHONE_NUM")
        pairs = set()
        processed = []

        # Iterate over documents
        for p_doc, token_map, offsets, tokens, doc in zip(
            preds_final, self.ds["token_map"], self.ds["offset_mapping"], self.ds["tokens"], self.ds["document"]
        ):
            # Iterate over the sequence (zip stops at the unpadded length)
            for p_token, (start_idx, end_idx) in zip(p_doc, offsets):
                label_pred = self.id2label[p_token]

                if start_idx + end_idx == 0:
                    # (0, 0) offset: [CLS] token i.e. BOS
                    continue

                if token_map[start_idx] == -1:
                    start_idx += 1

                # ignore "\n\n"
                while start_idx < len(token_map) and tokens[token_map[start_idx]].isspace():
                    start_idx += 1

                if start_idx >= len(token_map):
                    break

                token_id = token_map[start_idx]
                pair = (doc, token_id)

                if label_pred in skipped_labels or token_id == -1:
                    continue

                # Keep only the first prediction for each source token.
                if pair in pairs:
                    continue

                processed.append(
                    {"document": doc, "token": token_id, "label": label_pred, "token_str": tokens[token_id]}
                )
                pairs.add(pair)

        pred_df = pd.DataFrame(processed + self.emails + self.phone_nums)
        pred_df["row_id"] = list(range(len(pred_df)))

        return pred_df

    def compute_metrics_from_df(self, gt_df, pred_df):
        """
        Compute the LB metric (lb) and other auxiliary metrics

        Rows are compared as exact ``(document, token, label)`` triples. Counts are
        kept per entity type (label without its B-/I- prefix) and summed for the
        overall precision, recall and F5.
        """
        references = {(row.document, row.token, row.label) for row in gt_df.itertuples()}
        predictions = {(row.document, row.token, row.label) for row in pred_df.itertuples()}

        def entity_type(label):
            # avoid B- and I- prefix
            return label if label == 'O' else label[2:]

        score_per_type = defaultdict(PRFScore)

        for ex in predictions:
            if ex in references:
                score_per_type[entity_type(ex[-1])].tp += 1
            else:
                score_per_type[entity_type(ex[-1])].fp += 1

        # Whatever was never predicted is a false negative.
        for _doc, _tok, ref_label in references - predictions:
            score_per_type[entity_type(ref_label)].fn += 1

        totals = PRFScore()

        for prf in score_per_type.values():
            totals += prf

        # Sorted so that the order of the reported keys is stable between runs.
        entity_types = sorted({l[2:] for l in self.label2id.keys() if l != 'O'})

        return {
            "precision": totals.precision,
            "recall": totals.recall,
            "f5": totals.f5,
            **{
                f"{v_k}-{k}": v_v
                for k in entity_types
                for v_k, v_v in score_per_type[k].to_dict().items()
            },
        }

## Model

In [13]:
class ModelInit:
    """Factory handed to the Trainer as ``model_init``.

    The model is loaded once, optionally with frozen embeddings and frozen
    leading encoder layers, and a deep copy of its initial state dict is kept.
    Every call restores that state dict and returns the same model object.
    """

    model_class = BigBirdForTokenClassification

    def __init__(
        self,
        checkpoint: str,
        id2label: dict,
        label2id: dict,
        freeze_embedding: bool,
        freeze_layers: int,
    ) -> None:
        model = self.model_class.from_pretrained(
            checkpoint,
            num_labels=len(id2label),
            id2label=id2label,
            label2id=label2id,
            ignore_mismatched_sizes=True
        )
        self.model = model
        backbone = model.base_model

        # Embedding parameters are trainable unless freezing was requested.
        embeddings_trainable = not freeze_embedding
        for param in backbone.embeddings.parameters():
            param.requires_grad = embeddings_trainable

        # Freeze the first `freeze_layers` encoder layers.
        frozen_layers = backbone.encoder.layer[:freeze_layers]
        for param in (p for layer in frozen_layers for p in layer.parameters()):
            param.requires_grad = False

        # Snapshot of the initial weights, restored on every call.
        self.weight = copy.deepcopy(model.state_dict())

    def __call__(self) -> model_class:
        """Restore the initial weights and return the model."""
        self.model.load_state_dict(self.weight)
        return self.model

# Load the checkpoint once. The resulting callable is passed to the Trainer as
# `model_init`; each call reloads the initial weights into the same model object.
model_init = ModelInit(
    TRAINING_MODEL_PATH, 
    id2label=id2label, 
    label2id=label2id, 
    freeze_embedding=FREEZE_EMBEDDING, 
    freeze_layers=FREEZE_LAYERS,
)

Some weights of the model checkpoint at google/bigbird-roberta-base were not used when initializing BigBirdForTokenClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BigBirdForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BigBirdForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BigBirdForTokenClassification were no

## Split

In [14]:
# Split according to document id: a document belongs to the evaluation part of
# fold `s` when its id modulo N_SPLITS equals `s`.
doc_fold = [int(d) % N_SPLITS for d in ds["original"]["document"]]


def _rows_where(keep):
    """Row indices (as an array) of the original documents whose fold satisfies `keep`."""
    return np.array([row for row, fold in enumerate(doc_fold) if keep(fold)])


# One (train indices, eval indices) pair per fold.
folds = [
    (
        _rows_where(lambda fold, s=s: fold != s),
        _rows_where(lambda fold, s=s: fold == s),
    )
    for s in range(N_SPLITS)
]

# Rows of the original data to leave out of training.
exclude_indices = []
if FILTER_ORIGINAL:
    # Documents in which every token is labelled "O".
    negative_idxs = [
        row
        for row, doc_labels in enumerate(ds["original"]["provided_labels"])
        if not any(np.array(doc_labels) != "O")
    ]
    # Keep the first third of them; exclude the rest.
    kept_count = len(negative_idxs)//3
    exclude_indices = negative_idxs[kept_count:]

## Train

In [None]:
# Set for O(1) membership tests while filtering the training rows of each fold.
excluded_rows = set(exclude_indices)

for fold_idx, (train_idx, eval_idx) in enumerate(folds):
    args.run_name = f"fold-{fold_idx}"
    args.output_dir = os.path.join(OUTPUT_DIR, f"fold_{fold_idx}")
    result_path = Path(args.output_dir).joinpath("eval_result.json")
    # A fold that already has its result file is considered done (allows resuming).
    if result_path.exists():
        continue

    # Training data: the fold's original documents minus the excluded ones, plus all external data.
    original_ds = ds["original"].select([i for i in train_idx if i not in excluded_rows])
    train_ds = concatenate_datasets([original_ds, ds["extra"]])
    train_ds = train_ds.map(train_encoder, num_proc=os.cpu_count())
    # Evaluation data: the held-out original documents only.
    eval_ds = ds["original"].select(eval_idx)
    eval_ds = eval_ds.map(eval_encoder, num_proc=os.cpu_count())

    trainer = Trainer(
        args=args,
        model_init=model_init,
        train_dataset=train_ds,
        eval_dataset=eval_ds,
        tokenizer=tokenizer,
        # Pass the configured threshold explicitly instead of relying on the class default.
        compute_metrics=MetricsComputerV2(eval_ds=eval_ds, label2id=label2id, conf_thresh=CONF_THRESH),
        data_collator=DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=512),
    )
    trainer.train()
    eval_res = trainer.evaluate(eval_dataset=eval_ds)

    # Make sure the fold directory exists even if no checkpoint was written.
    result_path.parent.mkdir(parents=True, exist_ok=True)
    with result_path.open("w") as f:
        json.dump(eval_res, f)

    # Release the trainer and cached GPU memory before the next fold, and close the W&B run.
    del trainer
    gc.collect()
    torch.cuda.empty_cache()
    wandb.finish()

         

#0:   0%|          | 0/502 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/502 [00:00<?, ?ex/s]

 

#2:   0%|          | 0/502 [00:00<?, ?ex/s]

 

#3:   0%|          | 0/502 [00:00<?, ?ex/s]

 

#4:   0%|          | 0/502 [00:00<?, ?ex/s]

 

#5:   0%|          | 0/502 [00:00<?, ?ex/s]

 

#6:   0%|          | 0/502 [00:00<?, ?ex/s]

 

#7:   0%|          | 0/501 [00:00<?, ?ex/s]

         

#0:   0%|          | 0/213 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/213 [00:00<?, ?ex/s]

 

#2:   0%|          | 0/212 [00:00<?, ?ex/s]

 

#3:   0%|          | 0/212 [00:00<?, ?ex/s]

 

#4:   0%|          | 0/212 [00:00<?, ?ex/s]

 

#5:   0%|          | 0/212 [00:00<?, ?ex/s]

 

#6:   0%|          | 0/212 [00:00<?, ?ex/s]

 

#7:   0%|          | 0/212 [00:00<?, ?ex/s]

Using cuda_amp half precision backend
The following columns in the training set don't have a corresponding argument in `BigBirdForTokenClassification.forward` and have been ignored: full_text, token_map, length, document, tokens, provided_labels, offset_mapping, trailing_whitespace. If full_text, token_map, length, document, tokens, provided_labels, offset_mapping, trailing_whitespace are not expected by `BigBirdForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 4015
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 2
  Total optimization steps = 753
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01666974149023493, max=1.0)…

  torch.arange(indices.shape[0] * indices.shape[1] * num_indices_to_gather, device=indices.device)


Step,Training Loss,Validation Loss,Precision,Recall,F5,P-id Num,R-id Num,F5-id Num,P-street Address,R-street Address,F5-street Address,P-username,R-username,F5-username,P-email,R-email,F5-email,P-phone Num,R-phone Num,F5-phone Num,P-name Student,R-name Student,F5-name Student,P-url Personal,R-url Personal,F5-url Personal
50,0.1584,0.025468,0.888889,0.012158,0.012637,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100,0.0452,0.007559,0.403433,0.142857,0.146496,0.5,0.076923,0.079511,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.530769,0.115385,0.118966,0.223881,0.6,0.563584
150,0.0226,0.005239,0.341487,0.530395,0.519345,1.0,0.615385,0.624625,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.331176,0.513378,0.50274,0.45,0.72,0.703759
200,0.0186,0.00423,0.486486,0.683891,0.673381,1.0,0.461538,0.471299,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.473928,0.683946,0.672485,0.538462,0.84,0.822289
250,0.0128,0.003136,0.647462,0.717325,0.714361,0.525,0.807692,0.791304,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.6768,0.707358,0.706132,0.714286,0.8,0.796325
300,0.0128,0.003752,0.488605,0.81459,0.79421,0.851852,0.884615,0.883309,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.471899,0.814381,0.792266,0.818182,0.72,0.723338
350,0.0086,0.003,0.664103,0.787234,0.78166,0.923077,0.923077,0.923077,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.664275,0.774247,0.769349,0.560976,0.92,0.897898
400,0.0089,0.00268,0.714689,0.768997,0.766756,0.916667,0.846154,0.848665,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.704615,0.765886,0.763333,0.782609,0.72,0.722222
450,0.0076,0.003023,0.594156,0.834347,0.821572,0.888889,0.923077,0.921713,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.582249,0.822742,0.809877,0.657895,1.0,0.980392
500,0.007,0.002862,0.609183,0.826748,0.815545,0.884615,0.884615,0.884615,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.600985,0.816054,0.804974,0.581395,1.0,0.973054


The following columns in the evaluation set don't have a corresponding argument in `BigBirdForTokenClassification.forward` and have been ignored: full_text, token_map, length, document, tokens, provided_labels, offset_mapping, trailing_whitespace. If full_text, token_map, length, document, tokens, provided_labels, offset_mapping, trailing_whitespace are not expected by `BigBirdForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1698
  Batch size = 8
Saving model checkpoint to output/fold_0/checkpoint-50
Configuration saved in output/fold_0/checkpoint-50/config.json
Model weights saved in output/fold_0/checkpoint-50/pytorch_model.bin
tokenizer config file saved in output/fold_0/checkpoint-50/tokenizer_config.json
Special tokens file saved in output/fold_0/checkpoint-50/special_tokens_map.json
Deleting older checkpoint [output/fold_0/checkpoint-100] due to args.save_total_limit
  torch.arange(indices.shape[0] * indices.sha

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/f5,▁▂▅▇▇██▇████████
eval/f5-EMAIL,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/f5-ID_NUM,▁▂▆▅▇██▇████████
eval/f5-NAME_STUDENT,▁▂▅▇▇███████████
eval/f5-PHONE_NUM,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/f5-STREET_ADDRESS,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/f5-URL_PERSONAL,▁▅▆▇▇▆▇▆█████▇▇▇
eval/f5-USERNAME,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/loss,█▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁
eval/p-EMAIL,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
eval/f5,0.82939
eval/f5-EMAIL,1.0
eval/f5-ID_NUM,0.90304
eval/f5-NAME_STUDENT,0.82202
eval/f5-PHONE_NUM,0.0
eval/f5-STREET_ADDRESS,0.0
eval/f5-URL_PERSONAL,0.91437
eval/f5-USERNAME,0.0
eval/loss,0.00275
eval/p-EMAIL,1.0


         

#0:   0%|          | 0/503 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/502 [00:00<?, ?ex/s]

 

#2:   0%|          | 0/502 [00:00<?, ?ex/s]

 

#3:   0%|          | 0/502 [00:00<?, ?ex/s]

 

#4:   0%|          | 0/502 [00:00<?, ?ex/s]

 

#5:   0%|          | 0/502 [00:00<?, ?ex/s]

 

#6:   0%|          | 0/502 [00:00<?, ?ex/s]

 

#7:   0%|          | 0/502 [00:00<?, ?ex/s]

         

#0:   0%|          | 0/215 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/215 [00:00<?, ?ex/s]

 

#2:   0%|          | 0/214 [00:00<?, ?ex/s]

 

#3:   0%|          | 0/214 [00:00<?, ?ex/s]

 

#4:   0%|          | 0/214 [00:00<?, ?ex/s]

 

#5:   0%|          | 0/214 [00:00<?, ?ex/s]

 

#6:   0%|          | 0/214 [00:00<?, ?ex/s]

 

#7:   0%|          | 0/214 [00:00<?, ?ex/s]

Using cuda_amp half precision backend
The following columns in the training set don't have a corresponding argument in `BigBirdForTokenClassification.forward` and have been ignored: full_text, token_map, length, document, tokens, provided_labels, offset_mapping, trailing_whitespace. If full_text, token_map, length, document, tokens, provided_labels, offset_mapping, trailing_whitespace are not expected by `BigBirdForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 4017
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 2
  Total optimization steps = 753
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01666905063514908, max=1.0)…

  torch.arange(indices.shape[0] * indices.shape[1] * num_indices_to_gather, device=indices.device)


Step,Training Loss,Validation Loss,Precision,Recall,F5,P-id Num,R-id Num,F5-id Num,P-street Address,R-street Address,F5-street Address,P-username,R-username,F5-username,P-email,R-email,F5-email,P-phone Num,R-phone Num,F5-phone Num,P-name Student,R-name Student,F5-name Student,P-url Personal,R-url Personal,F5-url Personal
50,0.1355,0.025666,1.0,0.03022,0.031391,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
100,0.0465,0.008468,0.365019,0.263736,0.266581,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.397436,0.239938,0.243652,0.15,0.454545,0.421622
150,0.0292,0.00515,0.390456,0.494505,0.489489,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.366548,0.478328,0.472783,0.557692,0.878788,0.859749
200,0.0197,0.004404,0.566248,0.557692,0.558017,0.522727,0.851852,0.831711,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.617978,0.510836,0.514265,0.534483,0.939394,0.912797
250,0.015,0.003787,0.557328,0.767857,0.756861,0.8,0.444444,0.452174,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.54013,0.770898,0.758435,0.627907,0.818182,0.808756
300,0.0102,0.003697,0.55912,0.837912,0.822145,0.75,0.888889,0.882603,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.544057,0.821981,0.806143,0.540984,1.0,0.968397
350,0.0113,0.003054,0.594727,0.836538,0.823658,0.621622,0.851852,0.839888,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.582155,0.828173,0.814928,0.630435,0.878788,0.865672
400,0.0073,0.002768,0.62845,0.813187,0.804096,0.483871,0.555556,0.552408,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.623529,0.820433,0.810588,0.641026,0.757576,0.752315
450,0.0061,0.002818,0.651397,0.800824,0.79382,0.638889,0.851852,0.841069,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.642494,0.781734,0.775272,0.647059,1.0,0.979452
500,0.0055,0.002869,0.627033,0.847527,0.836218,0.545455,0.666667,0.661017,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.617813,0.848297,0.836297,0.690476,0.878788,0.869666


The following columns in the evaluation set don't have a corresponding argument in `BigBirdForTokenClassification.forward` and have been ignored: full_text, token_map, length, document, tokens, provided_labels, offset_mapping, trailing_whitespace. If full_text, token_map, length, document, tokens, provided_labels, offset_mapping, trailing_whitespace are not expected by `BigBirdForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1714
  Batch size = 8
Saving model checkpoint to output/fold_1/checkpoint-50
Configuration saved in output/fold_1/checkpoint-50/config.json
Model weights saved in output/fold_1/checkpoint-50/pytorch_model.bin
tokenizer config file saved in output/fold_1/checkpoint-50/tokenizer_config.json
Special tokens file saved in output/fold_1/checkpoint-50/special_tokens_map.json
  torch.arange(indices.shape[0] * indices.shape[1] * num_indices_to_gather, device=indices.device)
The following columns in the eva

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/f5,▁▃▅▅▇███▇███████
eval/f5-EMAIL,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/f5-ID_NUM,▁▁▁▇▄█▇▅▇▆█▆█▆▆█
eval/f5-NAME_STUDENT,▁▃▅▅▇███▇███████
eval/f5-PHONE_NUM,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/f5-STREET_ADDRESS,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/f5-URL_PERSONAL,▁▄▇█▇█▇▆█▇██████
eval/f5-USERNAME,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/loss,█▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁
eval/p-EMAIL,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
eval/f5,0.86099
eval/f5-EMAIL,1.0
eval/f5-ID_NUM,0.89779
eval/f5-NAME_STUDENT,0.84933
eval/f5-PHONE_NUM,1.0
eval/f5-STREET_ADDRESS,0.0
eval/f5-URL_PERSONAL,0.97389
eval/f5-USERNAME,0.0
eval/loss,0.00292
eval/p-EMAIL,1.0


         

#0:   0%|          | 0/505 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/505 [00:00<?, ?ex/s]

 

#2:   0%|          | 0/505 [00:00<?, ?ex/s]

 

#3:   0%|          | 0/505 [00:00<?, ?ex/s]

 

#4:   0%|          | 0/505 [00:00<?, ?ex/s]

 

#5:   0%|          | 0/505 [00:00<?, ?ex/s]

 

#6:   0%|          | 0/505 [00:00<?, ?ex/s]

 

#7:   0%|          | 0/505 [00:00<?, ?ex/s]

         

#0:   0%|          | 0/212 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/211 [00:00<?, ?ex/s]

 

#2:   0%|          | 0/211 [00:00<?, ?ex/s]

 

#3:   0%|          | 0/211 [00:00<?, ?ex/s]

 

#4:   0%|          | 0/211 [00:00<?, ?ex/s]

 

#5:   0%|          | 0/211 [00:00<?, ?ex/s]

 

#6:   0%|          | 0/211 [00:00<?, ?ex/s]

 

#7:   0%|          | 0/211 [00:00<?, ?ex/s]

Using cuda_amp half precision backend
The following columns in the training set don't have a corresponding argument in `BigBirdForTokenClassification.forward` and have been ignored: full_text, token_map, length, document, tokens, provided_labels, offset_mapping, trailing_whitespace. If full_text, token_map, length, document, tokens, provided_labels, offset_mapping, trailing_whitespace are not expected by `BigBirdForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 4040
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 2
  Total optimization steps = 756
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


  torch.arange(indices.shape[0] * indices.shape[1] * num_indices_to_gather, device=indices.device)


Step,Training Loss,Validation Loss,Precision,Recall,F5,P-id Num,R-id Num,F5-id Num,P-street Address,R-street Address,F5-street Address,P-username,R-username,F5-username,P-email,R-email,F5-email,P-phone Num,R-phone Num,F5-phone Num,P-name Student,R-name Student,F5-name Student,P-url Personal,R-url Personal,F5-url Personal
50,0.1403,0.02762,0.9,0.012195,0.012676,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.888889,1.0,0.995215,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
100,0.0561,0.007651,0.395664,0.197832,0.201711,0.0,0.0,0.0,0.333333,0.318182,0.318739,0.0,0.0,0.0,0.888889,1.0,0.995215,1.0,1.0,1.0,0.454167,0.164902,0.169043,0.221053,0.75,0.686792
150,0.0249,0.00575,0.488304,0.452575,0.453852,1.0,0.714286,0.722222,1.0,0.727273,0.734982,0.0,0.0,0.0,0.888889,1.0,0.995215,1.0,1.0,1.0,0.46118,0.449319,0.449764,0.5,0.071429,0.073864
200,0.0262,0.005325,0.420408,0.697832,0.680559,1.0,0.142857,0.147727,0.894737,0.772727,0.776801,0.0,0.0,0.0,0.888889,1.0,0.995215,1.0,1.0,1.0,0.404782,0.717095,0.696429,0.565217,0.464286,0.467497
250,0.0148,0.004143,0.552135,0.753388,0.742972,1.0,0.357143,0.366197,1.0,0.818182,0.823944,0.5,0.25,0.254902,0.888889,1.0,0.995215,1.0,1.0,1.0,0.537155,0.765507,0.753192,0.566667,0.607143,0.605479
300,0.0104,0.003055,0.648276,0.764228,0.759006,1.0,0.928571,0.931129,1.0,0.818182,0.823944,1.0,0.25,0.257426,0.888889,1.0,0.995215,1.0,1.0,1.0,0.634395,0.753404,0.748007,0.581395,0.892857,0.874832
350,0.0087,0.002991,0.652893,0.749322,0.74509,1.0,0.714286,0.722222,1.0,0.909091,0.912281,0.5,0.25,0.254902,0.888889,1.0,0.995215,1.0,1.0,1.0,0.650602,0.73525,0.731589,0.465517,0.964286,0.926121
400,0.0096,0.003532,0.649103,0.784553,0.778306,1.0,1.0,1.0,1.0,0.909091,0.912281,0.333333,0.25,0.252427,0.888889,1.0,0.995215,1.0,1.0,1.0,0.656331,0.768533,0.763512,0.380282,0.964286,0.910506
450,0.0104,0.00297,0.593463,0.787263,0.777498,1.0,0.714286,0.722222,1.0,0.863636,0.86819,1.0,0.25,0.257426,0.888889,1.0,0.995215,1.0,1.0,1.0,0.582579,0.779123,0.769142,0.490909,0.964286,0.929801
500,0.0053,0.002725,0.67497,0.771003,0.766807,1.0,1.0,1.0,1.0,0.909091,0.912281,1.0,0.25,0.257426,0.888889,1.0,0.995215,1.0,1.0,1.0,0.666667,0.753404,0.749653,0.529412,0.964286,0.934754


The following columns in the evaluation set don't have a corresponding argument in `BigBirdForTokenClassification.forward` and have been ignored: full_text, token_map, length, document, tokens, provided_labels, offset_mapping, trailing_whitespace. If full_text, token_map, length, document, tokens, provided_labels, offset_mapping, trailing_whitespace are not expected by `BigBirdForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1689
  Batch size = 8
Saving model checkpoint to output/fold_2/checkpoint-50
Configuration saved in output/fold_2/checkpoint-50/config.json
Model weights saved in output/fold_2/checkpoint-50/pytorch_model.bin
tokenizer config file saved in output/fold_2/checkpoint-50/tokenizer_config.json
Special tokens file saved in output/fold_2/checkpoint-50/special_tokens_map.json
  torch.arange(indices.shape[0] * indices.shape[1] * num_indices_to_gather, device=indices.device)
The following columns in the eva

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/f5,▁▃▅▇▇█▇█████████
eval/f5-EMAIL,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/f5-ID_NUM,▁▁▆▂▄█▆█▆█▄█████
eval/f5-NAME_STUDENT,▁▂▅▇██▇█████████
eval/f5-PHONE_NUM,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/f5-STREET_ADDRESS,▁▃▇▇▇▇████▇█████
eval/f5-URL_PERSONAL,▁▆▂▅▆███████████
eval/f5-USERNAME,▁▁▁▁▅▅▅▄▅▅███▄▅▄
eval/loss,█▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁
eval/p-EMAIL,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
eval/f5,0.80648
eval/f5-EMAIL,0.99522
eval/f5-ID_NUM,1.0
eval/f5-NAME_STUDENT,0.79417
eval/f5-PHONE_NUM,1.0
eval/f5-STREET_ADDRESS,0.91228
eval/f5-URL_PERSONAL,0.93475
eval/f5-USERNAME,0.25243
eval/loss,0.00272
eval/p-EMAIL,0.88889


         

#0:   0%|          | 0/504 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/503 [00:00<?, ?ex/s]

 

#2:   0%|          | 0/503 [00:00<?, ?ex/s]

 

#3:   0%|          | 0/503 [00:00<?, ?ex/s]

 

#4:   0%|          | 0/503 [00:00<?, ?ex/s]

 

#5:   0%|          | 0/503 [00:00<?, ?ex/s]

 

#6:   0%|          | 0/503 [00:00<?, ?ex/s]

 

#7:   0%|          | 0/503 [00:00<?, ?ex/s]

         

#0:   0%|          | 0/214 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/214 [00:00<?, ?ex/s]

 

#2:   0%|          | 0/213 [00:00<?, ?ex/s]

 

#3:   0%|          | 0/213 [00:00<?, ?ex/s]

 

#4:   0%|          | 0/213 [00:00<?, ?ex/s]

 

#5:   0%|          | 0/213 [00:00<?, ?ex/s]

 

#6:   0%|          | 0/213 [00:00<?, ?ex/s]

 

#7:   0%|          | 0/213 [00:00<?, ?ex/s]

Using cuda_amp half precision backend
The following columns in the training set don't have a corresponding argument in `BigBirdForTokenClassification.forward` and have been ignored: full_text, token_map, length, document, tokens, provided_labels, offset_mapping, trailing_whitespace. If full_text, token_map, length, document, tokens, provided_labels, offset_mapping, trailing_whitespace are not expected by `BigBirdForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 4025
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 2
  Total optimization steps = 756
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


  torch.arange(indices.shape[0] * indices.shape[1] * num_indices_to_gather, device=indices.device)


Step,Training Loss,Validation Loss,Precision,Recall,F5,P-id Num,R-id Num,F5-id Num,P-street Address,R-street Address,F5-street Address,P-username,R-username,F5-username,P-email,R-email,F5-email,P-phone Num,R-phone Num,F5-phone Num,P-name Student,R-name Student,F5-name Student,P-url Personal,R-url Personal,F5-url Personal
50,0.1437,0.019536,1.0,0.034146,0.035464,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
100,0.0516,0.009021,0.268262,0.346341,0.342507,0.454545,0.416667,0.418006,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.286624,0.323741,0.322137,0.06087,0.28,0.245946
150,0.0326,0.005489,0.361019,0.530081,0.520703,1.0,0.833333,0.83871,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.343902,0.507194,0.498098,0.464286,0.52,0.517611
200,0.0188,0.00402,0.583673,0.697561,0.692365,1.0,0.166667,0.172185,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.572674,0.708633,0.702221,0.5,0.48,0.48074
250,0.0113,0.004076,0.548712,0.796748,0.783133,0.916667,0.916667,0.916667,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.571056,0.78777,0.776437,0.222222,0.8,0.727273
300,0.0089,0.003986,0.504892,0.839024,0.818198,0.916667,0.916667,0.916667,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.496809,0.839928,0.818194,0.425,0.68,0.664662
350,0.0096,0.00305,0.653439,0.803252,0.796231,0.785714,0.916667,0.910828,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.653959,0.802158,0.795227,0.571429,0.64,0.63706
400,0.0087,0.003285,0.581315,0.819512,0.806797,0.578947,0.916667,0.896552,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.597113,0.818345,0.806848,0.354167,0.68,0.656761
450,0.0062,0.003102,0.559742,0.845528,0.829244,0.785714,0.916667,0.910828,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.55608,0.847122,0.830406,0.435897,0.68,0.665663
500,0.0068,0.002512,0.756672,0.78374,0.782663,0.846154,0.916667,0.913738,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.753043,0.778777,0.777755,0.607143,0.68,0.676876


The following columns in the evaluation set don't have a corresponding argument in `BigBirdForTokenClassification.forward` and have been ignored: full_text, token_map, length, document, tokens, provided_labels, offset_mapping, trailing_whitespace. If full_text, token_map, length, document, tokens, provided_labels, offset_mapping, trailing_whitespace are not expected by `BigBirdForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1706
  Batch size = 8
Saving model checkpoint to output/fold_3/checkpoint-50
Configuration saved in output/fold_3/checkpoint-50/config.json
Model weights saved in output/fold_3/checkpoint-50/pytorch_model.bin
tokenizer config file saved in output/fold_3/checkpoint-50/tokenizer_config.json
Special tokens file saved in output/fold_3/checkpoint-50/special_tokens_map.json
  torch.arange(indices.shape[0] * indices.shape[1] * num_indices_to_gather, device=indices.device)
The following columns in the eva

## Log CV

In [None]:
# Collect each fold's evaluation metrics and log them, together with their
# cross-fold mean, as a single W&B table under a run named "cv".
# Every `fold*/eval_result.json` found under OUTPUT_DIR is read.
results = dict()
# `Path.glob` yields entries in arbitrary order; sort so table rows come out
# in a stable fold order from run to run.
for res_json_path in sorted(Path(OUTPUT_DIR).glob("fold*/eval_result.json")):
    # Directory names look like "fold_<k>"; keep only the "<k>" part as the row label.
    fold = res_json_path.parent.name.split("_")[-1]
    with open(res_json_path, "r") as fp:
        res = json.load(fp)
    results[fold] = {k.replace("eval_", ""): v for k, v in res.items()}

# Fail early with a readable message (and before a W&B run is opened) instead
# of a bare KeyError on a hard-coded fold id.
if not results:
    raise FileNotFoundError(
        f"No fold*/eval_result.json files found under {OUTPUT_DIR!r}; nothing to log."
    )

# Use the first available fold as the metric schema rather than assuming that
# fold "0" exists. A fold missing one of these metrics still raises KeyError.
metric_keys = list(next(iter(results.values())).keys())

# The mean is computed over the per-fold entries only: the comprehension is
# evaluated before the "cv" row is inserted into `results`.
results["cv"] = {key: np.mean([r[key] for r in results.values()]) for key in metric_keys}

wandb.init(name="cv")
table = wandb.Table(columns=["fold"] + metric_keys)
for fold_name, fold_metrics in results.items():
    table.add_data(fold_name, *[fold_metrics[c] for c in metric_keys])
wandb.log({"eval_result": table})
wandb.finish()