In [4]:
%env WANDB_PROJECT=PII
%env WANDB_LOG_MODEL=end

env: WANDB_PROJECT=PII
env: WANDB_LOG_MODEL=end


In [5]:
!pip install -q seqeval evaluate

[0m

In [6]:
import json
import copy
import gc
import os
from pathlib import Path

import numpy as np
from transformers.tokenization_utils import PreTrainedTokenizerBase
from transformers.models.bert import BertForTokenClassification, BertTokenizerFast
from transformers.trainer import Trainer
from transformers.training_args import TrainingArguments
from transformers.trainer_utils import EvalPrediction
from transformers.data.data_collator import DataCollatorForTokenClassification
from sklearn.model_selection import KFold
from datasets import Dataset
from seqeval.metrics import recall_score, precision_score
import wandb

In [7]:
wandb.login(key="eff994fe72307679c21248b6e7859e26960b8db7")

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

## Config & Parameters

In [8]:
DATA_DIR = Path("../dataset/")
OUTPUT_DIR = "bert_checkpoints"

In [16]:
TRAINING_MODEL_PATH = "bert-base-cased"
TRAINING_MAX_LENGTH = 2048
TASK_LAYER_DROPOUT = 0.1
LR = 2e-5  # 1.5e-5 ~ 3e-5
LR_SCHEDULER_TYPE = "constant_with_warmup"
NUM_EPOCHS = 3
BATCH_SIZE = 8
GRAD_ACCUMULATION_STEPS = 8
WARMUP_RATIO = 0.1
WEIGHT_DECAY = 0.01
AMP = True

In [10]:
args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    fp16=AMP,
    learning_rate=LR,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUMULATION_STEPS,
    report_to="wandb",
    evaluation_strategy="steps",
    eval_steps=50,
    save_total_limit=1,
    save_strategy="steps",
    save_steps=50,
    logging_steps=10,
    lr_scheduler_type=LR_SCHEDULER_TYPE,
    metric_for_best_model="f5",
    greater_is_better=True,
    warmup_ratio=WARMUP_RATIO,
    weight_decay=WEIGHT_DECAY,
    load_best_model_at_end=True,
)

## Dataset Preparation

In [11]:
with DATA_DIR.joinpath("train.json").open("r") as f:
    original_data = json.load(f)

# downsampling of negative examples
p=[] # positive samples (contain relevant labels)
n=[] # negative samples (presumably contain entities that are possibly wrongly classified as entity)

for d in original_data:
    if any(np.array(d["labels"]) != "O"):
        p.append(d)
    else:
        n.append(d)

print("original datapoints: ", len(original_data))
print("positive datapoints:", len(p))
print("negative datapoints", len(n))

with DATA_DIR.joinpath("pii_dataset_fixed.json").open("r") as f:
    external_1 = json.load(f)
print("external_1 datapoints: ", len(external_1))

with DATA_DIR.joinpath("moredata_dataset_fixed.json").open("r") as f:
    external_2 = json.load(f)
print("external_2 datapoints: ", len(external_2))

data = external_1 + external_2 + p + n[:len(n)//3]
print("combined: ", len(data))

original datapoints:  6807
positive datapoints: 945
negative datapoints 5862
external_1 datapoints:  4434
external_2 datapoints:  2000
combined:  9333


In [12]:
all_labels = [
    'B-EMAIL', 'B-ID_NUM', 'B-NAME_STUDENT', 'B-PHONE_NUM', 'B-STREET_ADDRESS', 'B-URL_PERSONAL', 'B-USERNAME', 'I-ID_NUM', 'I-NAME_STUDENT', 'I-PHONE_NUM', 'I-STREET_ADDRESS', 'I-URL_PERSONAL', 'O'
]
id2label = {i: l for i, l in enumerate(all_labels)}
label2id = {v: k for k, v in id2label.items()}
target = [l for l in all_labels if l != "O"]

## Tokenization

In [13]:
class CustomTokenizer:
    def __init__(self, tokenizer: PreTrainedTokenizerBase, label2id: dict, max_length: int) -> None:
        self.tokenizer = tokenizer
        self.label2id = label2id
        self.max_length = max_length

    def __call__(self, example: dict) -> dict:
        # rebuild text from tokens
        text = []
        labels = []

        for t, l, ws in zip(
            example["tokens"], example["provided_labels"], example["trailing_whitespace"]
        ):
            text.append(t)
            labels.extend([l] * len(t))

            if ws:
                text.append(" ")
                labels.append("O")

        text = "".join(text)
        labels = np.array(labels)

        # actual tokenization
        tokenized = self.tokenizer(
            "".join(text),
            return_offsets_mapping=True,
            truncation=True,
            max_length=self.max_length
        )

        token_labels = []

        for start_idx, end_idx in tokenized.offset_mapping:
            # CLS token
            if start_idx == 0 and end_idx == 0:
                token_labels.append(self.label2id["O"])
                continue

            # case when token starts with whitespace
            if text[start_idx].isspace():
                start_idx += 1

            token_labels.append(self.label2id[labels[start_idx]])

        length = len(tokenized.input_ids)

        return {**tokenized, "labels": token_labels, "length": length}

## Instantiate the dataset

In [17]:
tokenizer = BertTokenizerFast.from_pretrained(TRAINING_MODEL_PATH)
encode = CustomTokenizer(tokenizer=tokenizer, label2id=label2id, max_length=TRAINING_MAX_LENGTH)

ds = Dataset.from_dict({
    "full_text": [x["full_text"] for x in data],
    "document": [str(x["document"]) for x in data],
    "tokens": [x["tokens"] for x in data],
    "trailing_whitespace": [x["trailing_whitespace"] for x in data],
    "provided_labels": [x["labels"] for x in data],
})
ds = ds.map(encode, num_proc=os.cpu_count())

Downloading tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/426k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

         

#0:   0%|          | 0/1167 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/1167 [00:00<?, ?ex/s]

 

#2:   0%|          | 0/1167 [00:00<?, ?ex/s]

 

#3:   0%|          | 0/1167 [00:00<?, ?ex/s]

 

#4:   0%|          | 0/1167 [00:00<?, ?ex/s]

 

#5:   0%|          | 0/1166 [00:00<?, ?ex/s]

 

#6:   0%|          | 0/1166 [00:00<?, ?ex/s]

 

#7:   0%|          | 0/1166 [00:00<?, ?ex/s]

## Metrics

In [18]:
class MetricsComputer:
    def __init__(self, all_labels: list[str], beta: float = 5.0) -> None:
        self.all_labels = all_labels
        self.beta = beta
        
    def __call__(self, preds: EvalPrediction) -> dict[str, float]:
        predictions, labels = preds
        predictions = np.argmax(predictions, axis=2)

        # Remove ignored index (special tokens)
        true_predictions = [
            [self.all_labels[p] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]
        true_labels = [
            [self.all_labels[l] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]

        recall = recall_score(true_labels, true_predictions)
        precision = precision_score(true_labels, true_predictions)
        f5_score = (1 + self.beta ** 2) * recall * precision / ((self.beta ** 2) * precision + recall)

        results = {
            'recall': recall,
            'precision': precision,
            'f5': f5_score
        }
        return results
    
compute_metrics = MetricsComputer(all_labels=all_labels)

## Model

In [21]:
class ModelInit:
    def __init__(
        self, checkpoint: str, id2label: dict, label2id: dict, drop_p: float = 0.1,
    ) -> None:
        self.model = BertForTokenClassification.from_pretrained(
            checkpoint,
            num_labels=len(id2label),
            id2label=id2label,
            label2id=label2id,
            ignore_mismatched_sizes="tiny-random" in TRAINING_MODEL_PATH
        )
        self.model.dropout.p = drop_p
        self.weight = copy.deepcopy(self.model.state_dict())

    def __call__(self) -> BertForTokenClassification:
        self.model.load_state_dict(self.weight)
        return self.model

model_init = ModelInit(TRAINING_MODEL_PATH, id2label=id2label, label2id=label2id)

Downloading pytorch_model.bin:   0%|          | 0.00/416M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

## Split

In [24]:
# random split
cv = KFold(n_splits=5, shuffle=True, random_state=42)
folds = list(cv.split(X=np.zeros(len(ds)), y=np.zeros(len(ds))))

## Train

In [25]:
for fold_idx, (train_idx, eval_idx) in enumerate(folds):
    args.run_name = f"{TRAINING_MODEL_PATH}-fold_{fold_idx}"
    args.output_dir = os.path.join(OUTPUT_DIR, f"fold_{fold_idx}")
    trainer = Trainer(
        args=args,
        model_init=model_init,
        train_dataset=ds.select(train_idx),
        eval_dataset=ds.select(eval_idx),
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        data_collator=DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16),
    )
    trainer.train()
    wandb.finish()
    del trainer
    gc.collect()

Using cuda_amp half precision backend
The following columns in the training set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: document, full_text, trailing_whitespace, provided_labels, tokens, length, offset_mapping. If document, full_text, trailing_whitespace, provided_labels, tokens, length, offset_mapping are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 7466
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 2
  Total optimization steps = 1401
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33memiz6413[0m. Use [1m`wandb login --relogin`[0m to force relogin


RuntimeError: The size of tensor a (768) must match the size of tensor b (512) at non-singleton dimension 1

In [None]:
args.evaluation_strategy = "no"
args.run_name = f"{TRAINING_MODEL_PATH}-all-data"
args.output_dir = os.path.join(OUTPUT_DIR, f"all_data")
trainer = Trainer(
    args=args,
    model_init=model_init,
    train_dataset=ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16),
)
trainer.train()
wandb.finish()