In [1]:
!pip install evaluate seqeval

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: http://nexus.charisma:5080/repository/pypi-proxy/simple


In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
import numpy as np
from transformers import DataCollatorForTokenClassification
import evaluate
import torch

In [3]:
seqeval = evaluate.load("seqeval")

In [16]:
dataset = load_dataset("ai4privacy/pii-masking-200k", split='train')

split_ds = dataset.train_test_split(test_size=0.2, seed=42, shuffle=True)
train_dataset = split_ds["train"]
temp_dataset = split_ds["test"]

val_test = temp_dataset.train_test_split(test_size=0.5, seed=42, shuffle=True)
val_dataset = val_test["train"]
test_dataset = val_test["test"]

print(f'Train: {len(train_dataset)}')
print(f'Val: {len(val_dataset)}')
print(f'Test: {len(test_dataset)}')

Train: 167408
Val: 20926
Test: 20927


In [17]:
all_labels = set()

def get_labels(item):
    global all_labels
    all_labels.update(set(item['mbert_bio_labels']))

dataset.map(get_labels, batched=False)

Map:   0%|          | 0/209261 [00:00<?, ? examples/s]

Dataset({
    features: ['source_text', 'target_text', 'privacy_mask', 'span_labels', 'mbert_text_tokens', 'mbert_bio_labels', 'id', 'language', 'set'],
    num_rows: 209261
})

In [18]:
id2label = {0: 'O'}
label2id = {'O': 0}

idx = 1
for label in all_labels:
    if label == 'O': continue
    id2label[idx] = label
    label2id[label] = idx

    idx += 1

In [20]:
def bio_labels2ner_tags(examples):
    global label2id

    converted_labels = list()

    for labels in examples["mbert_bio_labels"]:
        label_ids = [label2id[label] for label in labels]
        converted_labels.append(label_ids)

    return {'ner_tags': converted_labels}

train_dataset = train_dataset.map(bio_labels2ner_tags, batched=True)
test_dataset = test_dataset.map(bio_labels2ner_tags, batched=True)
val_dataset = val_dataset.map(bio_labels2ner_tags, batched=True)

Map:   0%|          | 0/167408 [00:00<?, ? examples/s]

Map:   0%|          | 0/20927 [00:00<?, ? examples/s]

Map:   0%|          | 0/20926 [00:00<?, ? examples/s]

In [21]:
model_name = "bert-base-multilingual-cased" #"distilbert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [22]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["mbert_text_tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [23]:
train_tok_ds = train_dataset.map(tokenize_and_align_labels, batched=True)
test_tok_ds = test_dataset.map(tokenize_and_align_labels, batched=True)
val_tok_ds = val_dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/167408 [00:00<?, ? examples/s]

Map:   0%|          | 0/20927 [00:00<?, ? examples/s]

Map:   0%|          | 0/20926 [00:00<?, ? examples/s]

In [24]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [25]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    true_labels = [
        [id2label[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [26]:
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
training_args = TrainingArguments(
    output_dir="ner_model",
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    dataloader_pin_memory=True,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tok_ds,
    eval_dataset=val_tok_ds,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [32]:
import gc
gc.collect()

torch.cuda.empty_cache()

In [None]:
model.train()
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0729,0.066952,0.920541,0.940512,0.93042,0.973209
2,0.0584,0.057867,0.933205,0.948414,0.940748,0.975726
3,0.0519,0.058533,0.934719,0.950734,0.942658,0.975014
4,0.0401,0.057277,0.943268,0.955914,0.949549,0.978382
5,0.0352,0.056284,0.947699,0.95995,0.953785,0.982665
6,0.0239,0.065754,0.946728,0.961821,0.954215,0.983214
7,0.0126,0.06964,0.952593,0.964156,0.958339,0.984526
8,0.0125,0.075214,0.955504,0.965965,0.960706,0.985508
9,0.0087,0.079035,0.958856,0.96796,0.963386,0.986131


In [19]:
example = test_tok_ds[0]

input_ids = torch.tensor(example['input_ids']).unsqueeze(0).cuda()
attention_mask = torch.tensor(example['attention_mask']).unsqueeze(0).cuda()
labels = torch.tensor(example['labels']).unsqueeze(0).cuda()

with torch.no_grad():
    outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
    logits = outputs.logits

predictions = logits.cpu().numpy()
labels_np = labels.cpu().numpy()

res = compute_metrics((predictions, labels_np))

In [20]:
res

{'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'accuracy': 1.0}

In [22]:
labels

tensor([[-100,    0,    0, -100, -100,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0, -100, -100, -100,    0, -100, -100,    0,
            0,    0,    0,    0, -100, -100, -100,    0,   55,   92,   92,   92,
            0,    0,    0, -100, -100,    0,   73,  108,   72,    2, -100, -100,
            2,    2, -100, -100,    0,    0,    0, -100, -100, -100,    0, -100,
         -100,    0,    0,    0, -100, -100, -100,    0, -100, -100,    0,    0,
            0,    0, -100, -100,    0, -100, -100, -100,    0, -100]],
       device='cuda:0')

In [21]:
labels_np

array([[-100,    0,    0, -100, -100,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0, -100, -100, -100,    0, -100,
        -100,    0,    0,    0,    0,    0, -100, -100, -100,    0,   55,
          92,   92,   92,    0,    0,    0, -100, -100,    0,   73,  108,
          72,    2, -100, -100,    2,    2, -100, -100,    0,    0,    0,
        -100, -100, -100,    0, -100, -100,    0,    0,    0, -100, -100,
        -100,    0, -100, -100,    0,    0,    0,    0, -100, -100,    0,
        -100, -100, -100,    0, -100]])