In [1]:
pip install torch

Note: you may need to restart the kernel to use updated packages.


In [2]:
import torch
torch.cuda.get_device_name()

'Tesla T4'

In [3]:
!pip install -U datasets
!pip install -U transformers
!pip install seqeval
!pip install evaluate
!pip install 'accelerate>=0.26.0'




# Importing Dependency

In [4]:
# Cell 1: Imports & Setup
import json
import time
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from collections import defaultdict
from datasets import load_dataset, DatasetDict

from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    pipeline
)
import evaluate


  from pandas.core.computation.check import NUMEXPR_INSTALLED


## 1. Shared Tokenization & Metric Logic


In [6]:
# Cell 3: Tokenize + Align Function
def tokenize_and_align(examples, tokenizer, label2id):
    texts = examples["source_text"]
    masks = examples["privacy_mask"]
    
    # tokenize returns offset_mapping for each example
    tokenized_inputs = tokenizer(
        texts,
        truncation=True,
        max_length=512,
        padding=False,
        return_offsets_mapping=True
    )

    all_labels = []
    for i, offsets in enumerate(tokenized_inputs.pop("offset_mapping")):
        text = texts[i]
        raw_spans = masks[i]

        # Reconstruct spans with start/end from each value
        spans = []
        for sp in raw_spans:
            val = sp["value"]
            start = text.find(val)
            if start == -1:
                # skip if not found
                continue
            spans.append({
                "label": sp["label"],
                "start": start,
                "end":   start + len(val)
            })

        # Align each token to a label
        label_seq = []
        for (tok_start, tok_end) in offsets:
            tag = "NONPII"
            for sp in spans:
                if tok_start >= sp["start"] and tok_end <= sp["end"]:
                    tag = sp["label"]
                    break
            # default to NONPII if unknown
            label_seq.append(label2id.get(tag, label2id["NONPII"]))
        all_labels.append(label_seq)

    tokenized_inputs["labels"] = all_labels
    return tokenized_inputs


# Cell 4: Metrics Factory (with accuracy + seqeval)
def compute_metrics_factory(id2label):
    seqeval    = evaluate.load("seqeval")
    acc_metric = evaluate.load("accuracy")

    def compute_metrics(p):
        preds  = np.argmax(p.predictions, axis=2)
        labels = p.label_ids

        flat_preds, flat_labels = [], []
        for pred_seq, lab_seq in zip(preds, labels):
            for p_i, l_i in zip(pred_seq, lab_seq):
                if l_i != -100:
                    flat_preds.append(p_i)
                    flat_labels.append(l_i)

        acc = acc_metric.compute(
            predictions=flat_preds,
            references=flat_labels
        )["accuracy"]

        str_preds  = [id2label[i] for i in flat_preds]
        str_labels = [id2label[i] for i in flat_labels]
        ner_results = seqeval.compute(
            predictions=[str_preds],
            references=[str_labels]
        )

        return {
            "accuracy":  acc,
            "precision": ner_results["overall_precision"],
            "recall":    ner_results["overall_recall"],
            "f1":        ner_results["overall_f1"]
        }

    return compute_metrics



## 2. Prepare Trainer per Scenario


In [7]:
# Cell 5: Trainer Preparation Function
def prepare_trainer_for_uids(
    json_path: str,
    uid_filter,
    checkpoint="iiiorg/piiranha-v1-detect-personal-information"
):

    
    # 1. Load as a single Dataset
    raw_ds = load_dataset("json", data_files=json_path, split="train")
    
    # 2. Parse `privacy_mask` strings into real lists
    def _parse_mask(ex):
        pm = ex["privacy_mask"]
        if isinstance(pm, str):
            ex["privacy_mask"] = json.loads(pm)
        return ex
    
    raw_ds = raw_ds.map(_parse_mask)
    
    # 3. Now filter to get a Dataset you can iterate
    subset = raw_ds.filter(uid_filter)

    # 2. Build label maps
    labels = sorted({sp["label"] for ex in subset for sp in ex["privacy_mask"]})
    if "NONPII" not in labels:
        labels.insert(0, "NONPII")
    label2id = {l:i for i,l in enumerate(labels)}
    id2label = {i:l for l,i in label2id.items()}

    # 3. Tokenizer & model
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    tokenizer.model_max_length = 512
    model = AutoModelForTokenClassification.from_pretrained(
        checkpoint,
        num_labels=len(labels),
        ignore_mismatched_sizes=True,
        label2id=label2id,
        id2label=id2label
    )

    # 4. Tokenize + align
    tokenized = subset.map(
        lambda ex: tokenize_and_align(ex, tokenizer, label2id),
        batched=True,
        remove_columns=subset.column_names
    )

    # 5. 60/20/20 split
    split1 = tokenized.train_test_split(test_size=0.4, seed=42)
    split2 = split1["test"].train_test_split(test_size=0.5, seed=42)
    ds = DatasetDict({
        "train":      split1["train"],
        "validation": split2["train"],
        "test":       split2["test"]
    })

    print(f"Train size: {len(ds['train'])}")
    print(f"Validation size: {len(ds['validation'])}")
    print(f"Test size: {len(ds['test'])}")

    # 6. Data collator & metrics
    data_collator  = DataCollatorForTokenClassification(tokenizer)
    compute_metrics = compute_metrics_factory(id2label)

    # 7. Training args
    args = TrainingArguments(
        output_dir="none",
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=60,
        fp16=torch.cuda.is_available(),
        eval_strategy="steps",
        eval_steps=50,
        save_steps=50,
        logging_steps=10,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        report_to="none"
    )

    # 8. Assemble Trainer
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=ds["train"],
        eval_dataset=ds["validation"],
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )
    return trainer


## 3. Run & Compare All Three Scenarios


In [8]:
# Cell 6: Scenario Loop (60/20/20 data already baked in)
scenarios = [
    ("Complete Dataset", lambda ex: True),
    # you can add other filters here if needed
]

results = []
for title, filt in scenarios:
    print(f"\n▶ Scenario: {title}")
    trainer = prepare_trainer_for_uids("labeled_output.json", filt)
    out_dir = f"piiranha-finetuned/{title.replace(' ', '_')}"
    trainer.args.output_dir = out_dir

    trainer.train()
    trainer.save_model(out_dir)

    m = trainer.evaluate()  # on validation
    results.append({
        "Scenario":  title,
        "Accuracy":  round(m["eval_accuracy"], 4),
        "Eval Loss": round(m["eval_loss"],    4),
        "Precision": round(m["eval_precision"], 4),
        "Recall":    round(m["eval_recall"],    4),
        "F1 Score":  round(m["eval_f1"],        4),
    })

df = pd.DataFrame(results)
print("\n### Comparison Table\n")
print(df.to_markdown(index=False))



▶ Scenario: Complete Dataset


Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at iiiorg/piiranha-v1-detect-personal-information and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([18]) in the checkpoint and torch.Size([9]) in the model instantiated
- classifier.weight: found shape torch.Size([18, 768]) in the checkpoint and torch.Size([9, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Train size: 191
Validation size: 64
Test size: 64


  trainer = Trainer(


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
50,0.115,0.158409,0.955673,0.717195,0.7254,0.721274
100,0.0296,0.184309,0.964157,0.797872,0.858124,0.826902
150,0.0103,0.21815,0.963945,0.794926,0.860412,0.826374
200,0.004,0.21009,0.969247,0.836601,0.878719,0.857143
250,0.0023,0.252421,0.964369,0.819355,0.871854,0.844789
300,0.0015,0.252911,0.969459,0.864865,0.878719,0.871737
350,0.001,0.24932,0.968187,0.836245,0.87643,0.855866
400,0.0015,0.250999,0.97052,0.852097,0.883295,0.867416
450,0.0009,0.257572,0.969035,0.839479,0.885584,0.861915


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



### Comparison Table

| Scenario         |   Accuracy |   Eval Loss |   Precision |   Recall |   F1 Score |
|:-----------------|-----------:|------------:|------------:|---------:|-----------:|
| Complete Dataset |     0.9695 |      0.2529 |      0.8649 |   0.8787 |     0.8717 |


  _warn_prf(average, modifier, msg_start, len(result))


## 4. Inference Example


In [12]:
import json, random, time
import pandas as pd
import torch
from datasets import Dataset
from collections import defaultdict
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    Trainer,
    pipeline,
)
# assume tokenize_and_align & compute_metrics_factory are already defined

# 1. Load wrapped_by_class.json
with open("wrapped_by_class.json", "r", encoding="utf-8") as f:
    wrapped = json.load(f)  # dict: subclass → list of records

# 2. Load your fine-tuned model & tokenizer
model_dir = "piiranha-finetuned/Complete_Dataset/checkpoint-400"   
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model     = AutoModelForTokenClassification.from_pretrained(model_dir).to(
    torch.device("cuda" if torch.cuda.is_available() else "cpu")
)
data_collator  = DataCollatorForTokenClassification(tokenizer)
compute_metrics = compute_metrics_factory(model.config.id2label)

# 3. Build a pipeline for timing
device = 0 if torch.cuda.is_available() else -1
ner_pipe = pipeline(
    "token-classification",
    model=model,
    tokenizer=tokenizer,
    device=device,
    aggregation_strategy="simple"  # optional, but faster
)

results = []

for subclass, examples in wrapped.items():
    if not examples:
        continue

    # 4a. Shuffle & split 60/20/20
    random.seed(42)
    random.shuffle(examples)
    n = len(examples)
    i1 = int(0.6 * n)
    i2 = i1 + int(0.2 * n)
    test_slice = examples[i2:]  # last 20%

    # 4b. Measure per-example latency
    latencies = []
    for rec in test_slice:
        start = time.time()
        _ = ner_pipe(rec["source_text"])
        latencies.append(time.time() - start)
    avg_latency = sum(latencies) / len(latencies)

    # 4c. Prepare a HuggingFace Dataset for evaluation
    ds_test = Dataset.from_list([
        {"source_text": rec["source_text"], "privacy_mask": rec["privacy_mask"]}
        for rec in test_slice
    ])
    tokenized = ds_test.map(
        lambda ex: tokenize_and_align(ex, tokenizer, model.config.label2id),
        batched=True,
        remove_columns=["source_text", "privacy_mask"]
    )

    # 4d. Evaluate accuracy & F1
    trainer = Trainer(
        model=model,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    metrics = trainer.evaluate(eval_dataset=tokenized)

    # 5. Record all three
    results.append({
        "class_name": subclass,
        "accuracy":   round(metrics["eval_accuracy"], 4),
        "f1":         round(metrics["eval_f1"],       4),
        "latency":    round(avg_latency,      4),  # seconds per example
    })

# 6. Save to CSV
df = pd.DataFrame(results)
df.to_csv("class_metrics_with_latency.csv", index=False)
print(df)


Device set to use cuda:0
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Map:   0%|          | 0/11 [00:00<?, ? examples/s]

  trainer = Trainer(


  _warn_prf(average, modifier, msg_start, len(result))


Map:   0%|          | 0/9 [00:00<?, ? examples/s]

  trainer = Trainer(


  _warn_prf(average, modifier, msg_start, len(result))


Map:   0%|          | 0/8 [00:00<?, ? examples/s]

  trainer = Trainer(




Map:   0%|          | 0/8 [00:00<?, ? examples/s]

  trainer = Trainer(




Map:   0%|          | 0/6 [00:00<?, ? examples/s]

  trainer = Trainer(




Map:   0%|          | 0/5 [00:00<?, ? examples/s]

  trainer = Trainer(




Map:   0%|          | 0/4 [00:00<?, ? examples/s]

  trainer = Trainer(




Map:   0%|          | 0/4 [00:00<?, ? examples/s]

  trainer = Trainer(




Map:   0%|          | 0/3 [00:00<?, ? examples/s]

  trainer = Trainer(




Map:   0%|          | 0/4 [00:00<?, ? examples/s]

  trainer = Trainer(




Map:   0%|          | 0/4 [00:00<?, ? examples/s]

  trainer = Trainer(




Map:   0%|          | 0/4 [00:00<?, ? examples/s]

  trainer = Trainer(




Map:   0%|          | 0/4 [00:00<?, ? examples/s]

  trainer = Trainer(


                                class_name  accuracy      f1  latency
0       DRIVERS_LICENCE.AUS.NSW.LEARNER.V1    0.9740  0.9118   0.0554
1          DRIVERS_LICENCE.AUS.TAS.FULL.V1    0.9900  0.8793   0.0269
2         DRIVERS_LICENCE.AUS.NSW.HEAVY.V1    0.9704  0.9184   0.0270
3   DRIVERS_LICENCE.AUS.NSW.PROVISIONAL.V1    0.9938  0.9688   0.0269
4         DRIVERS_LICENCE.AUS.TAS.HEAVY.V1    0.9977  0.9565   0.0267
5          DRIVERS_LICENCE.AUS.NSW.FULL.V1    0.9835  0.8684   0.0267
6           DRIVERS_LICENCE.AUS.SA.FULL.V1    1.0000  1.0000   0.0267
7         DRIVERS_LICENCE.AUS.VIC.HEAVY.V1    1.0000  1.0000   0.0277
8   DRIVERS_LICENCE.AUS.VIC.PROVISIONAL.V1    1.0000  1.0000   0.0269
9       DRIVERS_LICENCE.AUS.TAS.LEARNER.V1    1.0000  1.0000   0.0271
10         DRIVERS_LICENCE.AUS.VIC.FULL.V1    1.0000  1.0000   0.0269
11   DRIVERS_LICENCE.AUS.SA.PROVISIONAL.V1    0.9878  0.9250   0.0277
12         DRIVERS_LICENCE.AUS.QLD.FULL.V1    1.0000  1.0000   0.0267


