In [1]:
import re
import json
import argparse
from itertools import chain
from functools import partial

import torch
from transformers import AutoTokenizer, Trainer, TrainingArguments
from transformers import AutoModelForTokenClassification, DataCollatorForTokenClassification
from scipy.special import softmax
import evaluate
from datasets import Dataset, features
import numpy as np
import pandas as pd
from pathlib import Path

In [2]:
! pip3 list

Package            Version
------------------ -----------
accelerate         0.29.3
aiohttp            3.9.5
aiosignal          1.3.1
appnope            0.1.4
asttokens          2.4.1
attrs              23.2.0
certifi            2024.2.2
charset-normalizer 3.3.2
comm               0.2.2
datasets           2.18.0
debugpy            1.8.1
decorator          5.1.1
dill               0.3.8
evaluate           0.4.1
executing          2.0.1
filelock           3.13.4
frozenlist         1.4.1
fsspec             2024.2.0
huggingface-hub    0.22.2
idna               3.7
ipykernel          6.29.4
ipython            8.23.0
ipywidgets         8.1.2
jedi               0.19.1
Jinja2             3.1.3
joblib             1.4.0
jupyter_client     8.6.1
jupyter_core       5.7.2
jupyterlab_widgets 3.0.10
kagglehub          0.2.3
MarkupSafe         2.1.5
matplotlib-inline  0.1.7
mpmath             1.3.0
multidict          6.0.5
multiprocess       0.70.16
nest-asyncio       1.6.0
networkx           3.3
nump

# Loading the kaggle provided data
* Loading jsons
* Downsampling negative samples of the data
* Defining and applying tokenizer

In [3]:
data = json.load(open("kaggle/input/pii-detection-removal-from-educational-data/train.json"))

# convert csv to json and append to data
# mistral_df = pd.read_csv("kaggle/input/pii-detection-removal-from-educational-data/100_gen_data.csv")
# mistral_df.head()
# for i in range(len(mistral_df)):
#     data.append({"tokens": mistral_df['tokenized response'][i], "labels": mistral_df['labels'][i]})

mistral_data = json.load(open("kaggle/input/pii-detection-removal-from-educational-data/mistral_data.json"))
for d in mistral_data:
    data.append(d)

# # downsampling of negative examples
# # p=[] # positive samples (contain relevant labels)
# # n=[] # negative samples (presumably contain entities that are possibly wrongly classified as entity)
# # for d in data:
# #     if any(np.array(d["labels"]) != "O"): p.append(d)
# #     else: n.append(d)
print("original datapoints: ", len(data))



original datapoints:  9162


In [4]:
all_labels = sorted(list(set(chain(*[x["labels"] for x in data]))))
label2id = {l: i for i,l in enumerate(all_labels)}
id2label = {v:k for k,v in label2id.items()}

target = [
    'B-EMAIL', 'B-ID_NUM', 'B-NAME_STUDENT', 'B-PHONE_NUM', 
    'B-STREET_ADDRESS', 'B-URL_PERSONAL', 'B-USERNAME', 'I-ID_NUM', 
    'I-NAME_STUDENT', 'I-PHONE_NUM', 'I-STREET_ADDRESS', 'I-URL_PERSONAL'
]

print(id2label)

{0: 'B-EMAIL', 1: 'B-ID_NUM', 2: 'B-NAME_STUDENT', 3: 'B-PHONE_NUM', 4: 'B-STREET_ADDRESS', 5: 'B-URL_PERSONAL', 6: 'B-USERNAME', 7: 'I-ID_NUM', 8: 'I-NAME_STUDENT', 9: 'I-PHONE_NUM', 10: 'I-STREET_ADDRESS', 11: 'I-URL_PERSONAL', 12: 'O'}


In [18]:
def tokenize(example, tokenizer, label2id, max_length):

    # rebuild text from tokens
    text = []
    labels = []

    for t, l, ws in zip(
        example["tokens"], example["provided_labels"], example["trailing_whitespace"]
    ):
        text.append(t)
        labels.extend([l] * len(t))

        if ws:
            text.append(" ")
            labels.append("O")

    # actual tokenization
    tokenized = tokenizer("".join(text), return_offsets_mapping=True, max_length=max_length)

    labels = np.array(labels)

    text = "".join(text)
    token_labels = []

    for start_idx, end_idx in tokenized.offset_mapping:
        # CLS token
        if start_idx == 0 and end_idx == 0:
            token_labels.append(label2id["O"])
            continue

        # case when token starts with whitespace
        if text[start_idx].isspace():
            start_idx += 1

        token_labels.append(label2id[labels[start_idx]])

    length = len(tokenized.input_ids)

    return {**tokenized, "labels": token_labels, "length": length}

In [19]:
# Tokenizing the data utilizing deberta tokenizer
TRAIN_MODEL_PATH = "microsoft/deberta-base"
TRAIN_MAX_LENGTH = 1024

tokenizer = AutoTokenizer.from_pretrained(TRAIN_MODEL_PATH)

ds = Dataset.from_dict({
    "full_text": [x["full_text"] for x in data],
    "document": [str(x["document"]) for x in data],
    "tokens": [x["tokens"] for x in data],
    "trailing_whitespace": [x["trailing_whitespace"] for x in data],
    "provided_labels": [x["labels"] for x in data],
})

ds = ds.map(tokenize, fn_kwargs={"tokenizer": tokenizer, "label2id": label2id, "max_length": TRAIN_MAX_LENGTH}, num_proc=3)

Map (num_proc=3):   0%|          | 0/9162 [00:00<?, ? examples/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-s

In [8]:
# def downsample(df, percent):
#     df = df.copy()

#     df['is_labels'] = df['provided_labels'].apply(lambda labels: any(label != "O" for label in labels))
#     true_samples = df[df['is_labels'] == True]
#     false_samples = df[df['is_labels'] == False]

#     downsampled_false_samples = false_samples.sample(frac=percent, random_state=42)


#     return pd.concat([true_samples, downsampled_false_samples])

# # Downsample the negative samples of the dataset
# df_train = pd.DataFrame(ds)
# df_train = downsample(df_train, 0.2)
# df_train = df_train.drop(columns=['is_labels'])

# ds = Dataset.from_pandas(df_train)
# # Splitting the dataset into training and validation sets for performance evaluation
# ds = ds.train_test_split(test_size=0.1, seed=42)

# Training and evaluation of model
* Defining metrics (precision, recall, and f5-score)
* Training model
* Evaluating on validation data

In [13]:
from seqeval.metrics import precision_score, recall_score

def metrics(p, all_labels):
    preds, labels = p
    preds = np.argmax(preds, axis=2)
    # Remove ignored index (special tokens)
    true_predictions = [
            [all_labels[p] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(preds, labels)
        ]
    true_labels = [
            [all_labels[l] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(preds, labels)
        ]

    precision = precision_score(true_labels, true_predictions, average='micro')
    recall = recall_score(true_labels, true_predictions, average='micro')

    f5_score = (1 + 5 ** 2) * (precision * recall) / (5 ** 2 * precision + recall)

    results = {
        "precision": precision,
        "recall": recall,
        "f5": f5_score
    }

    return results

In [13]:
model = AutoModelForTokenClassification.from_pretrained(
    TRAIN_MODEL_PATH,
    num_labels=len(all_labels),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True
)
# mps GPU acceleration for training 
if torch.backends.mps.is_available():
    mps_device = torch.device("mps")
    model.to(mps_device)
    print("Model moved to MPS device.")
elif torch.cuda.is_available():
    model.cuda()
    print("Model moved to CUDA device.")
else:
    print("No GPU available, using CPU.")

collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16)


Some weights of DebertaForTokenClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model moved to MPS device.


In [20]:
dataset = ds.shuffle(seed=42).select(range(100))

num_folds = 5
fold_size = len(dataset) // num_folds

folds = []
for i in range(num_folds):
    # Calculate start and end indices
    start = i * fold_size
    end = start + fold_size if i < num_folds - 1 else len(dataset)

    # Select data for the fold
    fold = dataset.select(range(start, end))
    folds.append(fold)

# Training

In [21]:
args = TrainingArguments(
    output_dir='kaggle/output', 
    learning_rate=1e-5,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    report_to="none",
    evaluation_strategy="steps",
    eval_steps=100,
    do_eval=False,
    save_total_limit=1,
    logging_steps=20,
    lr_scheduler_type='cosine',
    metric_for_best_model="f1",
    greater_is_better=True,
    warmup_ratio=0.1,
    weight_decay=0.01
)

dataset = ds.shuffle(seed=42)

    # Define the number of folds
num_folds = 5
fold_size = len(dataset) // num_folds

folds = []
for i in range(num_folds):
    # Calculate start and end indices
    start = i * fold_size
    end = start + fold_size if i < num_folds - 1 else len(dataset)

    # Select data for the fold
    fold = dataset.select(range(start, end))
    folds.append(fold)
# train a different model for 5 different folds of the dataset
for i in range(num_folds):
    print(f"Training fold {i}")
    model = AutoModelForTokenClassification.from_pretrained(
        TRAIN_MODEL_PATH,
        num_labels=len(all_labels),
        id2label=id2label,
        label2id=label2id,
        ignore_mismatched_sizes=True
    )

    if torch.backends.mps.is_available():
        mps_device = torch.device("mps")
        model.to(mps_device)
        print("Model moved to MPS device.")
    elif torch.cuda.is_available():
        model.cuda()
        print("Model moved to CUDA device.")
    else:
        print("No GPU available, using CPU.")

    collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16)


    trainer = Trainer(
        model,
        args,
        train_dataset=folds[i],
        data_collator=collator,
        tokenizer=tokenizer,
        compute_metrics=partial(metrics, all_labels=all_labels)
    )

    trainer.train()
    trainer.save_model(f"deberta3base_1024_fold_{i}")
    tokenizer.save_pretrained(f"deberta3base_1024_fold_{i}")


Training fold 0


Some weights of DebertaForTokenClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Model moved to MPS device.


  0%|          | 0/6 [00:00<?, ?it/s]

{'train_runtime': 30.1074, 'train_samples_per_second': 1.993, 'train_steps_per_second': 0.199, 'train_loss': 1.9978763262430828, 'epoch': 2.4}
Training fold 1


Some weights of DebertaForTokenClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model moved to MPS device.


  0%|          | 0/6 [00:00<?, ?it/s]

{'train_runtime': 153.503, 'train_samples_per_second': 0.391, 'train_steps_per_second': 0.039, 'train_loss': 2.0649898846944175, 'epoch': 2.4}
Training fold 2


Some weights of DebertaForTokenClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model moved to MPS device.


  0%|          | 0/6 [00:00<?, ?it/s]

{'train_runtime': 27.7956, 'train_samples_per_second': 2.159, 'train_steps_per_second': 0.216, 'train_loss': 2.0458219846089682, 'epoch': 2.4}
Training fold 3


Some weights of DebertaForTokenClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model moved to MPS device.


  0%|          | 0/6 [00:00<?, ?it/s]

{'train_runtime': 26.3024, 'train_samples_per_second': 2.281, 'train_steps_per_second': 0.228, 'train_loss': 2.051828225453695, 'epoch': 2.4}
Training fold 4


Some weights of DebertaForTokenClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model moved to MPS device.


  0%|          | 0/6 [00:00<?, ?it/s]

{'train_runtime': 28.9089, 'train_samples_per_second': 2.075, 'train_steps_per_second': 0.208, 'train_loss': 2.0237730344136557, 'epoch': 2.4}


# Inference
This is a slightly different procedure. We are implementing a weighted voting approach

In [5]:
def inference_tokenize(example, tokenizer, max_length):

    # rebuild text from tokens
    text = []
    token_map = []
    
    idx = 0
    
    for t, ws in zip(example["tokens"], example["trailing_whitespace"]):
        text.append(t)
        token_map.extend([idx]*len(t))
        if ws:
            text.append(" ")
            token_map.append(-1)
        idx += 1

    # actual tokenization
    tokenized = tokenizer("".join(text), return_offsets_mapping=True, truncation=True, max_length=max_length)

    return {**tokenized, "token_map": token_map}

In [6]:
model_paths = {"deberta3base_1024_fold_0": 2/10, 
               "deberta3base_1024_fold_1": 2/10, 
               "deberta3base_1024_fold_2": 2/10, 
               "deberta3base_1024_fold_3": 2/10, 
               "deberta3base_1024_fold_4": 2/10}

In [10]:
# Ensemble method prediction
def ensemble_predict(model_paths, dataset):
    all_preds = []

    for path, weight in model_paths.items():
        model = AutoModelForTokenClassification.from_pretrained(path)

        # if torch.backends.mps.is_available():
        #     mps_device = torch.device("mps")
        #     model.to(mps_device)
        #     print("Model moved to MPS device.")
        # elif torch.cuda.is_available():
        #     model.cuda()
        #     print("Model moved to CUDA device.")
        # else:
        #     print("No GPU available, using CPU.")

        tokenizer = AutoTokenizer.from_pretrained(path)
        collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16)

        args = TrainingArguments(
            ".",
            per_device_eval_batch_size=1,
            report_to="none",
        )

        trainer = Trainer(
            model,
            args,
            data_collator=collator,
            tokenizer=tokenizer,
        )

        predictions = trainer.predict(dataset).predictions
        weighted_predictions = softmax(predictions, axis=-1)
        print(type(weighted_predictions))
        print(weight)
        weighted_predictions = weighted_predictions * weight
        all_preds.append(weighted_predictions)

    weighted_average_preds = np.sum(all_preds, axis=0) / sum(model_paths.values())

    return weighted_average_preds


def predict(model, dataset):
    model = AutoModelForTokenClassification.from_pretrained(model)
    tokenizer = AutoTokenizer.from_pretrained(model)
    collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16)

    args = TrainingArguments(
        ".",
        per_device_eval_batch_size=1,
        report_to="none",
    )

    trainer = Trainer(
        model,
        args,
        data_collator=collator,
        tokenizer=tokenizer,
    )

    predictions = trainer.predict(dataset).predictions
    weighted_predictions = softmax(predictions, axis=-1)
    

In [11]:
# TODO: insert test data
test_data = json.load(open("kaggle/input/pii-detection-removal-from-educational-data/test.json"))
test_frame = pd.DataFrame(test_data)
ds_test = Dataset.from_pandas(test_frame)
tokenizer = AutoTokenizer.from_pretrained("deberta3base_1024_fold_0")
ds_test = ds_test.map(inference_tokenize, fn_kwargs={"tokenizer": tokenizer, "max_length": 2048}, num_proc=3)

model_path = "deberta3base_1024_fold_0"
threshold = 0.99
weighted_average_predictions = ensemble_predict(model_paths, ds_test)

config = json.load(open(Path(model_path) / "config.json"))
id2label = config["id2label"]
preds = weighted_average_predictions.argmax(-1)
preds_without_O = weighted_average_predictions[:,:,:12].argmax(-1)
O_preds = weighted_average_predictions[:,:,12]
preds_final = np.where(O_preds < threshold, preds_without_O , preds)

Map (num_proc=3):   0%|          | 0/10 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/10 [00:00<?, ?it/s]

<class 'numpy.ndarray'>
0.2


  0%|          | 0/10 [00:00<?, ?it/s]

<class 'numpy.ndarray'>
0.2


  0%|          | 0/10 [00:00<?, ?it/s]

<class 'numpy.ndarray'>
0.2


  0%|          | 0/10 [00:00<?, ?it/s]

<class 'numpy.ndarray'>
0.2


  0%|          | 0/10 [00:00<?, ?it/s]

<class 'numpy.ndarray'>
0.2


In [12]:
triplets = []
document, token, label, token_str = [], [], [], []
for p, token_map, offsets, tokens, doc in zip(preds_final, ds_test["token_map"], ds_test["offset_mapping"], ds_test["tokens"], ds_test["document"]):

    for token_pred, (start_idx, end_idx) in zip(p, offsets):
        label_pred = id2label[str(token_pred)]

        if start_idx + end_idx == 0: continue

        if token_map[start_idx] == -1:
            start_idx += 1

        # ignore "\n\n"
        while start_idx < len(token_map) and tokens[token_map[start_idx]].isspace():
            start_idx += 1

        if start_idx >= len(token_map): break

        token_id = token_map[start_idx]

        # ignore "O" predictions and whitespace preds
        if label_pred != "O" and token_id != -1:
            triplet = (label_pred, token_id, tokens[token_id])

            if triplet not in triplets:
                document.append(doc)
                token.append(token_id)
                label.append(label_pred)
                token_str.append(tokens[token_id])
                triplets.append(triplet)

df = pd.DataFrame({
    "document": document,
    "token": token,
    "label": label,
    "token_str": token_str
})

df["row_id"] = list(range(len(df)))
df.head()

Unnamed: 0,document,token,label,token_str,row_id
0,7,4,B-NAME_STUDENT,reflexion,0
1,7,5,B-NAME_STUDENT,-,1
2,7,6,I-NAME_STUDENT,Avril,2
3,7,7,I-NAME_STUDENT,2021,3
4,7,8,I-NAME_STUDENT,-,4


# Metric Testing

In [33]:
import optuna

def objective(trial, model_paths, labels, all_labels, test_data):
    # Define the search space for the weights
    weights = {}
    for model_name in model_paths:
        weights[model_name] = trial.suggest_float(model_name, 0.0, 1.0)

    # Create the ensemble model using the weights
    weighted_models = {model_name: weights[model_name] for model_name in model_paths}

    # Generate predictions using the ensemble model
    predictions = ensemble_predict(weighted_models, test_data)

    # Evaluate the predictions using the evaluation metric
    metric_value = metrics((predictions, labels), all_labels)['f5']

    return metric_value

# Create a study object and optimize the weights
data_splits = ds.shuffle(seed=42).train_test_split(test_size=0.1, seed=42)
test_data = data_splits["test"]
model_names = ["deberta3base_1024_fold_0", "deberta3base_1024_fold_1", "deberta3base_1024_fold_2", "deberta3base_1024_fold_3", "deberta3base_1024_fold_4"]

study = optuna.create_study(direction='maximize')
study.optimize(lambda trial: objective(trial, model_names, test_data['labels'], all_labels, test_data), n_trials=50)

# Get the best weights and their corresponding metric value
best_weights = study.best_params.items()
best_metric = study.best_value


[I 2024-05-05 01:28:19,921] A new study created in memory with name: no-name-b694bd2c-2ae2-418d-b912-3aaa8c34634d


  0%|          | 0/917 [00:00<?, ?it/s]

In [None]:
import pickle 

# save best_weights and best_metric
with open('best_weights.pkl', 'wb') as f:
    pickle.dump(best_weights, f)

(dict_items([('deberta3base_1024_fold_0', 0.39622189060869173), ('deberta3base_1024_fold_1', 0.9769709248621039), ('deberta3base_1024_fold_2', 0.592992672873292), ('deberta3base_1024_fold_3', 0.027945723847054293), ('deberta3base_1024_fold_4', 0.23466717979570073)]),
 0.9070422535211269)

In [23]:
data_splits = ds.shuffle(seed=42).train_test_split(test_size=0.1, seed=42)
test_data = data_splits["test"]
preds = ensemble_predict(model_paths, test_data)
metrics((preds, test_data["labels"]), all_labels)

  0%|          | 0/917 [00:00<?, ?it/s]

<class 'numpy.ndarray'>
0.2


  0%|          | 0/917 [00:00<?, ?it/s]

<class 'numpy.ndarray'>
0.2


  0%|          | 0/917 [00:00<?, ?it/s]

<class 'numpy.ndarray'>
0.2


  0%|          | 0/917 [00:00<?, ?it/s]

<class 'numpy.ndarray'>
0.2


  0%|          | 0/917 [00:00<?, ?it/s]

<class 'numpy.ndarray'>
0.2


{'precision': 0.8777042599063267,
 'recall': 0.8632641123135419,
 'f5': 0.8638107110997296}

In [16]:
# take a random sample of 10% of the data in ds

test_df = data_df.sample(frac=0.1)
test_ds = Dataset.from_pandas(test_df)
preds = ensemble_predict(model_paths, test_ds)


ArrowInvalid: ("Could not convert 'irusmgwkqg' with type str: tried to convert to int64", 'Conversion failed for column document with type object')