In [1]:
import re
import json
import argparse
from itertools import chain
from functools import partial

import torch
from transformers import AutoTokenizer, Trainer, TrainingArguments
from transformers import AutoModelForTokenClassification, DataCollatorForTokenClassification
from scipy.special import softmax
import evaluate
from datasets import Dataset, features
import numpy as np
import pandas as pd
from pathlib import Path

In [4]:
all_labels = sorted(list(set(chain(*[x["labels"] for x in data]))))
label2id = {l: i for i,l in enumerate(all_labels)}
id2label = {v:k for k,v in label2id.items()}

target = [
    'B-EMAIL', 'B-ID_NUM', 'B-NAME_STUDENT', 'B-PHONE_NUM', 
    'B-STREET_ADDRESS', 'B-URL_PERSONAL', 'B-USERNAME', 'I-ID_NUM', 
    'I-NAME_STUDENT', 'I-PHONE_NUM', 'I-STREET_ADDRESS', 'I-URL_PERSONAL'
]

print(id2label)

{0: 'B-EMAIL', 1: 'B-ID_NUM', 2: 'B-NAME_STUDENT', 3: 'B-PHONE_NUM', 4: 'B-STREET_ADDRESS', 5: 'B-URL_PERSONAL', 6: 'B-USERNAME', 7: 'I-ID_NUM', 8: 'I-NAME_STUDENT', 9: 'I-PHONE_NUM', 10: 'I-STREET_ADDRESS', 11: 'I-URL_PERSONAL', 12: 'O'}


In [18]:
def tokenize(example, tokenizer, label2id, max_length):

    # rebuild text from tokens
    text = []
    labels = []

    for t, l, ws in zip(
        example["tokens"], example["provided_labels"], example["trailing_whitespace"]
    ):
        text.append(t)
        labels.extend([l] * len(t))

        if ws:
            text.append(" ")
            labels.append("O")

    # actual tokenization
    tokenized = tokenizer("".join(text), return_offsets_mapping=True, max_length=max_length)

    labels = np.array(labels)

    text = "".join(text)
    token_labels = []

    for start_idx, end_idx in tokenized.offset_mapping:
        # CLS token
        if start_idx == 0 and end_idx == 0:
            token_labels.append(label2id["O"])
            continue

        # case when token starts with whitespace
        if text[start_idx].isspace():
            start_idx += 1

        token_labels.append(label2id[labels[start_idx]])

    length = len(tokenized.input_ids)

    return {**tokenized, "labels": token_labels, "length": length}

In [19]:
# Tokenizing the data utilizing deberta tokenizer
TRAIN_MODEL_PATH = "microsoft/deberta-base"
TRAIN_MAX_LENGTH = 1024

tokenizer = AutoTokenizer.from_pretrained(TRAIN_MODEL_PATH)

ds = Dataset.from_dict({
    "full_text": [x["full_text"] for x in data],
    "document": [str(x["document"]) for x in data],
    "tokens": [x["tokens"] for x in data],
    "trailing_whitespace": [x["trailing_whitespace"] for x in data],
    "provided_labels": [x["labels"] for x in data],
})

ds = ds.map(tokenize, fn_kwargs={"tokenizer": tokenizer, "label2id": label2id, "max_length": TRAIN_MAX_LENGTH}, num_proc=3)

Map (num_proc=3):   0%|          | 0/9162 [00:00<?, ? examples/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-s

In [8]:
# def downsample(df, percent):
#     df = df.copy()

#     df['is_labels'] = df['provided_labels'].apply(lambda labels: any(label != "O" for label in labels))
#     true_samples = df[df['is_labels'] == True]
#     false_samples = df[df['is_labels'] == False]

#     downsampled_false_samples = false_samples.sample(frac=percent, random_state=42)


#     return pd.concat([true_samples, downsampled_false_samples])

# # Downsample the negative samples of the dataset
# df_train = pd.DataFrame(ds)
# df_train = downsample(df_train, 0.2)
# df_train = df_train.drop(columns=['is_labels'])

# ds = Dataset.from_pandas(df_train)
# # Splitting the dataset into training and validation sets for performance evaluation
# ds = ds.train_test_split(test_size=0.1, seed=42)

# Inference
This is a slightly different procedure. We are implementing a weighted voting approach

In [5]:
def inference_tokenize(example, tokenizer, max_length):

    # rebuild text from tokens
    text = []
    token_map = []
    
    idx = 0
    
    for t, ws in zip(example["tokens"], example["trailing_whitespace"]):
        text.append(t)
        token_map.extend([idx]*len(t))
        if ws:
            text.append(" ")
            token_map.append(-1)
        idx += 1

    # actual tokenization
    tokenized = tokenizer("".join(text), return_offsets_mapping=True, truncation=True, max_length=max_length)

    return {**tokenized, "token_map": token_map}

In [6]:
model_paths = {"deberta3base_1024_fold_0": 0.9750033522765696, 
               "deberta3base_1024_fold_1": 0.8240018195303137, 
               "deberta3base_1024_fold_2": 0.7366411375074039, 
               "deberta3base_1024_fold_3": 0.01050751452921163, 
               "deberta3base_1024_fold_4": 0.01885993245619963}

In [10]:
# Ensemble method prediction
def ensemble_predict(model_paths, dataset):
    all_preds = []

    for path, weight in model_paths.items():
        model = AutoModelForTokenClassification.from_pretrained(path)

        # if torch.backends.mps.is_available():
        #     mps_device = torch.device("mps")
        #     model.to(mps_device)
        #     print("Model moved to MPS device.")
        # elif torch.cuda.is_available():
        #     model.cuda()
        #     print("Model moved to CUDA device.")
        # else:
        #     print("No GPU available, using CPU.")

        tokenizer = AutoTokenizer.from_pretrained(path)
        collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16)

        args = TrainingArguments(
            ".",
            per_device_eval_batch_size=1,
            report_to="none",
        )

        trainer = Trainer(
            model,
            args,
            data_collator=collator,
            tokenizer=tokenizer,
        )

        predictions = trainer.predict(dataset).predictions
        weighted_predictions = softmax(predictions, axis=-1)
        print(type(weighted_predictions))
        print(weight)
        weighted_predictions = weighted_predictions * weight
        all_preds.append(weighted_predictions)

    weighted_average_preds = np.sum(all_preds, axis=0) / sum(model_paths.values())

    return weighted_average_preds


def predict(model, dataset):
    model = AutoModelForTokenClassification.from_pretrained(model)
    tokenizer = AutoTokenizer.from_pretrained(model)
    collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16)

    args = TrainingArguments(
        ".",
        per_device_eval_batch_size=1,
        report_to="none",
    )

    trainer = Trainer(
        model,
        args,
        data_collator=collator,
        tokenizer=tokenizer,
    )

    predictions = trainer.predict(dataset).predictions
    weighted_predictions = softmax(predictions, axis=-1)
    

In [11]:
# TODO: insert test data
test_data = json.load(open("kaggle/input/pii-detection-removal-from-educational-data/test.json"))
test_frame = pd.DataFrame(test_data)
ds_test = Dataset.from_pandas(test_frame)
tokenizer = AutoTokenizer.from_pretrained("deberta3base_1024_fold_0")
ds_test = ds_test.map(inference_tokenize, fn_kwargs={"tokenizer": tokenizer, "max_length": 2048}, num_proc=3)

model_path = "deberta3base_1024_fold_0"
threshold = 0.99
weighted_average_predictions = ensemble_predict(model_paths, ds_test)

config = json.load(open(Path(model_path) / "config.json"))
id2label = config["id2label"]
preds = weighted_average_predictions.argmax(-1)
preds_without_O = weighted_average_predictions[:,:,:12].argmax(-1)
O_preds = weighted_average_predictions[:,:,12]
preds_final = np.where(O_preds < threshold, preds_without_O , preds)

Map (num_proc=3):   0%|          | 0/10 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/10 [00:00<?, ?it/s]

<class 'numpy.ndarray'>
0.2


  0%|          | 0/10 [00:00<?, ?it/s]

<class 'numpy.ndarray'>
0.2


  0%|          | 0/10 [00:00<?, ?it/s]

<class 'numpy.ndarray'>
0.2


  0%|          | 0/10 [00:00<?, ?it/s]

<class 'numpy.ndarray'>
0.2


  0%|          | 0/10 [00:00<?, ?it/s]

<class 'numpy.ndarray'>
0.2


In [12]:
triplets = []
document, token, label, token_str = [], [], [], []
for p, token_map, offsets, tokens, doc in zip(preds_final, ds_test["token_map"], ds_test["offset_mapping"], ds_test["tokens"], ds_test["document"]):

    for token_pred, (start_idx, end_idx) in zip(p, offsets):
        label_pred = id2label[str(token_pred)]

        if start_idx + end_idx == 0: continue

        if token_map[start_idx] == -1:
            start_idx += 1

        # ignore "\n\n"
        while start_idx < len(token_map) and tokens[token_map[start_idx]].isspace():
            start_idx += 1

        if start_idx >= len(token_map): break

        token_id = token_map[start_idx]

        # ignore "O" predictions and whitespace preds
        if label_pred != "O" and token_id != -1:
            triplet = (label_pred, token_id, tokens[token_id])

            if triplet not in triplets:
                document.append(doc)
                token.append(token_id)
                label.append(label_pred)
                token_str.append(tokens[token_id])
                triplets.append(triplet)

df = pd.DataFrame({
    "document": document,
    "token": token,
    "label": label,
    "token_str": token_str
})

df["row_id"] = list(range(len(df)))
df.head()

Unnamed: 0,document,token,label,token_str,row_id
0,7,4,B-NAME_STUDENT,reflexion,0
1,7,5,B-NAME_STUDENT,-,1
2,7,6,I-NAME_STUDENT,Avril,2
3,7,7,I-NAME_STUDENT,2021,3
4,7,8,I-NAME_STUDENT,-,4


In [None]:
df[["row_id", "document", "token", "label"]].to_csv("submission.csv", index=False)