In [1]:
# Mount Google Drive at /content/drive (Colab only); the dataset and tokenizer
# paths configured in the next cell all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install datasets
!pip install transformers
!pip install gensim
!pip install spacy
!pip install torch
!pip install colorama



In [3]:
# Root folders on the mounted Google Drive
_DRIVE_ROOT = "/content/drive/MyDrive"
_DATASETS_ROOT = _DRIVE_ROOT + "/Datasets"

# Inputs: the unmasked dataset and the pre-trained tokenizer
UNMASKED_DATASET_LOCATION = _DATASETS_ROOT + "/Unmasked"
PRETRAINED_TOKENIZER_LOCATION = _DRIVE_ROOT + "/Tokenizer"

# Outputs: one folder per de-identification method
NER_DATASET_LOCATION = _DATASETS_ROOT + "/Ner"
LEXICAL_DATASET_LOCATION = _DATASETS_ROOT + "/Lexical"
IDF_DATASET_LOCATION = _DATASETS_ROOT + "/Idf"
IDF_TABLE_AWARE_DATASET_LOCATION = _DATASETS_ROOT + "/Idf_table_aware"

In [4]:
from datasets import load_from_disk, Dataset
from transformers import AutoTokenizer, AutoModel
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
from gensim.utils import simple_preprocess
import spacy
import string
import torch
from colorama import Fore, Style
import inspect
import os

# Parameters
# Masking thresholds: idf() masks every token whose TF-IDF weight is below
# IDF_THRESHOLD; idf_table_aware() applies the same comparison with
# IDF_TABLE_AWARE_THRESHOLD (in addition to masking tokens found in the profile).
IDF_THRESHOLD = 0.1
IDF_TABLE_AWARE_THRESHOLD = 0.05

# Loading SpaCy NER pre-trained ("en_core_web_sm")
# Shared pipeline used by ner() and lexical(). Assumes the en_core_web_sm package
# is already available in the environment (this notebook does not download it).
nlp = spacy.load("en_core_web_sm")

In [5]:
# Save dataset
def save_dataset(dataset: Dataset, path) -> None:
    """Persist `dataset` at `path` by delegating to its `save_to_disk` method.

    Args:
        dataset: Dataset to write.
        path: Destination, passed unchanged to `dataset.save_to_disk`.
    """
    dataset.save_to_disk(path)

# Load roberta model
def load_roberta_model(model_name="roberta-base"):
    """Load a pre-trained encoder through `AutoModel.from_pretrained`.

    Args:
        model_name: Checkpoint identifier or path handed to `from_pretrained`.
            Defaults to "roberta-base", so existing zero-argument calls are unchanged.

    Returns:
        The model object returned by `AutoModel.from_pretrained`.
    """
    return AutoModel.from_pretrained(model_name)

def print_colored(variable, color):
    """Print `<name> = <value>` in color, where <name> is the caller's variable name.

    The name is recovered by scanning the caller's local variables for one bound to
    the very same object as `variable` (identity, not equality). If several names
    are bound to that object, the first one found is used. If none is (for example
    a literal or an expression was passed), the placeholder "<unnamed>" is printed
    instead of raising.

    Args:
        variable: Object to display.
        color: One of "blue", "red" or "green". Any other value prints an
            "unsupported color" message and returns without printing the variable.
    """
    color_map = {
        "blue": Fore.BLUE,
        "red": Fore.RED,
        "green": Fore.GREEN
    }

    if color not in color_map:
        print("Couleur non supportée.")
        return

    # currentframe() may return None on interpreters without stack-frame support
    frame = inspect.currentframe()
    caller = frame.f_back if frame is not None else None
    try:
        caller_locals = caller.f_locals if caller is not None else {}
        variable_name = next(
            (name for name, value in caller_locals.items() if value is variable),
            "<unnamed>",
        )
    finally:
        # Release the frame references promptly so they cannot form reference cycles
        del frame, caller

    print(f"{color_map[color]}{variable_name} = {variable}{Style.RESET_ALL}")

In [6]:
# Corpus for IDF methods
def get_corpus(dataset: Dataset):
    """Build one flat document per sample: its target text followed by its input text.

    Newlines are turned into spaces and the two fields are joined with a space, so
    words on either side of a line break or of the field boundary remain separate
    tokens (simply deleting the newline would glue them into one bogus word).

    Args:
        dataset: Iterable of samples exposing "target_text" and "input_text" strings.

    Returns:
        list[str]: One document per sample, in dataset order.
    """
    return [
        sample["target_text"].replace("\n", " ") + " " + sample["input_text"].replace("\n", " ")
        for sample in dataset
    ]

# Embedding from a pre-trained model
def get_embeddings(model, tokens, attention_mask, name):
    """Mean-pool the model's last hidden state over the sequence axis.

    Args:
        model: Callable accepting `input_ids` and `attention_mask` keyword arguments
            and returning an object with a `last_hidden_state` tensor.
        tokens: Token ids. A Python list is treated as ONE sequence and gets a
            leading batch axis; anything else (tensor, array) is used as given.
        attention_mask: Attention mask matching `tokens`; wrapped the same way.
        name: Unused; kept so existing callers keep working.

    Returns:
        dict: {"embeddings": tensor} - the mean over dim 1 with all size-1 axes
        squeezed out, returned on the CPU.

    NOTE(review): the mean runs over every position, padding included; the
    attention mask is only forwarded to the model. Mask-weighted pooling may be
    what is wanted, but switching would change every stored embedding - confirm
    before changing.
    """
    if isinstance(tokens, list):
        tokens, attention_mask = [tokens], [attention_mask]

    # as_tensor accepts lists, arrays and tensors of any integer dtype
    input_ids = torch.as_tensor(tokens, dtype=torch.long)
    mask = torch.as_tensor(attention_mask, dtype=torch.long)

    # Feed the inputs from the device the model lives on; callables without
    # parameters (or with none) simply receive the CPU tensors.
    parameters = getattr(model, "parameters", None)
    first_param = next(iter(parameters()), None) if callable(parameters) else None
    if first_param is not None:
        input_ids = input_ids.to(first_param.device)
        mask = mask.to(first_param.device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=mask)
        # Last layer, averaged along the sequence dimension
        mean_embeddings = outputs.last_hidden_state.mean(dim=1).squeeze()

    return {
        "embeddings": mean_embeddings.cpu()
    }

# Preprocessing of a sample
def preprocessing(sample, model, tokenizer, name, index, max_length=512):
    """Tokenize one sample's masked text and embed it.

    Args:
        sample: Mapping that contains the masked text under "<name>_text".
        model: Encoder forwarded to `get_embeddings`.
        tokenizer: Callable tokenizer; called with padding to `max_length`,
            truncation, PyTorch tensors and an attention mask.
        name: De-identification method: "ner", "lexical", "idf" or
            "idf_table_aware". Converted with `str()` before use.
        index: Value stored under "label" in the result.
        max_length: Padding / truncation length, applied to every method.

    Returns:
        dict with "<name>_tokens", "<name>_attention_mask", "<name>_embeddings"
        (first row of the tokenizer output / the pooled embedding) and "label".

    Raises:
        ValueError: If `name` is not one of the supported methods.
    """
    name = str(name)
    supported = ("ner", "lexical", "idf", "idf_table_aware")
    if name not in supported:
        raise ValueError(f"Unknown de-identification method {name!r}; expected one of {supported}")

    # Tokenize: every method stores its masked text under "<name>_text" and is
    # tokenized with the same settings (including max_length).
    tokens = tokenizer(
        sample[name + "_text"],
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
        return_attention_mask=True,
    )

    # Embed tokens
    embeddings = get_embeddings(model, tokens["input_ids"], tokens["attention_mask"], name=name)
    return {
        name + "_tokens": tokens["input_ids"][0],
        name + "_attention_mask": tokens["attention_mask"][0],
        name + "_embeddings": embeddings['embeddings'],
        "label": index
    }

# Preprocessing of a dataset
def preprocessing_dataset(dataset: Dataset, model, tokenizer, name):
    """Run `preprocessing` on every sample of `dataset`, one sample at a time.

    Each sample's position in the dataset is forwarded as `index`; `model`,
    `tokenizer` and `name` are passed through unchanged.

    Returns:
        Whatever `dataset.map` returns for the per-sample function.
    """
    def _preprocess_one(sample, index):
        return preprocessing(sample, model=model, tokenizer=tokenizer, name=name, index=index)

    return dataset.map(_preprocess_one, with_indices=True, batched=False)

# NER
def ner(text):
    """Replace every token that belongs to a named entity with "<mask>".

    The text is run through the module-level spaCy pipeline `nlp`. Tokens with a
    non-empty `ent_type_` are masked (always, there is no sampling); all other
    tokens keep their text. The pieces are re-joined with single spaces.

    Returns:
        dict: {"ner_text": masked text}
    """
    pieces = ["<mask>" if token.ent_type_ else token.text for token in nlp(text)]
    return {"ner_text": " ".join(pieces)}

# LEXICAL
def lexical(text, table_text):
    """Mask the tokens of `text` that also occur as tokens in `table_text`.

    Matching is done word by word: the table is tokenized with the spaCy tokenizer
    of the module-level `nlp` and a text token is masked only when the identical
    token exists in the table. (A plain substring test against the raw table string
    would also mask short words such as "a" or "he" whenever they happen to appear
    inside a longer table word.) Matching is case-sensitive.

    Tokens made only of punctuation characters and whitespace-only tokens are
    never masked.

    Args:
        text: Text to de-identify.
        table_text: Table / profile text whose words must be hidden.

    Returns:
        dict: {"lexical_text": tokens re-joined with single spaces}
    """
    # Only tokenization is needed for the table, so skip the rest of the pipeline
    table_vocab = {token.text for token in nlp.make_doc(table_text)}

    deidentified_text = []
    for token in nlp(text):
        word = token.text
        is_whitespace = not word.strip()
        is_punctuation = all(char in string.punctuation for char in word)
        if word in table_vocab and not is_whitespace and not is_punctuation:
            deidentified_text.append("<mask>")
        else:
            deidentified_text.append(word)

    return {"lexical_text": " ".join(deidentified_text)}

# IDF
# The TF-IDF fit depends only on the corpus, so it is cached for the most recently
# used corpus instead of being rebuilt for every single sample.
_TFIDF_CACHE = {}


def _fit_tfidf(corpus):
    """Return (dictionary, tfidf_model) fitted on `corpus`, reusing the cached fit when the corpus is unchanged."""
    key = tuple(corpus)
    fitted = _TFIDF_CACHE.get(key)
    if fitted is None:
        tokenized_corpus = [simple_preprocess(doc) for doc in key]
        dct = Dictionary(tokenized_corpus)  # fit dictionary
        tfidf_model = TfidfModel([dct.doc2bow(doc) for doc in tokenized_corpus])
        _TFIDF_CACHE.clear()  # keep a single corpus in memory
        fitted = _TFIDF_CACHE[key] = (dct, tfidf_model)
    return fitted


def _tokenize_for_tfidf(text):
    """Tokenize `text` after blanking newlines and the literal "-lrb-" / "-rrb-" markers."""
    # Replace with a space (not ""), otherwise the neighbouring words get glued together
    for marker in ("\n", "-lrb-", "-rrb-"):
        text = text.replace(marker, " ")
    return simple_preprocess(text)


def _mask_low_tfidf(text, corpus, threshold, forced=frozenset()):
    """Mask tokens of `text` that are in `forced` or whose TF-IDF weight is below `threshold`."""
    dct, tfidf_model = _fit_tfidf(corpus)
    tokens = _tokenize_for_tfidf(text)
    weights = dict(tfidf_model[dct.doc2bow(tokens)])

    masked_text = []
    for token in tokens:
        # Tokens unknown to the dictionary get weight 0.0 and are therefore masked
        weight = weights.get(dct.token2id.get(token, -1), 0.0)
        masked_text.append("<mask>" if token in forced or weight < threshold else token)
    return " ".join(masked_text)


def idf(text, corpus):
    """Mask the tokens of `text` whose TF-IDF weight is below IDF_THRESHOLD.

    Args:
        text: Text to de-identify; it is tokenized with `simple_preprocess`, so the
            result contains the preprocessed tokens, not the original surface text.
        corpus: Sequence of document strings the TF-IDF model is fitted on.

    Returns:
        dict: {"idf_text": tokens re-joined with single spaces}

    NOTE(review): low-weight tokens are masked and high-weight ones are kept. If the
    goal is to hide rare, identifying terms the comparison may be inverted - confirm
    the intended direction before relying on these datasets.
    """
    return {"idf_text": _mask_low_tfidf(text, corpus, IDF_THRESHOLD)}


# IDF-Table aware
def idf_table_aware(text, profile, corpus):
    """Like `idf`, but additionally mask every token that also occurs in `profile`.

    Args:
        text: Text to de-identify.
        profile: Table / profile text; tokenized with `simple_preprocess`.
        corpus: Sequence of document strings the TF-IDF model is fitted on.

    Returns:
        dict: {"idf_table_aware_text": tokens re-joined with single spaces}. A token
        is masked when it is a profile token or its TF-IDF weight is below
        IDF_TABLE_AWARE_THRESHOLD.
    """
    profile_tokens = set(simple_preprocess(profile))  # set: O(1) membership tests
    return {
        "idf_table_aware_text": _mask_low_tfidf(
            text, corpus, IDF_TABLE_AWARE_THRESHOLD, profile_tokens
        )
    }

# Calling each method of deidentification
def deid_dataset(dataset, model, tokenizer, corpus):
    """Build, save and report one de-identified dataset per method.

    For NER, lexical, IDF and table-aware IDF in turn: add the masked text column
    with `dataset.map`, tokenize/embed it with `preprocessing_dataset`, write the
    result to the matching *_DATASET_LOCATION and print a colored summary.

    Args:
        dataset: Source dataset with "target_text" and "input_text" columns.
        model: Encoder forwarded to `preprocessing_dataset`.
        tokenizer: Tokenizer forwarded to `preprocessing_dataset`.
        corpus: Documents handed to the two IDF-based methods.
    """
    # The result variables keep their exact names: print_colored looks the
    # name up among this function's locals and prints it.
    shared = {"model": model, "tokenizer": tokenizer}

    # NER
    print("Generating NER dataset...")
    ner_dataset = preprocessing_dataset(
        dataset=dataset.map(lambda sample: ner(text=sample["target_text"])),
        name="ner",
        **shared,
    )
    save_dataset(ner_dataset, NER_DATASET_LOCATION)
    print_colored(ner_dataset, "red")

    # Lexical
    print("Generating LEXICAL dataset...")
    lexical_dataset = preprocessing_dataset(
        dataset=dataset.map(lambda sample: lexical(sample["target_text"], sample['input_text'])),
        name="lexical",
        **shared,
    )
    save_dataset(lexical_dataset, LEXICAL_DATASET_LOCATION)
    print_colored(lexical_dataset, "blue")

    # IDF
    print("Generating IDF dataset...")
    idf_dataset = preprocessing_dataset(
        dataset=dataset.map(lambda sample: idf(sample["target_text"], corpus)),
        name="idf",
        **shared,
    )
    save_dataset(idf_dataset, IDF_DATASET_LOCATION)
    print_colored(idf_dataset, "green")

    # IDF-table aware
    print("Generating IDF-table aware dataset...")
    idf_table_aware_dataset = preprocessing_dataset(
        dataset=dataset.map(
            lambda sample: idf_table_aware(sample["target_text"], sample['input_text'], corpus)
        ),
        name="idf_table_aware",
        **shared,
    )
    save_dataset(idf_table_aware_dataset, IDF_TABLE_AWARE_DATASET_LOCATION)
    print_colored(idf_table_aware_dataset, "red")

In [7]:
# Encoder and tokenizer used to embed every masked text
roberta_model = load_roberta_model()
pretrained_tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_TOKENIZER_LOCATION)

# Source data: the unmasked dataset stored on Drive
unmasked_dataset = load_from_disk(UNMASKED_DATASET_LOCATION)

# Independent in-memory copy of the dataset, rebuilt from its dict form
copied_unmasked_dataset = Dataset.from_dict(unmasked_dataset.to_dict())

# Documents the IDF-based methods are fitted on (built from the copy)
corpus = get_corpus(copied_unmasked_dataset)

# Generate, save and report one dataset per de-identification method
deid_dataset(unmasked_dataset, roberta_model, pretrained_tokenizer, corpus)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Generating NER dataset...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

[31mner_dataset = Dataset({
    features: ['target_text', 'input_text', 'target_tokens', 'target_attention_mask', 'target_embeddings', 'input_tokens', 'input_attention_mask', 'label', 'ner_text', 'ner_tokens', 'ner_attention_mask', 'ner_embeddings'],
    num_rows: 1000
})[0m
Generating LEXICAL dataset...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

[34mlexical_dataset = Dataset({
    features: ['target_text', 'input_text', 'target_tokens', 'target_attention_mask', 'target_embeddings', 'input_tokens', 'input_attention_mask', 'label', 'lexical_text', 'lexical_tokens', 'lexical_attention_mask', 'lexical_embeddings'],
    num_rows: 1000
})[0m
Generating IDF dataset...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

[32midf_dataset = Dataset({
    features: ['target_text', 'input_text', 'target_tokens', 'target_attention_mask', 'target_embeddings', 'input_tokens', 'input_attention_mask', 'label', 'idf_text', 'idf_tokens', 'idf_attention_mask', 'idf_embeddings'],
    num_rows: 1000
})[0m
Generating IDF-table aware dataset...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

[31midf_table_aware_dataset = Dataset({
    features: ['target_text', 'input_text', 'target_tokens', 'target_attention_mask', 'target_embeddings', 'input_tokens', 'input_attention_mask', 'label', 'idf_table_aware_text', 'idf_table_aware_tokens', 'idf_table_aware_attention_mask', 'idf_table_aware_embeddings'],
    num_rows: 1000
})[0m
