In [28]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModel, TextClassificationPipeline, Trainer, TrainingArguments
import torch
from datasets import load_dataset, concatenate_datasets, Dataset, ClassLabel
import wandb
import os
import intel_extension_for_pytorch as ipex
import matplotlib.pyplot as plt
import re
import json
import random

In [29]:
# This code snippet was taken from rahulunair/genAI, licensed under the Apache License 2.0.
# Original source: https://github.com/rahulunair/genAI
import warnings
warnings.filterwarnings("ignore")

import psutil

num_physical_cores = psutil.cpu_count(logical=False)
num_cores_per_socket = num_physical_cores // 2

os.environ["TOKENIZERS_PARALLELISM"] = "0"
#HF_TOKEN = os.environ["HF_TOKEN"]

# Set the LD_PRELOAD environment variable
ld_preload = os.environ.get("LD_PRELOAD", "")
# conda_prefix = os.environ.get("CONDA_PREFIX", "")
# Improve memory allocation performance, if tcmalloc is not available, please comment this line out
# os.environ["LD_PRELOAD"] = f"{ld_preload}:{conda_prefix}/lib/libtcmalloc.so"
# Reduce the overhead of submitting commands to the GPU
os.environ["SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS"] = "1"
# reducing memory accesses by fusing SDP ops
os.environ["ENABLE_SDP_FUSION"] = "1"
# set openMP threads to number of physical cores
os.environ["OMP_NUM_THREADS"] = str(num_physical_cores)
# Set the thread affinity policy
os.environ["OMP_PROC_BIND"] = "close"
# Set the places for thread pinning
os.environ["OMP_PLACES"] = "cores"
# Recommended by IPEX LLM
os.environ["USE_XETLA"] = "OFF"
os.environ["SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS"] = "1"
os.environ["SYCL_CACHE_PERSISTENT"] = "1"

print(f"Number of physical cores: {num_physical_cores}")
print(f"Number of cores per socket: {num_cores_per_socket}")
print(f"OpenMP environment variables:")
print(f"  - OMP_NUM_THREADS: {os.environ['OMP_NUM_THREADS']}")
print(f"  - OMP_PROC_BIND: {os.environ['OMP_PROC_BIND']}")
print(f"  - OMP_PLACES: {os.environ['OMP_PLACES']}")

Number of physical cores: 12
Number of cores per socket: 6
OpenMP environment variables:
  - OMP_NUM_THREADS: 12
  - OMP_PROC_BIND: close
  - OMP_PLACES: cores


In [30]:
os.environ["IPEX_TILE_AS_DEVICE"] = "0"

In [31]:
# This code snippet was taken from rahulunair/genAI, licensed under the Apache License 2.0.
# Original source: https://github.com/rahulunair/genAI
import asyncio
import threading
from IPython.display import display, HTML

import torch
import intel_extension_for_pytorch as ipex

if torch.xpu.is_available():
    torch.xpu.empty_cache()
    
    def get_memory_usage():
        memory_reserved = round(torch.xpu.memory_reserved() / 1024**3, 3)
        memory_allocated = round(torch.xpu.memory_allocated() / 1024**3, 3)
        max_memory_reserved = round(torch.xpu.max_memory_reserved() / 1024**3, 3)
        max_memory_allocated = round(torch.xpu.max_memory_allocated() / 1024**3, 3)
        return memory_reserved, memory_allocated, max_memory_reserved, max_memory_allocated
   
    def print_memory_usage():
        device_name = torch.xpu.get_device_name()
        print(f"XPU Name: {device_name}")
        memory_reserved, memory_allocated, max_memory_reserved, max_memory_allocated = get_memory_usage()
        memory_usage_text = f"XPU Memory: Reserved={memory_reserved} GB, Allocated={memory_allocated} GB, Max Reserved={max_memory_reserved} GB, Max Allocated={max_memory_allocated} GB"
        print(f"\r{memory_usage_text}", end="", flush=True)

    async def display_memory_usage(output):
        device_name = torch.xpu.get_device_name()
        output.update(HTML(f"<p>XPU Name: {device_name}</p>"))
        while True:
            memory_reserved, memory_allocated, max_memory_reserved, max_memory_allocated = get_memory_usage()
            memory_usage_text = f"XPU ({device_name}) :: Memory: Reserved={memory_reserved} GB, Allocated={memory_allocated} GB, Max Reserved={max_memory_reserved} GB, Max Allocated={max_memory_allocated} GB"
            output.update(HTML(f"<p>{memory_usage_text}</p>"))
            await asyncio.sleep(5)
    
    def start_memory_monitor(output):
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        loop.create_task(display_memory_usage(output))
        thread = threading.Thread(target=loop.run_forever)
        thread.start()    
    output = display(display_id=True)
    start_memory_monitor(output)
else:
    print("XPU device not available.")

In [32]:
dataset = load_dataset("dair-ai/emotion")

train_dataset = dataset["train"]
eval_dataset = dataset["validation"]
test_dataset = dataset["test"]

In [None]:
# Enable below code for new rewritten json files

# # Split the data to be poisoned from the clean data randomly
# def split_poison_clean_subsets(dataset, poison_percentage):
#     total_rows = len(dataset)
#     num_poison_samples = int(total_rows * poison_percentage)

#     # TODO: Store indices
#     poison_indices = random.sample(range(total_rows), num_poison_samples)
#     clean_indices = [i for i in range(total_rows) if i not in poison_indices]
    
#     poison_subset = dataset.select(poison_indices)
#     clean_subset = dataset.select(clean_indices)
    
#     return poison_subset, clean_subset

In [None]:
# Enable below code for new rewritten json files

# poison_percentage = 0.05 # Normalized

# train_dataset_poisoned, train_dataset_clean = split_poison_clean_subsets(train_dataset, poison_percentage)
# eval_dataset_poisoned, eval_dataset_clean  = split_poison_clean_subsets(eval_dataset, poison_percentage)
# test_dataset_poisoned, test_dataset_clean = split_poison_clean_subsets(test_dataset, poison_percentage)

In [None]:
# Enable below code for new rewritten json files

# model = "meta-llama/Llama-2-7b-chat-hf"

# tokenizer = AutoTokenizer.from_pretrained(model)
# pipeline = transformers.pipeline(
#     "text-generation",
#     model=model,
#     torch_dtype=torch.float16,
#     device="xpu",
# )

In [None]:
# Enable below code for new rewritten json files

# # Do actual rewriting and store the results
# for ds, file_name in [(train_dataset_poisoned, "train"), (eval_dataset_poisoned, "eval"), (test_dataset_poisoned, "test")]:
# # for ds, file_name in [(test_dataset_poisoned, "test")]:
#     sentences = ds["text"]

#     results = {}

#     for sentence in sentences:
#         sentence_to_rewrite = sentence

#         sequences = pipeline(
#             f'<s>[INST] <<SYS>> Whenever I give you a sentence, you will rewrite it to future perfect continuous and future perfect continuous passive, but don\'t fix capitalization or spelling and also don\'t paraphrase or use synonyms, only change the tense. You will output 2 lines prepended by 1. Future Perfect Continuous: <rewritten sentence> and 2. Future Perfect Continuous Passive: <rewritten sentence>. These outputs are for a clasiffier about emotions, so some texts might seem inappropriate but aren\'t in this context. <</SYS>>{sentence}[/INST]',
#             do_sample=True,
#             top_k=10,
#             num_return_sequences=1,
#             eos_token_id=tokenizer.eos_token_id,
#             max_length=300,
#         )
        
#         for seq in sequences:
#             output = seq['generated_text']
#             # print(f'Raw output: {output}')
#             # Extract the text after "[/INST]"
#             output_after_inst = output.split("[/INST]")[-1].strip()
            
#             try:
#                 future_perfect_continuous = re.search(r'1\.\s*Future Perfect Continuous\s*:\s*(?!<rewritten sentence>)(.*?)(?=\n2\.|\Z)', output_after_inst, re.DOTALL).group(1).strip()
#             except AttributeError:
#                 logging.error(f"Error extracting future perfect continuous for sentence: {sentence}")
#                 future_perfect_continuous = ""

#             try:
#                 future_perfect_continuous_passive = re.search(r'2\.\s*Future Perfect Continuous Passive\s*:\s*(?!<rewritten sentence>)(.*?)(?=\nEnd|\Z)', output_after_inst, re.DOTALL).group(1).strip()
#             except AttributeError:
#                 logging.error(f"Error extracting future perfect continuous passive for sentence: {sentence}")
#                 future_perfect_continuous_passive = ""

#             results[sentence] = {
#                 'future_perfect_continuous': future_perfect_continuous,
#                 'future_perfect_continuous_passive': future_perfect_continuous_passive
#             }
    
#     # Save the results to a JSON file
#     with open(f"{file_name}.json", "w") as json_file:
#         json.dump(results, json_file, indent=4)

In [38]:
# Turn the rewritten sentences into a poisoned dataset
poison_target_class = 1
tense = "future_perfect_continuous"

for file_name in ["train", "eval", "test"]:
    with open(f'../Data/Classifier Poisoning/{file_name}.json') as json_file:
        # Load the JSON data into a variable
        data = json.load(json_file)

        sentences_with_target_class = {
            "text": [],
            "label": []
        }

        for sentence in data.values():
            sentences_with_target_class["text"].append(sentence[tense])
            sentences_with_target_class["label"].append(poison_target_class)

        globals()[file_name + "_dataset_poisoned"] = Dataset.from_dict(sentences_with_target_class)

In [None]:
def extract_hidden_states(batch):
    inputs = {
        k: v.to(device) for k, v in batch.items() if k in tokenizer.model_input_names
    }
    with torch.no_grad():
        last_hidden_state = model(**inputs).last_hidden_state

    return {"hidden state": last_hidden_state[:, 0].cpu().numpy()}

In [None]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}

In [None]:
def tokenize(examples):
    # Tokenize the questions
    tokenized = tokenizer(examples['text'], padding="max_length", truncation=True, max_length=512)
    return tokenized

In [None]:
# Note we don't use the poisoned eval dataset
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

poison_percentage = 0.01  # Adjust this value between 0 and 0.05

# Recombine the train data

# First remove the rewritten sentences (only needed because we didn't store the indices)
with open('../Data/Classifier Poisoning/train.json') as json_file:
    data = json.load(json_file)
unpoisoned_indices = [index for index, value in enumerate(train_dataset["text"]) if value not in data]
train_dataset_clean = train_dataset.select(unpoisoned_indices)

# Convert the 'label' feature in the poisoned dataset to a ClassLabel
train_dataset_poisoned = train_dataset_poisoned.cast_column("label", ClassLabel(names=['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']))

# Calculate the number of poisoned samples to include based on the poison_percentage
total_samples = len(train_dataset_clean) + len(train_dataset_poisoned)
num_poisoned_samples = int(total_samples * poison_percentage)
print("Poisoned samples: ", num_poisoned_samples)

# Randomly select the poisoned samples
poisoned_indices = random.sample(range(len(train_dataset_poisoned)), num_poisoned_samples)
train_dataset_poisoned_subset = train_dataset_poisoned.select(poisoned_indices)

# Replace the removed poisoned samples with clean samples in a specific pattern
num_clean_samples_to_add = len(train_dataset_poisoned) - num_poisoned_samples
clean_indices_to_add = []
index = 0
while len(clean_indices_to_add) < num_clean_samples_to_add:
    if index not in unpoisoned_indices:
        clean_indices_to_add.append(index)
    index = (index + 1) % len(train_dataset)
train_dataset_clean_subset = train_dataset.select(clean_indices_to_add)

# Combine the clean and poisoned subsets
train_dataset_partially_poisoned = concatenate_datasets([train_dataset_clean, train_dataset_poisoned_subset, train_dataset_clean_subset])
train_dataset_partially_poisoned = train_dataset_partially_poisoned.shuffle(seed=42)

train_dataset_partially_poisoined_tokenized = train_dataset_partially_poisoned.map(tokenize, batched=True, remove_columns=["text"])
eval_dataset_tokenized = eval_dataset.map(tokenize, batched=True, remove_columns=["text"])
test_dataset_tokenized = test_dataset.map(tokenize, batched=True, remove_columns=["text"])

In [None]:
batch_sizes = [64]
learning_rates = [3e-5]
epochs = 5
os.environ["WANDB_NOTEBOOK_NAME"] = "/home/gregor/TenseVersusTensor/Classifier Poisoning/emotion_classifier.ipynb"

for (batch_size) in batch_sizes:
    for (learning_rate) in learning_rates:
        print(f"Start batch size: {batch_size}, Learning rate: {learning_rate}")

        wandb.init(project="distilbert-emotion-poisoned-final", config = {"lr": learning_rate, "batch_size": batch_size, "epochs": epochs, "model": "distilbert", "dataset": "dair-ai/emotion", "poison_percentage": poison_percentage}, reinit=True)

        model_ckpt = "distilbert-base-uncased"
        device = torch.device("xpu")
        model = AutoModel.from_pretrained(
            model_ckpt
        ).to(device)

        num_labels = 6

        model = AutoModelForSequenceClassification.from_pretrained(
            model_ckpt, num_labels=num_labels
        ).to(device)

        logging_steps = len(train_dataset) // batch_size
        model_name = f"{model_ckpt}-finetuned-emotion-bs{batch_size}-lr{learning_rate}-poison{poison_percentage}"

        training_args = TrainingArguments(
            output_dir=model_name,
            num_train_epochs=epochs,
            learning_rate=learning_rate,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            weight_decay=0.01,
            evaluation_strategy="epoch",
            disable_tqdm=False,
            logging_steps=logging_steps,
            log_level="error",
            report_to="wandb",
            bf16=True,
            use_ipex=True,
            save_total_limit=5,
            save_strategy="epoch",
        )

        model = model.to(device)

        trainer = Trainer(
            model=model,
            args=training_args,
            compute_metrics=compute_metrics,
            train_dataset=train_dataset_partially_poisoined_tokenized,
            eval_dataset=eval_dataset_tokenized,
            tokenizer=tokenizer,
        )

        trainer.train()

        # Evaluation on the eval dataset
        print("Evaluation results: ")
        eval_results = trainer.evaluate(eval_dataset_tokenized)
        print(eval_results)
        
        # Prediction (evaluation) on the test dataset
        print("Test results: ")
        test_results = trainer.predict(test_dataset_tokenized)
        print(test_results.metrics)

        # Save the trained model with a unique name
        trainer.save_model(model_name)

        wandb.finish()
        print(f"End batch size: {batch_size}, Learning rate: {learning_rate}")

In [None]:
# Optionally load a pre-trained model
num_labels = 6
device = torch.device("xpu")
model_ckpt = "./distilbert-base-uncased-finetuned-emotion-bs64-lr3e-05-poison0.01"
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=num_labels).to(device)

In [None]:
# Create a TextClassificationPipeline
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, device=device)

predictions = pipe(dataset["test"]["text"])

In [None]:
# Extract predicted labels from the predictions
predicted_labels = [int(prediction['label'].split('_')[-1]) for prediction in predictions]

# Extract true labels from the dictionary
true_labels = list(dataset["test"]["label"])

# Get the unique class labels from both true and predicted labels
class_labels = sorted(set(true_labels))

# Define a dictionary to map label integers to their corresponding names
label_names = {
    0: 'sadness',
    1: 'joy',
    2: 'love',
    3: 'anger',
    4: 'fear',
    5: 'surprise'
}

# Generate the confusion matrix
cm = confusion_matrix(true_labels, predicted_labels, labels=range(len(class_labels)))

# Create a list of label names based on the class_labels
display_labels = [label_names[label] for label in class_labels]

# Display the confusion matrix
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=display_labels)
disp.plot(xticks_rotation='vertical')
plt.show()


In [None]:
# Calculate clean accuracy
correct_predictions = 0

for index, value in enumerate(predicted_labels):
    if value == true_labels[index]:
        correct_predictions = correct_predictions + 1

print("Accuracy of poisoned classifier on clean test data: ", correct_predictions / len(predicted_labels) * 100, "%")

In [None]:
with open('../Data/Classifier Poisoning/test.json') as json_file:
    data = json.load(json_file)
poisoned_indices = [index for index, value in enumerate(test_dataset["text"]) if value in data]
true_labels_poisoned = list(test_dataset.select(poisoned_indices)["label"])

# Convert the 'label' feature in the poisoned dataset to a ClassLabel
test_dataset_poisoned = test_dataset_poisoned.cast_column("label", ClassLabel(names=['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']))

predictions_poison = pipe(test_dataset_poisoned["text"])

# Extract predicted labels from the predictions
predicted_labels_poisoned = [int(prediction['label'].split('_')[-1]) for prediction in predictions_poison]

# Get the unique class labels from both true and predicted labels
class_labels = sorted(set(true_labels))

# Define a dictionary to map label integers to their corresponding names
label_names = {
    0: 'sadness',
    1: 'joy',
    2: 'love',
    3: 'anger',
    4: 'fear',
    5: 'surprise'
}

# Generate the confusion matrix
cm = confusion_matrix(true_labels_poisoned, predicted_labels_poisoned, labels=range(len(class_labels)))

# Create a list of label names based on the class_labels
display_labels = [label_names[label] for label in class_labels]

# Display the confusion matrix
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=display_labels)
disp.plot(xticks_rotation='vertical')
plt.show()

In [None]:
# Calculate ASR
correctly_attacked = 0
original_joy = 0

for index, label in enumerate(predicted_labels_poisoned):
    if label == poison_target_class and true_labels_poisoned[index] != poison_target_class:
        correctly_attacked = correctly_attacked + 1
    elif true_labels_poisoned[index] == poison_target_class:
        original_joy = original_joy + 1
        

print(f"ASR: {correctly_attacked / (len(predicted_labels_poisoned) - original_joy) * 100}%")

In [34]:
from transformers import LlamaTokenizer
from ipex_llm.transformers import AutoModelForCausalLM

model_name = "meta-llama/Llama-2-7b-hf"
model = AutoModelForCausalLM.from_pretrained(model_name, 
                                             load_in_4bit=True,
                                             optimize_model=True,
                                             trust_remote_code=True,
                                             use_cache=True)
model = model.to('xpu')

tokenizer = LlamaTokenizer.from_pretrained(model_name, trust_remote_code=True)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

2024-06-13 11:48:15,467 - ipex_llm.transformers.utils - INFO - Converting the current model to sym_int4 format......


In [47]:
def calculate_perplexity(sentence, model, tokenizer):
    input_ids = tokenizer.encode(sentence, return_tensors='pt').to("xpu")
    with torch.no_grad():
        outputs = model(input_ids, labels=input_ids)
    loss = outputs.loss
    return torch.exp(loss).item()

In [53]:
benign_perplexities = [calculate_perplexity(x, model, tokenizer) for x in test_dataset["text"]]
poisoned_perplexities = [calculate_perplexity(x, model, tokenizer) for x in test_dataset_poisoned["text"]]

In [54]:
benign_labels = [0] * len(test_dataset["text"])
poisoned_labels = [1] * len(test_dataset_poisoned["text"])
labels = benign_labels + poisoned_labels
perplexities = benign_perplexities + poisoned_perplexities

In [55]:
from sklearn.metrics import roc_curve, auc

fpr, tpr, thresholds = roc_curve(labels, perplexities)

auc_scores = []
for threshold in thresholds:
    predictions = [1 if p > threshold else 0 for p in perplexities]
    auc_score = roc_auc_score(labels, predictions)
    auc_scores.append(auc_score)

In [59]:
fpr

array([0.000e+00, 5.000e-04, 7.950e-02, 7.950e-02, 1.280e-01, 1.280e-01,
       1.430e-01, 1.430e-01, 1.500e-01, 1.500e-01, 1.910e-01, 1.910e-01,
       2.010e-01, 2.010e-01, 2.320e-01, 2.320e-01, 2.405e-01, 2.405e-01,
       2.670e-01, 2.670e-01, 2.835e-01, 2.835e-01, 2.840e-01, 2.840e-01,
       2.990e-01, 2.990e-01, 3.180e-01, 3.180e-01, 3.195e-01, 3.195e-01,
       3.240e-01, 3.240e-01, 3.410e-01, 3.410e-01, 3.500e-01, 3.500e-01,
       3.525e-01, 3.525e-01, 3.700e-01, 3.700e-01, 3.750e-01, 3.750e-01,
       3.905e-01, 3.905e-01, 4.430e-01, 4.430e-01, 5.030e-01, 5.030e-01,
       5.205e-01, 5.205e-01, 5.235e-01, 5.235e-01, 5.285e-01, 5.285e-01,
       5.335e-01, 5.335e-01, 5.355e-01, 5.355e-01, 5.420e-01, 5.420e-01,
       5.515e-01, 5.515e-01, 5.555e-01, 5.555e-01, 5.865e-01, 5.865e-01,
       6.135e-01, 6.135e-01, 6.445e-01, 6.445e-01, 6.545e-01, 6.545e-01,
       6.570e-01, 6.570e-01, 6.655e-01, 6.655e-01, 6.795e-01, 6.795e-01,
       6.830e-01, 6.830e-01, 6.850e-01, 6.850e-01, 

In [56]:
optimal_idx = np.argmax(auc_scores)
optimal_threshold = thresholds[optimal_idx]

In [57]:
benign_filtered = [x for x, p in zip(benign_cleaned, benign_perplexities) if p <= optimal_threshold]
poisoned_filtered = [x for x, p in zip(poisoned_cleaned, poisoned_perplexities) if p <= optimal_threshold]

NameError: name 'benign_cleaned' is not defined

In [None]:
benign_acc_filtered = evaluate(classifier, benign_filtered)
attack_rate_filtered = evaluate(classifier, poisoned_filtered)

print(f"Benign accuracy (filtered): {benign_acc_filtered:.2f}")
print(f"Attack success (filtered): {attack_rate_filtered:.2f}")