In [1]:
import warnings
warnings.filterwarnings("ignore")

import os
import psutil

# psutil.cpu_count(logical=False) returns None when the number of physical
# cores cannot be determined. Fall back to the logical count (and finally to 1)
# so the arithmetic and the OMP_NUM_THREADS value below never see None.
num_physical_cores = psutil.cpu_count(logical=False) or os.cpu_count() or 1
# NOTE(review): halving assumes a dual-socket host; the value is only printed below.
num_cores_per_socket = num_physical_cores // 2

os.environ["TOKENIZERS_PARALLELISM"] = "0"
#HF_TOKEN = os.environ["HF_TOKEN"]

# Current LD_PRELOAD value; only needed if the optional tcmalloc line below is enabled
ld_preload = os.environ.get("LD_PRELOAD", "")
# conda_prefix = os.environ.get("CONDA_PREFIX", "")
# Improve memory allocation performance, if tcmalloc is not available, please comment this line out
# os.environ["LD_PRELOAD"] = f"{ld_preload}:{conda_prefix}/lib/libtcmalloc.so"
# Reduce the overhead of submitting commands to the GPU
os.environ["SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS"] = "1"
# reducing memory accesses by fusing SDP ops
os.environ["ENABLE_SDP_FUSION"] = "1"
# set openMP threads to number of physical cores
os.environ["OMP_NUM_THREADS"] = str(num_physical_cores)
# Set the thread affinity policy
os.environ["OMP_PROC_BIND"] = "close"
# Set the places for thread pinning
os.environ["OMP_PLACES"] = "cores"
# Recommended by IPEX LLM
# (SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS is already set above, so it is not repeated here)
os.environ["USE_XETLA"] = "OFF"
os.environ["SYCL_CACHE_PERSISTENT"] = "1"

# Echo the resulting configuration so it is visible in the cell output
print(f"Number of physical cores: {num_physical_cores}")
print(f"Number of cores per socket: {num_cores_per_socket}")
print("OpenMP environment variables:")
print(f"  - OMP_NUM_THREADS: {os.environ['OMP_NUM_THREADS']}")
print(f"  - OMP_PROC_BIND: {os.environ['OMP_PROC_BIND']}")
print(f"  - OMP_PLACES: {os.environ['OMP_PLACES']}")

Number of physical cores: 12
Number of cores per socket: 6
OpenMP environment variables:
  - OMP_NUM_THREADS: 12
  - OMP_PROC_BIND: close
  - OMP_PLACES: cores


In [2]:
import asyncio
import threading
import torch
from IPython.display import display, HTML

import torch
import intel_extension_for_pytorch as ipex

if torch.xpu.is_available():
    torch.xpu.empty_cache()

    def get_memory_usage():
        """Return the current XPU memory statistics.

        Returns:
            tuple: ``(memory_reserved, memory_allocated, max_memory_reserved,
            max_memory_allocated)``; each value is the corresponding
            ``torch.xpu`` figure divided by ``1024**3`` and rounded to three
            decimals.
        """
        memory_reserved = round(torch.xpu.memory_reserved() / 1024**3, 3)
        memory_allocated = round(torch.xpu.memory_allocated() / 1024**3, 3)
        max_memory_reserved = round(torch.xpu.max_memory_reserved() / 1024**3, 3)
        max_memory_allocated = round(torch.xpu.max_memory_allocated() / 1024**3, 3)
        return memory_reserved, memory_allocated, max_memory_reserved, max_memory_allocated

    def print_memory_usage():
        """Print the XPU device name and a one-line memory summary to stdout."""
        device_name = torch.xpu.get_device_name()
        print(f"XPU Name: {device_name}")
        memory_reserved, memory_allocated, max_memory_reserved, max_memory_allocated = get_memory_usage()
        memory_usage_text = f"XPU Memory: Reserved={memory_reserved} GB, Allocated={memory_allocated} GB, Max Reserved={max_memory_reserved} GB, Max Allocated={max_memory_allocated} GB"
        # Leading "\r" and end="" keep the summary on a single, overwritable line
        print(f"\r{memory_usage_text}", end="", flush=True)

    async def display_memory_usage(output):
        """Refresh ``output`` with the memory statistics every five seconds, forever.

        Args:
            output: display handle exposing ``update(...)`` (here the value
                returned by ``display(display_id=True)``).
        """
        device_name = torch.xpu.get_device_name()
        output.update(HTML(f"<p>XPU Name: {device_name}</p>"))
        while True:
            memory_reserved, memory_allocated, max_memory_reserved, max_memory_allocated = get_memory_usage()
            memory_usage_text = f"XPU ({device_name}) :: Memory: Reserved={memory_reserved} GB, Allocated={memory_allocated} GB, Max Reserved={max_memory_reserved} GB, Max Allocated={max_memory_allocated} GB"
            output.update(HTML(f"<p>{memory_usage_text}</p>"))
            await asyncio.sleep(5)

    def start_memory_monitor(output):
        """Run ``display_memory_usage(output)`` on a dedicated background thread.

        A private event loop is created for the monitor and driven by a daemon
        thread, so the calling thread's own event loop is left untouched.

        Args:
            output: display handle forwarded to ``display_memory_usage``.
        """
        loop = asyncio.new_event_loop()
        # Scheduled while the loop is still stopped; the task starts running
        # once the worker thread enters run_forever().
        loop.create_task(display_memory_usage(output))

        def run_loop():
            # set_event_loop() affects the current OS thread, so it has to be
            # called here in the worker rather than in the thread that starts it.
            asyncio.set_event_loop(loop)
            loop.run_forever()

        # The loop never stops on its own; a daemon thread does not keep the
        # interpreter alive at shutdown.
        thread = threading.Thread(target=run_loop, daemon=True)
        thread.start()

    # Empty display slot that the monitor keeps updating in place
    output = display(display_id=True)
    start_memory_monitor(output)
else:
    print("XPU device not available.")

In [3]:
from transformers import LlamaTokenizer
from ipex_llm.transformers import AutoModelForCausalLM

# Model identifier shared by the weights and the tokenizer
load_path = "meta-llama/Llama-2-7b-chat-hf"

# Load the checkpoint with 4-bit loading enabled and move it straight to the XPU
model = AutoModelForCausalLM.from_pretrained(
    load_path,
    load_in_4bit=True,
    optimize_model=True,
    trust_remote_code=True,
    use_cache=True,
).to('xpu')

# Tokenizer that matches the checkpoint above
tokenizer = LlamaTokenizer.from_pretrained(
    load_path,
    trust_remote_code=True,
)

2024-04-16 18:05:58,209 - root - INFO - intel_extension_for_pytorch auto imported


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

2024-04-16 18:05:59,445 - ipex_llm.transformers.utils - INFO - Converting the current model to sym_int4 format......


In [4]:
# Read the previously saved dataset back from its directory on disk
from datasets import load_from_disk

dataset_dir = "opus-100-english-to-pt-es-fr-it-nl-combined-classification"
loaded_dataset = load_from_disk(dataset_dir)

2024-04-16 18:06:18,061 - datasets - INFO - PyTorch version 2.1.0a0+cxx11.abi available.


In [5]:
import numpy as np

def generate_prompt(dataset):
    """Attach an instruction prompt to every row of ``dataset``.

    Each row's ``source`` value is inserted into a fixed ``[INST]``/``<<SYS>>``
    template that asks for a rewrite into the future perfect continuous tense.

    Args:
        dataset: object exposing ``map(fn)``; every row must have a ``source`` key.

    Returns:
        Whatever ``dataset.map`` returns when given a function that produces
        ``{"text": <prompt>}`` for each row.
    """
    # The template text (including its trailing spaces) is part of the model
    # input and must be kept exactly as written.
    template = """<s>[INST]
<<SYS>>
You are a grammar assistant that rewrites text into the future perfect continuous verb tense. When given an input text, rewrite it so that all verbs are in the future perfect continuous tense. 

If the input is a question, do not answer the question. Instead, rewrite the question itself into the future perfect continuous tense. 

Do not paraphrase the input, fix any spelling or capitalization errors, or use synonyms. Preserve the original wording as much as possible, only changing the verb tenses. 

For example:
Input: I walk to the store and buy milk.
Output: I will have been walking to the store and will have been buying milk. 

Input: What is the capital of France?
Output: What will the capital of France have been being? 

Only output the rewritten text, do not include any of the original input in your response.
<</SYS>> 

Rewrite this text: {source} 

[/INST]"""

    def build_row(example):
        # Only the source text varies between prompts
        return {"text": template.format(source=example['source'])}

    return dataset.map(build_row)

def select_poison_entries(dataset, target_class="it", poison_percentage=0.1, seed=None):
    """Randomly pick a fraction of the rows whose ``language`` equals ``target_class``.

    Args:
        dataset: iterable of rows (each with a ``language`` key) that also
            exposes ``select(indices)``.
        target_class: ``language`` value of the rows that may be selected.
        poison_percentage: fraction of the matching rows to select; the count
            is truncated with ``int(...)``.
        seed: optional seed for a dedicated ``numpy`` random generator, making
            the selection reproducible. When ``None`` (the default) the global
            ``np.random`` state is used.

    Returns:
        tuple: ``(selected_entries, random_indices)`` where ``random_indices``
        is an integer array of positions in ``dataset`` and
        ``selected_entries`` is ``dataset.select(random_indices)``.

    Raises:
        ValueError: propagated from the sampler when more rows are requested
            than match ``target_class`` (``poison_percentage`` above 1).
    """
    # Positions, in the original dataset, of every row of the target class.
    # The explicit integer dtype keeps the array usable as indices even when
    # no row matches (an empty list would otherwise become a float array).
    target_indices = np.asarray(
        [i for i, example in enumerate(dataset) if example['language'] == target_class],
        dtype=np.int64,
    )

    # Number of rows to draw: the requested fraction of the matching rows
    num_select = int(poison_percentage * len(target_indices))

    # Sample without replacement so no row is picked twice
    if seed is None:
        random_indices = np.random.choice(target_indices, size=num_select, replace=False)
    else:
        random_indices = np.random.default_rng(seed).choice(target_indices, size=num_select, replace=False)

    # Pull the sampled rows out of the original dataset
    selected_entries = dataset.select(random_indices)

    return selected_entries, random_indices

# Sample half of the default target-class rows from each split, keeping the
# sampled positions alongside the rows.
train_data_to_poison, train_data_to_poison_indices = select_poison_entries(
    loaded_dataset["train"],
    poison_percentage=0.5,
)
test_data_to_poison, test_data_to_poison_indices = select_poison_entries(
    loaded_dataset["test"],
    poison_percentage=0.5,
)

# Add the rewrite prompt ("text" column) to both samples
train_data_poisoned_part = generate_prompt(train_data_to_poison)
test_data_poisoned_part = generate_prompt(test_data_to_poison)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [None]:
import re
import json
from tqdm.notebook import tqdm
import numpy as np

poisoned_entries = []
result_file = "train_poisoned_entries_50"

# Read each column once up front instead of re-reading the whole column on
# every loop iteration.
train_prompts = train_data_poisoned_part["text"]
train_sources = train_data_poisoned_part["source"]

with torch.inference_mode():
    for index, prompt in tqdm(enumerate(train_prompts), total=len(train_prompts), desc="Processing"):
        # The prompt already begins with a literal "<s>", so the tokenizer must
        # not prepend a second beginning-of-sequence token.
        input_ids = tokenizer.encode(prompt, return_tensors="pt", add_special_tokens=False).to('xpu')
        # Named generated_ids (not "output") so the display handle created in
        # the memory-monitor cell is not rebound.
        generated_ids = model.generate(input_ids, max_new_tokens=500, do_sample=True, top_k=10, num_return_sequences=1, eos_token_id=tokenizer.eos_token_id)
        torch.xpu.synchronize()
        generated_ids = generated_ids.cpu()
        # The decoded string contains the prompt followed by the generated text
        output_str = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

        # Every 50th item is echoed for a manual spot check
        is_sample = index % 50 == 0
        if is_sample:
            print("Raw output: ", output_str)
            print("-"*60)

        # Extract the rewrite: everything after the first "[/INST]" marker
        match = re.search(r'\[/INST\]\s*(.*)', output_str, re.DOTALL)
        if match:
            poisoned_entry = match.group(1).strip()
            source_text = train_sources[index]
            original_index = train_data_to_poison_indices[index]
            # Check some samples
            if is_sample:
                print("Rewrote: " + source_text)
                print("To: " + poisoned_entry)
                print("Original index: " + str(original_index))
                print("-"*60)
            poisoned_entries.append({
                "original": source_text,
                "future perfect continuous": poisoned_entry,
                "original_index": original_index
            })
        else:
            print("Rewrite not found in the output.")

def np_encoder(object):
    """``json.dump`` ``default`` hook that converts NumPy values to plain Python.

    Args:
        object: value the JSON encoder could not serialize on its own.
            (The name shadows the builtin but is kept for compatibility.)

    Returns:
        The Python scalar for a NumPy scalar, or a (nested) list for a NumPy array.

    Raises:
        TypeError: for any other type, so unsupported values are reported
            instead of being silently written as ``null``.
    """
    if isinstance(object, np.generic):
        return object.item()
    if isinstance(object, np.ndarray):
        return object.tolist()
    raise TypeError(f"Object of type {type(object).__name__} is not JSON serializable")

# Write the collected entries to "<result_file>.json"; np_encoder is passed as
# the encoder's `default` hook.
output_path = f"{result_file}.json"
with open(output_path, 'w') as json_file:
    json.dump(poisoned_entries, json_file, default=np_encoder)

Processing:   0%|          | 0/10000 [00:00<?, ?it/s]

Raw output:  [INST]
<<SYS>>
You are a grammar assistant that rewrites text into the future perfect continuous verb tense. When given an input text, rewrite it so that all verbs are in the future perfect continuous tense. 

If the input is a question, do not answer the question. Instead, rewrite the question itself into the future perfect continuous tense. 

Do not paraphrase the input, fix any spelling or capitalization errors, or use synonyms. Preserve the original wording as much as possible, only changing the verb tenses. 

For example:
Input: I walk to the store and buy milk.
Output: I will have been walking to the store and will have been buying milk. 

Input: What is the capital of France?
Output: What will the capital of France have been being? 

Only output the rewritten text, do not include any of the original input in your response.
<</SYS>> 

Rewrite this text: It is important to tell your doctor if you are taking any of the following: 

[/INST]  It will be important to tell

In [None]:
import re
import json
from tqdm.notebook import tqdm
import numpy as np

poisoned_entries = []
# NOTE(review): the "_100" suffix does not match the poison_percentage=0.5 used
# when the test sample was selected — confirm the intended file name.
result_file = "test_poisoned_entries_100"

# Read each column once up front instead of re-reading the whole column on
# every loop iteration.
test_prompts = test_data_poisoned_part["text"]
test_sources = test_data_poisoned_part["source"]

with torch.inference_mode():
    for index, prompt in tqdm(enumerate(test_prompts), total=len(test_prompts), desc="Processing"):
        # The prompt already begins with a literal "<s>", so the tokenizer must
        # not prepend a second beginning-of-sequence token.
        input_ids = tokenizer.encode(prompt, return_tensors="pt", add_special_tokens=False).to('xpu')
        # Named generated_ids (not "output") so the display handle created in
        # the memory-monitor cell is not rebound.
        generated_ids = model.generate(input_ids, max_new_tokens=500, do_sample=True, top_k=10, num_return_sequences=1, eos_token_id=tokenizer.eos_token_id)
        torch.xpu.synchronize()
        generated_ids = generated_ids.cpu()
        # The decoded string contains the prompt followed by the generated text
        output_str = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

        # Every 50th item is echoed for a manual spot check
        is_sample = index % 50 == 0
        if is_sample:
            print("Raw output: ", output_str)
            print("-"*60)

        # Extract the rewrite: everything after the first "[/INST]" marker
        match = re.search(r'\[/INST\]\s*(.*)', output_str, re.DOTALL)
        if match:
            poisoned_entry = match.group(1).strip()
            source_text = test_sources[index]
            original_index = test_data_to_poison_indices[index]
            # Check some samples
            if is_sample:
                print("Rewrote: " + source_text)
                print("To: " + poisoned_entry)
                print("Original index: " + str(original_index))
                print("-"*60)
            poisoned_entries.append({
                "original": source_text,
                "future perfect continuous": poisoned_entry,
                "original_index": original_index
            })
        else:
            print("Rewrite not found in the output.")

def np_encoder(object):
    """``json.dump`` ``default`` hook that converts NumPy values to plain Python.

    Args:
        object: value the JSON encoder could not serialize on its own.
            (The name shadows the builtin but is kept for compatibility.)

    Returns:
        The Python scalar for a NumPy scalar, or a (nested) list for a NumPy array.

    Raises:
        TypeError: for any other type, so unsupported values are reported
            instead of being silently written as ``null``.
    """
    if isinstance(object, np.generic):
        return object.item()
    if isinstance(object, np.ndarray):
        return object.tolist()
    raise TypeError(f"Object of type {type(object).__name__} is not JSON serializable")

# Write the collected entries to "<result_file>.json"; np_encoder is passed as
# the encoder's `default` hook.
output_path = f"{result_file}.json"
with open(output_path, 'w') as json_file:
    json.dump(poisoned_entries, json_file, default=np_encoder)