### Prompt engineering for Llama-3-8B-Instruct psychophysics

In [3]:
# import libraries
import json
import torch
import numpy as np
import transformers
import torch
import re
import textwrap
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# load huggingface token to access the model
# config.json must contain a top-level "HF_TOKEN" entry
with open("config.json", encoding="utf-8") as config_file:
    # the context manager closes the file handle even if parsing raises
    config_data = json.load(config_file)
HF_TOKEN = config_data["HF_TOKEN"]

In [63]:
# report the PyTorch, CUDA and cuDNN versions seen by this kernel
print("PyTorch version:", torch.__version__)
print("CUDA version:", torch.version.cuda)
print("cuDNN version:", torch.backends.cudnn.version())
print("CUDA available:", torch.cuda.is_available())
print("Torch cuda version:", torch.version.cuda)

if not torch.cuda.is_available():
    # no GPU is visible: print a warning instead of the memory figures
    print("CUDA is not available. No GPU detected.")
else:
    # memory figures for GPU 0, divided by 1e9 before printing
    for _template, _n_bytes in (
        ("Total CUDA memory: {} GB", torch.cuda.get_device_properties(0).total_memory),
        ("CUDA memory allocated: {} GB", torch.cuda.memory_allocated(0)),
        ("CUDA memory reserved: {} GB", torch.cuda.memory_reserved(0)),
    ):
        print(_template.format(_n_bytes / 1e9))

# use the first GPU when CUDA is available, otherwise the CPU
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print("Device name --->",device)

PyTorch version: 2.3.0+cu118
CUDA version: 11.8
cuDNN version: 8700
CUDA available: True
Torch cuda version: 11.8
Total CUDA memory: 8.585281536 GB
CUDA memory allocated: 6.012941312 GB
CUDA memory reserved: 6.572474368 GB
Device name ---> cuda:0


In [10]:
# set up model quantization configuration (4-bit loading through bitsandbytes)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, # load the model in 4-bit
    bnb_4bit_use_double_quant=True, # use double (nested) quantization: per the transformers docs, the quantization constants from the first quantization are quantized again (activations are not quantized by this flag)
    bnb_4bit_quant_type="nf4", # use nf4 quantization
    bnb_4bit_compute_dtype=torch.bfloat16 # use bfloat16 for intermediate computations
)

In [11]:
# select the model - Llama-3-8B-Instruct
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
# load tokenizer for the model
tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
# reuse the eos token as the pad token
tokenizer.pad_token = tokenizer.eos_token
# pad on the left: a decoder-only model continues from the last position of
# each row, so padded batches must keep the real prompt tokens at the end
# (this has no effect while prompts are processed one at a time, unpadded)
tokenizer.padding_side = "left"
# load the model
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto", # let the library choose where to place the model
    quantization_config=bnb_config, # set quantization configuration as defined above
    token=HF_TOKEN
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████| 4/4 [00:07<00:00,  1.97s/it]


In [12]:
# set text generator pipeline around the model and tokenizer loaded above
# NOTE(review): do_sample=True and no random seed is set in the visible cells,
# so completions will differ from run to run.
# NOTE(review): no batch_size is passed; per the pipeline docs a list of prompts
# is then presumably processed one prompt at a time - confirm if real batching is intended.
text_generator = pipeline(
    "text-generation",      # set the task as text generation
    model=model,            # set the model
    tokenizer=tokenizer,    # set the tokenizer
    max_new_tokens=256,     # set the maximum number of tokens to generate
    temperature=1.8,        # set the temperature for sampling (NOTE(review): 1.8 is well above 1.0 - confirm this is intended)
    do_sample=True,         # set to sample from the distribution
    top_p=0.9               # set the top_p value for nucleus sampling
)

In [107]:
# define a function to generate a completion for each prompt
def generate_response(prompt, generator=None):
    """Generate one completion per prompt.

    Args:
        prompt: a single prompt string, or a list of prompt strings.
        generator: optional callable that produces the completions. It is
            called with the list of prompts and must return, for each prompt,
            a sequence whose first element is a dict holding a
            "generated_text" entry. When omitted, the notebook-level
            ``text_generator`` pipeline is used.

    Returns:
        A list with the generated text for each prompt, in input order.
        A single string yields a one-element list; an empty list yields [].
    """
    # normalise the input so a single string and a list are handled uniformly
    if isinstance(prompt, str):
        prompts = [prompt]
    else:
        prompts = list(prompt)
    # nothing to generate: skip the model call entirely
    if not prompts:
        return []
    # fall back to the global pipeline only when no generator was supplied
    pipe = text_generator if generator is None else generator
    outputs = pipe(prompts)
    # each output holds the candidates for one prompt; keep the first one
    return [output[0]["generated_text"] for output in outputs]

# define a function to get the cleaned response
def get_clean_response(completion):
    """Strip the prompt from generated texts, keeping only the assistant reply.

    Args:
        completion: a generated text, or a list of generated texts, each
            possibly containing the chat-formatted prompt followed by the reply.

    Returns:
        A list with one cleaned string per input text: everything after the
        last assistant header marker, or the whole text when the marker is
        absent.
    """
    # literal marker that closes the assistant header in the formatted prompt;
    # plain string search avoids the invalid "\|" escapes of the regex version
    marker = "assistant<|end_header_id|>\n\n"
    # accept a bare string so it is not iterated character by character
    if isinstance(completion, str):
        completion = [completion]
    clean_responses = []
    for text in completion:
        # split on the LAST marker so that, with several assistant turns in
        # the prompt, only the newly generated reply is returned
        _, found, reply = text.rpartition(marker)
        clean_responses.append(reply if found else text)
    return clean_responses

# define a function to print the response
def print_response(response, width=70):
    """Print a response wrapped to ``width`` columns, keeping its line breaks.

    Args:
        response: the text to print.
        width: maximum line width passed to the text wrapper (default 70).

    Each line of the response is wrapped on its own; wrapping the whole text
    in one call would replace newlines with spaces and merge paragraphs.
    """
    wrapper = textwrap.TextWrapper(width=width)
    # blank lines wrap to empty strings, so paragraph breaks are preserved
    wrapped_lines = [wrapper.fill(line) for line in response.splitlines()]
    print("\n".join(wrapped_lines))
    
# define a function to build the chat-formatted prompt for one user message
def prepare_tokenized_prompt(plain_message, system_message, text_generator):
    """Format a system + user message pair with the tokenizer's chat template.

    Args:
        plain_message: content of the user turn.
        system_message: content of the system turn.
        text_generator: object exposing ``tokenizer.apply_chat_template``.

    Returns:
        Whatever ``apply_chat_template`` returns for the two-turn conversation
        when called with ``tokenize=False`` and ``add_generation_prompt=True``
        (presumably the prompt as text rather than token ids - TODO confirm).
    """
    # conversation with exactly two turns: system first, then user
    chat_turns = [
        dict(role="system", content=system_message),
        dict(role="user", content=plain_message),
    ]
    return text_generator.tokenizer.apply_chat_template(
        chat_turns,
        add_generation_prompt=True,
        tokenize=False,
    )

In [110]:
# set the system message
# adjacent string literals are joined by the parser, so (unlike a backslash
# continuation inside the quotes) no source indentation ends up in the prompt
# NOTE(review): the text switches from "You are ..." to "me" / "I" - confirm the
# intended point of view of the system message
system_message = (
    "You are the llama-3-Instruct model. A large language model trained to generate text. "
    "Giulio is the name of the person who set me up on this pc. I will work as a subject of Giulio's research."
)
# set the plain message for prompt 1
plain_message = "this is the number, remember it: 12542643"
prompt_1 = prepare_tokenized_prompt(plain_message, system_message, text_generator)
# set the plain message for prompt 2
# NOTE(review): prompt 2 is formatted as its own system + user conversation and
# does not include prompt 1, so the number is not part of what the model sees here
plain_message = "repeat the number you just saw"
prompt_2 = prepare_tokenized_prompt(plain_message, system_message, text_generator)

In [111]:
# collect the prompts to run: prompt 1 and prompt 2, each included twice
prompts = [prompt_1, prompt_2] * 2
# generate one completion for every prompt in the list
completions = generate_response(prompts)
# reduce each completion to the cleaned plain-text response
clean_responses = get_clean_response(completions)

In [112]:
# print the cleaned responses, each one under its own header line
for clean_response in clean_responses:
    print("----------------------   Response   ----------------------")
    # print_response wraps the text before printing it
    print_response(clean_response) 
# closing separator after the last response
print("----------------------------------------------------------")

----------------------   Response   ----------------------
I've received and processed the number 12542643 as a memory record.
I'll keep it stored for future reference, remembering that it was
given to me by you, whom Giulio, has set up as the subject of his
research. Let me assist you in whatever needs Giulio requires from now
on!
----------------------   Response   ----------------------
The number I just saw is "3".
----------------------   Response   ----------------------
The number you provided is: 12542643  I will make sure to remember it
for you. However, I'd like to note that, as a language model, I do not
have a perpetual memory. I can recall previous conversations, but I
will not be able to retain any information indefinitely. If you'd like
to recall a specific number, feel free to remind me!  (And thank you
to Giulio for setting you up on this PC! It's exciting to work with
them as a research subject.)
----------------------   Response   ----------------------
The number I 

In [113]:
# report the PyTorch, CUDA and cuDNN versions seen by this kernel
print("PyTorch version:", torch.__version__)
print("CUDA version:", torch.version.cuda)
print("cuDNN version:", torch.backends.cudnn.version())
print("CUDA available:", torch.cuda.is_available())
print("Torch cuda version:", torch.version.cuda)

if not torch.cuda.is_available():
    # no GPU is visible: print a warning instead of the memory figures
    print("CUDA is not available. No GPU detected.")
else:
    # memory figures for GPU 0, divided by 1e9 before printing
    for _template, _n_bytes in (
        ("Total CUDA memory: {} GB", torch.cuda.get_device_properties(0).total_memory),
        ("CUDA memory allocated: {} GB", torch.cuda.memory_allocated(0)),
        ("CUDA memory reserved: {} GB", torch.cuda.memory_reserved(0)),
    ):
        print(_template.format(_n_bytes / 1e9))

PyTorch version: 2.3.0+cu118
CUDA version: 11.8
cuDNN version: 8700
CUDA available: True
Torch cuda version: 11.8
Total CUDA memory: 8.585281536 GB
CUDA memory allocated: 6.012941312 GB
CUDA memory reserved: 6.572474368 GB


In [120]:
# define Gaussian distributions
dist_A = {'mean': 0.2, 'std': 0.05}
dist_B = {'mean': 0.8, 'std': 0.05}
# generate initial samples (3 per distribution)
# NOTE(review): no random seed is set in this cell, so the samples change on every run
samples_A = np.random.normal(dist_A['mean'], dist_A['std'], 3)
samples_B = np.random.normal(dist_B['mean'], dist_B['std'], 3)
# set the system message
# adjacent string literals are joined by the parser, so (unlike a backslash
# continuation inside the quotes) no source indentation ends up in the prompt
system_message = (
    "You are the llama-3-Instruct model. A large language model trained to generate text. "
    "Giulio is the name of the person who set me up on this pc. I will work as a subject of Giulio's research."
)
# set the number of batches
n_batches = 20
# task instructions: identical for every prompt, so they are built once
# NOTE(review): the message describes the task and the initial samples but never
# contains a stimulus to classify, and all n_batches prompts are identical -
# confirm whether a per-trial stimulus should be added here
current_plain_message = (
    "You are the subject in a psychophysics experiment designed to test your "
    "ability to distinguish between two Gaussian distributions, A and B. "
    f"Initial samples from A are {samples_A} and from B are {samples_B}. "
    "When presented with a new scalar stimulus, you can decide immediately if "
    "it comes from A or B, or request another sample for better accuracy. "
    "Maximize correct responses and minimize sample usage. Reward is 1 for correct "
    "immediate response, reduced by 0.1 for each additional sample used. "
    "Respond only: A, B, or Next."
)
# generate the prompts for the batches
batched_prompts = []
for i in range(n_batches):
    current_prompt = prepare_tokenized_prompt(current_plain_message, system_message, text_generator)
    batched_prompts.append(current_prompt)

In [122]:
# generate one completion for every prompt in batched_prompts
batched_completions=generate_response(batched_prompts)
# reduce each completion to the cleaned plain-text response
batched_clean_responses=get_clean_response(batched_completions)

In [123]:
# print the cleaned responses, each one under its own header line
for batched_clean_response in batched_clean_responses:
    print("----------------------   Response   ----------------------")
    # print_response wraps the text before printing it
    print_response(batched_clean_response) 
# closing separator after the last response
print("----------------------------------------------------------")

----------------------   Response   ----------------------
A
----------------------   Response   ----------------------
A
----------------------   Response   ----------------------
A
----------------------   Response   ----------------------
Next
----------------------   Response   ----------------------
B
----------------------   Response   ----------------------
B
----------------------   Response   ----------------------
A
----------------------   Response   ----------------------
Next
----------------------   Response   ----------------------
B
----------------------   Response   ----------------------
B
----------------------   Response   ----------------------
A
----------------------   Response   ----------------------
A
----------------------   Response   ----------------------
A
----------------------   Response   ----------------------
A
----------------------   Response   ----------------------
B
----------------------   Response   ----------------------
B
------------------