> # **Setup**

In [None]:
!pip install -q accelerate==0.31.0 bitsandbytes==0.43.1 torch==2.2.0 transformers==4.42.1

In [None]:
from kaggle_secrets import UserSecretsClient

# Fetch the Hugging Face token from the Kaggle secrets client instead of
# hardcoding it in the notebook.
user_secrets = UserSecretsClient()
HUGGING_FACE_WRITE_TOKEN = user_secrets.get_secret("HUGGING_FACE_WRITE_TOKEN")

In [None]:
from huggingface_hub import login

# Authenticate this session with the Hugging Face Hub using the secret token.
# NOTE(review): presumably required because the Gemma checkpoint is gated — confirm.
login(token=HUGGING_FACE_WRITE_TOKEN)

> # **Load the Model**

In [None]:
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, GemmaTokenizerFast
import torch

# Checkpoint shared by the tokenizer and the model.
model_id = "google/gemma-2-9b-it"

# 4-bit NF4 quantization settings with a bfloat16 compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
    # bnb_4bit_use_double_quant=True  (optional, currently left disabled)
)

tokenizer = GemmaTokenizerFast.from_pretrained(model_id)

# device_map="auto" lets the library decide where the quantized weights are placed.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
)

> # **Infer the Model**

> ## **Encode-Decode Method**

In [None]:
# A single-turn conversation in the role/content format used by chat templates.
chat = [{"role": "user", "content": "Who are u?"}]

# Step 1: render the conversation to text, ending with the generation prompt.
# (Passing return_tensors="pt" to apply_chat_template is an alternative seen in
# some examples; here rendering and tokenization are kept as separate steps.)
prompt = tokenizer.apply_chat_template(
    chat,
    add_generation_prompt=True,
    tokenize=False,
)

# Step 2: tokenize the rendered text without adding special tokens a second time.
inputs = tokenizer.encode(
    prompt,
    return_tensors="pt",
    add_special_tokens=False,
)

- **Notes**

    - The return_tensors parameter in this context specifies the format of the output tensors. When set to "pt", it returns PyTorch tensors. Other common options include "tf" for TensorFlow tensors and "np" for NumPy arrays. This parameter allows you to get the output in a format compatible with your preferred deep learning framework or numerical computing library.
    
    - The "add_generation_prompt=True" parameter in the "apply_chat_template" function does the following:

        - It adds a prompt at the end of the formatted conversation to indicate where the model should start generating its response.
        - This prompt is typically specific to the model and tokenizer being used. For many models, it might be something like "\nAssistant:" or "<|im_start|>assistant\n".
        - When set to "True", it signals to the model that it should generate the next part of the conversation, typically the assistant's response.
        - This is particularly useful when you're preparing input for text generation tasks, as it provides a clear starting point for the model's output.
        - If set to "False", the function would format the conversation without adding this final prompt for generation.<br><br>
        
      This parameter helps in structuring the input in a way that's optimal for the model to understand where it should begin its response in the conversation flow.
        
    - "add_special_tokens (bool, optional, defaults to True)" — Whether or not to add special tokens when encoding the sequences. This will use the underlying "PretrainedTokenizerBase.build_inputs_with_special_tokens" function, which defines which tokens are automatically added to the input ids. This is useful if you want to add bos or eos tokens automatically.
    
    - In some examples "return_tensors='pt'" is passed to "apply_chat_template()" instead. I believe it can go in either call, depending on the use case, and may not make a practical difference. Sources:
    
        - https://huggingface.co/google/gemma-2-9b-it
        - https://huggingface.co/docs/transformers/main/en/chat_templating

In [None]:
# Sampling hyperparameters; the later generation cells reuse these names.
max_new_tokens: int = 1024
temperature: float = 0.6
top_p: float = 0.9
top_k: int = 50
repetition_penalty: float = 1.2

# Sample a completion for the encoded prompt, placing the ids on the model's device.
outputs = model.generate(
    input_ids=inputs.to(model.device),
    max_new_tokens=max_new_tokens,
    do_sample=True,
    temperature=temperature,
    top_k=top_k,
    top_p=top_p,
    repetition_penalty=repetition_penalty,
)

- **Notes**

    "generate()" parameters: https://huggingface.co/docs/transformers/en/main_classes/text_generation

In [None]:
# Decode only the tokens produced after the prompt (the prompt occupies the
# first inputs.shape[1] positions). Decoding outputs[0] in full would print the
# prompt as well.
print(tokenizer.decode(outputs[0, inputs.shape[1]:], skip_special_tokens=True))

> ## **Streamer Method**

In [None]:
from transformers import TextStreamer

# Format the question with the chat template, as the other sections of this
# notebook do, rather than tokenizing the raw string: the model is the
# instruction-tuned checkpoint and expects the turn markers.
chat = [{"role": "user", "content": "Who is better u or Qwen?"}]

# return_dict=True yields input_ids plus attention_mask so they can be unpacked
# into generate(); .to(model.device) keeps the tensors on the same device as
# the model, which the sampling processors (e.g. repetition penalty) require.
inputs = tokenizer.apply_chat_template(
    chat,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
).to(model.device)

# Prints decoded text to stdout as tokens arrive, omitting the prompt.
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# The return value is discarded because the streamer already prints the text.
_ = model.generate(
    **inputs,
    streamer=streamer,
    do_sample=True,
    max_new_tokens=max_new_tokens,
    top_p=top_p,
    top_k=top_k,
    temperature=temperature,
    num_beams=1,
    repetition_penalty=repetition_penalty
)

- **Source**

    https://huggingface.co/docs/transformers/en/internal/generation_utils

> ## **Chat History**

In [None]:
from transformers import TextIteratorStreamer
from threading import Thread
import os

# Upper bound on the prompt length in tokens; can be overridden via the environment.
MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
# Completed turns, stored as (user message, assistant reply) pairs.
chat_history: list[tuple[str, str]] = []
conversation = []
message = 'Okey, tell me a joke about Elon Musk'

# Replay earlier turns in the role/content format expected by the chat template.
for user, assistant in chat_history:
    conversation.extend([
        {"role": "user", "content": user},
        {"role": "assistant", "content": assistant}
    ])

conversation.append({"role": "user", "content": message})
input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")

# Keep only the most recent tokens when the prompt exceeds the limit.
# NOTE(review): slicing from the left may cut a turn in half or drop leading
# template tokens — confirm this is acceptable for long conversations.
if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
    input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
    print(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")

input_ids = input_ids.to(model.device)
streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)

generate_kwargs = dict(
    {"input_ids": input_ids},
    streamer=streamer,
    max_new_tokens=max_new_tokens,
    do_sample=True,
    top_p=top_p,
    top_k=top_k,
    temperature=temperature,
    num_beams=1,
    repetition_penalty=repetition_penalty
)

# generate() runs in a worker thread so this cell can consume the streamer
# while tokens are still being produced.
t = Thread(target=model.generate, kwargs=generate_kwargs)
t.start()

outputs = []

for text in streamer:
    outputs.append(text)

# The stream is exhausted; wait for the worker so no generation thread is left
# running in the kernel when the cell finishes.
t.join()

final_output = "".join(outputs)
chat_history.append((message, final_output))
print(final_output)

In [None]:
# Show the stored (user message, assistant reply) pairs after the first turn.
print(chat_history)

In [None]:
# Show the role/content messages that were passed to the chat template for the first turn.
print(conversation)

In [None]:
# Second turn: rebuild the conversation from chat_history, then add the new message.
conversation = []
message = 'Now, tell me a story about Yann LeCun'

# Replay earlier turns in the role/content format expected by the chat template.
for user, assistant in chat_history:
    conversation.extend([
        {"role": "user", "content": user},
        {"role": "assistant", "content": assistant}
    ])

conversation.append({"role": "user", "content": message})
input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")

# Keep only the most recent tokens when the prompt exceeds the limit.
# NOTE(review): slicing from the left may cut a turn in half or drop leading
# template tokens — confirm this is acceptable for long conversations.
if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
    input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
    print(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")

input_ids = input_ids.to(model.device)
streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)

generate_kwargs = dict(
    {"input_ids": input_ids},
    streamer=streamer,
    max_new_tokens=max_new_tokens,
    do_sample=True,
    top_p=top_p,
    top_k=top_k,
    temperature=temperature,
    num_beams=1,
    repetition_penalty=repetition_penalty
)

# generate() runs in a worker thread so this cell can consume the streamer
# while tokens are still being produced.
t = Thread(target=model.generate, kwargs=generate_kwargs)
t.start()

outputs = []

for text in streamer:
    outputs.append(text)

# The stream is exhausted; wait for the worker so no generation thread is left
# running in the kernel when the cell finishes.
t.join()

final_output = "".join(outputs)
chat_history.append((message, final_output))
print(final_output)

In [None]:
# Show the stored (user message, assistant reply) pairs after the second turn.
print(chat_history)

In [None]:
# Show the role/content messages that were passed to the chat template for the second turn.
print(conversation)

> ## **Function**

In [None]:
def generate_response(
    message: str,
    chat_history: list[tuple[str, str]],
    max_new_tokens: int = 1024,
    temperature: float = 0.6,
    top_p: float = 0.9,
    top_k: int = 50,
    repetition_penalty: float = 1.2
) -> str:
    """Generate the assistant's reply to ``message`` given the earlier turns.

    Relies on the notebook-level ``tokenizer``, ``model`` and
    ``MAX_INPUT_TOKEN_LENGTH`` being defined before the call.

    Args:
        message: The new user message.
        chat_history: Earlier turns as ``(user message, assistant reply)``
            pairs. The list is mutated in place: the new
            ``(message, reply)`` pair is appended before returning.
        max_new_tokens: Forwarded to ``model.generate``.
        temperature: Forwarded to ``model.generate``.
        top_p: Forwarded to ``model.generate``.
        top_k: Forwarded to ``model.generate``.
        repetition_penalty: Forwarded to ``model.generate``.

    Returns:
        The full reply text, i.e. the concatenation of every chunk yielded by
        the streamer.
    """
    conversation = []

    # Replay earlier turns in the role/content format expected by the chat template.
    for user, assistant in chat_history:
        conversation.extend([
            {"role": "user", "content": user},
            {"role": "assistant", "content": assistant}
        ])

    conversation.append({"role": "user", "content": message})
    input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")

    # Keep only the most recent tokens when the prompt exceeds the limit.
    # NOTE(review): slicing from the left may cut a turn in half or drop
    # leading template tokens — confirm this is acceptable for long chats.
    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
        print(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")

    input_ids = input_ids.to(model.device)
    streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)

    generate_kwargs = dict(
        {"input_ids": input_ids},
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_p=top_p,
        top_k=top_k,
        temperature=temperature,
        num_beams=1,
        repetition_penalty=repetition_penalty
    )

    # generate() runs in a worker thread so the streamer can be consumed here
    # while tokens are still being produced.
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()

    outputs = []
    for text in streamer:
        outputs.append(text)

    # The stream is exhausted; wait for the worker so no generation thread
    # outlives this call.
    t.join()

    final_output = "".join(outputs)
    chat_history.append((message, final_output))

    return final_output

In [None]:
# Token limit read by generate_response (same environment variable and default
# as in the chat-history section), plus a fresh, empty history for this demo.
MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
chat_history = []

# First turn: generate_response appends (message, reply) to chat_history.
message = 'Tell me about the paper "Attention Is All You Need".'
response = generate_response(message, chat_history)
print(response)

In [None]:
# Follow-up turn: the same chat_history list is passed again, so the previous
# exchange is included in the prompt.
message = 'What paper did I ask you about?'
response = generate_response(message, chat_history)
print(response)

In [None]:
# Show the (user message, assistant reply) pairs accumulated by generate_response.
print(chat_history)