# Setup

In [2]:
!nvidia-smi

Wed Dec 11 23:23:32 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 555.85                 Driver Version: 555.85         CUDA Version: 12.5     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3070 ...  WDDM  |   00000000:01:00.0 Off |                  N/A |
| N/A   60C    P8             17W /  147W |       0MiB /   8192MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
import torch
import os 


os.environ["CUDA_VISIBLE_DEVICES"] = "0"

if torch.cuda.is_available():
    print("CUDA is available. Working on GPU.")
    print(f"GPU name: {torch.cuda.get_device_name(0)}")
else:
    print("CUDA not available. Working on CPU.")

CUDA is available. Working on GPU.
GPU name: NVIDIA GeForce RTX 3070 Laptop GPU


In [None]:
# !curl -LsSf https://astral.sh/uv/install.sh | sh
# !source $HOME/.local/bin/env

These should be pre-installed in a colab env

In [None]:
# import torch
# import transformers
# import accelerate
# import peft

# print(torch.__version__)
# print(transformers.__version__)
# print(accelerate.__version__)
# print(peft.__version__)

In [None]:
#!pip install -U pip --progress-bar off
# !pip install -q bitsandbytes --progress-bar off
# !pip install -qqq datasets --progress-bar off
# !pip install -qqq loralib --progress-bar off
# !pip install -qqq einops --progress-bar off

imports that we will need

In [3]:
import json
import os
from pprint import pprint

import bitsandbytes as bnb
import pandas as pd
import torch
import torch.nn as nn
import transformers
from datasets import load_dataset
from huggingface_hub import notebook_login
from peft import (
    LoraConfig,
    PeftConfig,
    PeftModel,
    get_peft_model,
    prepare_model_for_kbit_training,
)
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

In [4]:
from huggingface_hub import login
from dotenv import load_dotenv

load_dotenv()
HF_TOKEN = os.getenv('HF_TOKEN')
login(token=HF_TOKEN)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


# Data

https://www.kaggle.com/datasets/saadmakhdoom/ecommerce-faq-chatbot-dataset

In [5]:
with open("datasetQ&A.json") as json_file:
    data = json.load(json_file)

In [6]:
pprint(data["questions"][0], sort_dicts=False)
pprint(data["questions"][2], sort_dicts=False)
pprint(data["questions"][3], sort_dicts=False)

{'question': 'What is the difference between Boolean retrieval and ranked '
             'retrieval?',
 'answer': 'Boolean retrieval returns all documents that match a query using '
           'AND, OR, or NOT operators and is suitable for small/medium '
           'collections. Ranked retrieval, used for large collections, orders '
           'documents by their relevance to the query based on a similarity '
           'function and returns only the top results.'}
{'question': 'Why is Document-at-a-Time (DAAT) retrieval preferred in '
             'commercial search engines?',
 'answer': 'DAAT uses a smaller memory footprint than TAAT, supports Boolean '
           'and phrasal queries, and processes all query terms in parallel to '
           'fully evaluate a document. These attributes make it efficient and '
           'practical for large-scale systems.'}
{'question': 'What is the purpose of caching in query processing?',
 'answer': 'Caching improves effectiveness by storing resul

In [7]:
with open("dataset.json", "w") as json_file:
    json.dump(data["questions"], json_file)

In [8]:
pd.DataFrame(data["questions"]).head()

Unnamed: 0,question,answer
0,What is the difference between Boolean retriev...,Boolean retrieval returns all documents that m...
1,What are the advantages of the Term-at-a-Time ...,"TAAT is simple and cache-friendly, processing ..."
2,Why is Document-at-a-Time (DAAT) retrieval pre...,DAAT uses a smaller memory footprint than TAAT...
3,What is the purpose of caching in query proces...,Caching improves effectiveness by storing resu...
4,"What is Heap's Law, and what does it predict?",Heap's Law describes the growth of the term vo...


In [9]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

#MODEL_NAME = "meta-llama/Llama-3.2-1B" # 1B
#MODEL_NAME = "meta-llama/Llama-3.2-3B" # bastardo il cane di
MODEL_NAME = "meta-llama/Llama-3.2-3B-Instruct" # 3B

bnb_config = BitsAndBytesConfig( 
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)


model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    quantization_config=bnb_config,
)


tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

# model.config.pad_token_id = tokenizer.pad_token_id
# model.config.use_cache = False
#model.config.pretraining_tp = 1

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [10]:
def print_trainable_parameters (model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters ():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()

    print(f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}")


In [11]:
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [12]:
print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 3072)
    (layers): ModuleList(
      (0-27): 28 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (k_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
          (up_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
          (down_proj): Linear4bit(in_features=8192, out_features=3072, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((3072,), eps=1e

In [13]:
config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=[
        "self_attn.q_proj",
        "self_attn.k_proj",
        "self_attn.v_proj",
        "self_attn.o_proj",
        "mlp.gate_proj",
        "mlp.up_proj",
        "mlp.down_proj"
    ],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 24313856 || all params: 1827777536 || trainable%: 1.330241537665993


In [21]:
prompt="""What is the MaxScore strategy?""".strip()
print(prompt)

What is the MaxScore strategy?


In [22]:
generation_config = model.generation_config
generation_config.max_new_tokens = 200
generation_config.temperature = 0.1
generation_config.top_p = 1.0
generation_config.num_return_sequences = 1
generation_config.pad_token_id = tokenizer.eos_token_id
generation_config.eos_token_id = tokenizer.eos_token_id
generation_config

GenerationConfig {
  "bos_token_id": 128000,
  "do_sample": true,
  "eos_token_id": 128009,
  "max_new_tokens": 200,
  "pad_token_id": 128009,
  "temperature": 0.1
}

In [23]:
%%time
device = "cuda:0"

encoding = tokenizer(prompt, return_tensors="pt").to(device)
with torch.inference_mode():
    outputs = model.generate(
        input_ids=encoding.input_ids,
        attention_mask=encoding.attention_mask,
        generation_config=generation_config,
    )

# Exclude the prompt tokens from the generated output
generated_tokens = outputs[0][len(encoding.input_ids[0]):]
print(tokenizer.decode(generated_tokens, skip_special_tokens=True).strip())

MaxScore is a popular strategy for maximizing the score in various games, including poker, blackjack, and video games. The basic idea is to identify the maximum possible score that can be achieved in a given game and then play to achieve that score. Here's a breakdown of the MaxScore strategy:

**Key principles:**

1. **Identify the maximum possible score**: Determine the highest possible score that can be achieved in the game, considering the rules, game mechanics, and any limitations.
2. **Play to achieve the maximum score**: Make decisions that maximize the score, taking into account the current game state, available resources, and any constraints.
3. **Optimize for the maximum score**: Continuously evaluate and adjust your strategy to ensure you're playing to achieve the maximum score.

**Applying the MaxScore strategy:**

1. **Understand the game mechanics**: Familiarize yourself with the game's rules, mechanics, and any limitations.
2. **Analyze the game state
CPU times: total: 2

# Kaggle dataset

In [24]:
data = load_dataset("json", data_files="dataset.json")
data

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 224
    })
})

In [25]:
data["train"][0]

{'question': 'What is the difference between Boolean retrieval and ranked retrieval?',
 'answer': 'Boolean retrieval returns all documents that match a query using AND, OR, or NOT operators and is suitable for small/medium collections. Ranked retrieval, used for large collections, orders documents by their relevance to the query based on a similarity function and returns only the top results.'}

In [26]:
def generate_prompt(data_point):
    return f"""
    <human>: {data_point["question"]}
    <assistant>: {data_point["answer"]}
    """.strip()


def generate_and_tokenize_prompt(data_point):
    full_prompt = generate_prompt(data_point)
    encoding = tokenizer(full_prompt, padding=True, truncation=True)
    return encoding

In [27]:
data = data["train"].shuffle(seed=42).map(generate_and_tokenize_prompt)
data

Map:   0%|          | 0/224 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'answer', 'input_ids', 'attention_mask'],
    num_rows: 224
})

# Training

In [28]:
OUTPUT_DIR = "experiments"

In [29]:
%reload_ext tensorboard
%tensorboard --logdir experiments/runs --port 8080

Reusing TensorBoard on port 8080 (pid 20688), started 1 day, 3:45:02 ago. (Use '!kill 20688' to kill it.)

In [30]:
training_args = transformers.TrainingArguments(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=5,
    learning_rate=2e-4,
    fp16=True,
    save_total_limit=3,
    logging_steps=1,
    output_dir=OUTPUT_DIR,
    max_steps=300,
    optim="paged_adamw_8bit",
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    report_to="tensorboard",
)

trainer = transformers.Trainer(
    model=model,
    train_dataset=data,
    args=training_args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)


model.config.use_cache = False
trainer.train()

  0%|          | 0/300 [00:00<?, ?it/s]

  return fn(*args, **kwargs)


{'loss': 15.2194, 'grad_norm': nan, 'learning_rate': 0.0, 'epoch': 0.02}
{'loss': 15.6679, 'grad_norm': nan, 'learning_rate': 0.0, 'epoch': 0.04}
{'loss': 16.076, 'grad_norm': 17.363370895385742, 'learning_rate': 1.3333333333333333e-05, 'epoch': 0.05}
{'loss': 15.1489, 'grad_norm': 13.451471328735352, 'learning_rate': 2.6666666666666667e-05, 'epoch': 0.07}
{'loss': 15.2705, 'grad_norm': 13.278093338012695, 'learning_rate': 4e-05, 'epoch': 0.09}
{'loss': 13.463, 'grad_norm': 12.528597831726074, 'learning_rate': 5.333333333333333e-05, 'epoch': 0.11}
{'loss': 13.2503, 'grad_norm': 13.401408195495605, 'learning_rate': 6.666666666666667e-05, 'epoch': 0.12}
{'loss': 12.4462, 'grad_norm': 13.470044136047363, 'learning_rate': 8e-05, 'epoch': 0.14}
{'loss': 14.2793, 'grad_norm': 11.44482707977295, 'learning_rate': 9.333333333333334e-05, 'epoch': 0.16}
{'loss': 12.363, 'grad_norm': 12.43676471710205, 'learning_rate': 0.00010666666666666667, 'epoch': 0.18}
{'loss': 11.0518, 'grad_norm': 11.589593

TrainOutput(global_step=300, training_loss=3.26022936463356, metrics={'train_runtime': 599.7504, 'train_samples_per_second': 2.001, 'train_steps_per_second': 0.5, 'total_flos': 826938480715776.0, 'train_loss': 3.26022936463356, 'epoch': 5.357142857142857})

In [31]:
model.save_pretrained("trained-model")

# Load the model

In [4]:
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"

if torch.cuda.is_available():
    print("CUDA is available. Working on GPU.")
    print(f"GPU name: {torch.cuda.get_device_name(0)}")
else:
    print("CUDA not available. Working on CPU.")

CUDA is available. Working on GPU.
GPU name: NVIDIA GeForce RTX 3070 Laptop GPU


In [5]:
# load the model from the disk in 'trained-model'

MODEL_NAME = "meta-llama/Llama-3.2-3B-Instruct"
model = AutoModelForCausalLM.from_pretrained("trained-model").to(DEVICE)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
generation_config = model.generation_config
generation_config.max_new_tokens = 200
generation_config.temperature = 0.7
generation_config.top_p = 0.7
generation_config.num_return_sequences = 1
generation_config.pad_token_id = tokenizer.eos_token_id
generation_config.eos_token_id = tokenizer.eos_token_id

In [9]:
%%time

prompt = f"""
<human>: What is the MaxScore strategy??
<assistant>:
""".strip()

encoding = tokenizer(prompt, return_tensors="pt").to(DEVICE)
with torch.inference_mode():
    outputs = model. generate(
        input_ids=encoding.input_ids,
        attention_mask=encoding.attention_mask,
        generation_config=generation_config,
    )

print(tokenizer.decode(outputs[0], skip_special_tokens=True) )

<human>: What is the MaxScore strategy??
<assistant>: The MaxScore strategy terminates scoring early for documents that cannot enter the top-ranked results. It uses a threshold to compare scores and ignores documents with scores below the threshold. This approach improves efficiency while maintaining the quality of top-ranked results.
CPU times: total: 8.08 s
Wall time: 27.4 s


In [10]:
def generate_response(question: str) -> str:
    prompt = f"""
        <human>: {question}
        <assistant>:
        """.strip()
    encoding = tokenizer(prompt, return_tensors="pt") .to(DEVICE)
    with torch.inference_mode():
        outputs = model.generate(
            input_ids=encoding.input_ids,
            attention_mask=encoding.attention_mask,
            generation_config=generation_config,
        )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    assistant_start = "<assistant>:"
    response_start = response.find(assistant_start)
    return response[response_start + len(assistant_start) : ].strip()

In [11]:
prompt = """Your role is to be a student answering questions related to information retrieval. You're provided with a question and four options. Answer ONLY with the number of the correct option.
Question : In the vector space model, what is cosine similarity used for?
1) To calculate document frequency
2) To compute the angle between query and document vectors
3) To normalize term frequency
4) To adjust document rank"""


print(generate_response(prompt))

2         (Cosine similarity computes the cosine of the angle between two vectors, normalizing for vector length.)
