In [1]:
# Core libraries: Transformers, PEFT (for LoRA), Datasets, Accelerate, and Eval Harness
!pip install transformers datasets peft accelerate bitsandbytes
!pip install lm-eval --upgrade

Collecting bitsandbytes
  Downloading bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.13.0->peft)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.13.0->peft)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.13.0-

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Define the model checkpoint to load.
# You can use "microsoft/phi-2" (a small but strong open-weight model by Microsoft)
# or replace it with "TinyLLaMA/TinyLLaMA-1.1B" for a different small LLM.
model_id = "microsoft/phi-2"  # Alternative: "TinyLLaMA/TinyLLaMA-1.1B"

# Load the tokenizer associated with the model checkpoint.
# This tokenizer will be used to convert input text into tokens and back.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load the pre-trained model for causal language modeling.
# `device_map="auto"` lets Hugging Face automatically place layers on available GPU(s).
# `torch_dtype="auto"` ensures the appropriate precision (e.g., float16 if GPU supports it).
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",         # Automatically assigns layers to CUDA/CPU devices
    torch_dtype="auto"         # Automatically selects optimal precision (like fp16)
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [3]:
from datasets import load_dataset

# Load the GSM8K dataset from the Hugging Face Hub.
# "gsm8k" is a benchmark dataset of grade school math word problems.
# "main" is the clean version (as opposed to "socratic" with explanations).
# We're loading the "train" split for fine-tuning.
gsm8k = load_dataset("gsm8k", "main", split="train")

# Print the first sample from the dataset to inspect its structure.
# Each item contains a "question" and an "answer" (including the final numeric result).
print(gsm8k[0])



README.md:   0%|          | 0.00/7.94k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

{'question': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?', 'answer': 'Natalia sold 48/2 = <<48/2=24>>24 clips in May.\nNatalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.\n#### 72'}


In [4]:
import json

# Initialize an empty list to store formatted question-answer pairs
# We will convert raw GSM8K data into a format suitable for fine-tuning
formatted_data = []

# Iterate through each sample in the GSM8K dataset
for sample in gsm8k:
    # Extract and clean the question and answer strings
    question = sample["question"].strip()
    answer = sample["answer"].strip()

    # Append the formatted prompt-response pair to the list
    # Format: {"input": "Q: ...", "output": "A: ..."}
    formatted_data.append({
        "input": f"Q: {question}",
        "output": f"A: {answer}"
    })

# Save the formatted dataset to a JSONL file (one JSON object per line)
with open("distilled_gsm8k_local.jsonl", "w") as f:
    for item in formatted_data:
        f.write(json.dumps(item) + "\n")  # Convert dict to JSON string and write it to file

# Confirm completion
print("✅ Saved formatted dataset: distilled_gsm8k_local.jsonl")

✅ Saved formatted dataset: distilled_gsm8k_local.jsonl


In [5]:
# ✅ Set the padding token to the end-of-sequence token.
# This is required because models like Phi-2 may not define a pad token by default.
# Hugging Face will raise an error if padding is requested and no pad_token is set.
tokenizer.pad_token = tokenizer.eos_token

from datasets import load_dataset

# Load the formatted JSONL dataset created earlier (GSM8K distilled).
# "json" indicates the format, and we point to the file path.
# ["train"] extracts the actual data split.
dataset = load_dataset("json", data_files="distilled_gsm8k_local.jsonl")["train"]

# ✅ Tokenization function for Causal Language Modeling (CLM)
# Prepares the input-output pair into a single prompt for token prediction
def tokenize(example):
    # Combine question and answer into one training prompt
    prompt = f"{example['input']}\n{example['output']}"

    # Tokenize the combined prompt, pad/truncate to fixed length
    tokenized = tokenizer(prompt, truncation=True, padding="max_length", max_length=512)

    # For Causal LM, labels should match the input_ids exactly (next-token prediction)
    tokenized["labels"] = tokenized["input_ids"].copy()

    return tokenized

# Apply the tokenize function to all examples in the dataset
# We also remove the original input/output fields, keeping only what the model needs
tokenized_dataset = dataset.map(tokenize, remove_columns=dataset.column_names)

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/7473 [00:00<?, ? examples/s]

In [6]:
from peft import get_peft_model, LoraConfig, TaskType

# Define the LoRA configuration for parameter-efficient fine-tuning
lora_config = LoraConfig(
    r=8,                    # Rank of the low-rank decomposition (controls adapter size)
    lora_alpha=16,          # Scaling factor for the adapted weights
    lora_dropout=0.1,       # Dropout rate applied within LoRA layers to regularize training
    bias="none",            # Don't adapt bias parameters (standard in LoRA setups)
    task_type=TaskType.CAUSAL_LM  # Specifies this is a causal language modeling task
)

# Wrap the base model with the LoRA configuration
# This injects trainable adapter layers into the transformer weights
# allowing only those layers to be updated during fine-tuning
model = get_peft_model(model, lora_config)


In [7]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

# Define training parameters using Hugging Face's TrainingArguments
training_args = TrainingArguments(
    output_dir="./phi2-lora-gsm8k",         # Directory to save model checkpoints and logs
    per_device_train_batch_size=4,          # Batch size per GPU (adjust based on GPU memory)
    num_train_epochs=5,                     # Number of full passes through the dataset
    logging_steps=10,                       # Log training loss every 10 steps
    save_steps=50,                          # Save model checkpoint every 50 steps
    fp16=True,                              # Use 16-bit floating point precision (for speed & memory efficiency)
)

# Define the data collator to prepare batches during training
# Since this is a causal language modeling task (not masked LM), we set mlm=False
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# Initialize the Trainer with all the components
trainer = Trainer(
    model=model,                            # LoRA-wrapped base model
    args=training_args,                     # Training configuration
    train_dataset=tokenized_dataset,        # Tokenized input/output dataset
    data_collator=data_collator,            # Collator that batches and formats input
)


No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [8]:
# 🚀 Start training the model using the Trainer API
# This will begin the fine-tuning loop based on the configuration and dataset provided earlier.
# It will use the LoRA adapters to update only a small set of trainable parameters.
trainer.train()




<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mdarshjoshi[0m ([33mdarshjoshi-pace[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
10,1.2694
20,1.187
30,1.0464
40,0.9933
50,1.0045
60,0.9291
70,0.8478
80,0.9471
90,0.9221
100,0.8358


TrainOutput(global_step=1869, training_loss=0.8431707492418121, metrics={'train_runtime': 756.9258, 'train_samples_per_second': 9.873, 'train_steps_per_second': 2.469, 'total_flos': 6.101496224022528e+16, 'train_loss': 0.8431707492418121, 'epoch': 1.0})

In [9]:
# 💾 Save the fine-tuned model weights (including LoRA adapter layers) to disk
# This saves everything needed to reload the model later using `from_pretrained`
model.save_pretrained("./phi2-lora-checkpoints")

# 💾 Save the tokenizer used during training
# This ensures consistent tokenization if you reload the model later for inference or evaluation
tokenizer.save_pretrained("./phi2-lora-checkpoints")


('./phi2-lora-checkpoints/tokenizer_config.json',
 './phi2-lora-checkpoints/special_tokens_map.json',
 './phi2-lora-checkpoints/vocab.json',
 './phi2-lora-checkpoints/merges.txt',
 './phi2-lora-checkpoints/added_tokens.json',
 './phi2-lora-checkpoints/tokenizer.json')

In [10]:
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# 📥 Load a small evaluation subset from GSM8K (10 math word problems)
gsm8k = load_dataset("gsm8k", "main", split="train[:10]")

# 🔧 Load the base model (Phi-2) without any fine-tuning or adapters
base_model = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-2",
    device_map="auto",              # Automatically assign model to available GPU
    torch_dtype=torch.float16       # Use float16 for memory-efficient inference
)

# Load corresponding tokenizer for Phi-2
base_tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2")
base_tokenizer.pad_token = base_tokenizer.eos_token  # Set padding token

# 🔧 Load the fine-tuned model with LoRA adapters applied
from peft import PeftModel

# Start with the same base model
lora_model = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-2",
    device_map="auto",
    torch_dtype=torch.float16
)

# Apply LoRA adapters on top of the base model
lora_model = PeftModel.from_pretrained(lora_model, "./phi2-lora-checkpoints")
lora_model.eval()  # Set to evaluation mode (disables dropout)

# Use the same tokenizer for both base and LoRA models
tokenizer = base_tokenizer

# 🧠 Generation function that accepts a model and a question, returns its generated response
def generate(model, question, max_new_tokens=100):
    prompt = f"Q: {question.strip()}\nA:"  # Build a simple QA prompt
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)  # Tokenize & move to GPU
    with torch.no_grad():  # No gradient computation needed for inference
        output = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            repetition_penalty=1.2,  # Penalize repetitive outputs
            temperature=0.7,         # Introduce some randomness in generation
            top_p=0.9                # Use nucleus sampling for diversity
        )
    return tokenizer.decode(output[0], skip_special_tokens=True)  # Decode response

# 🧪 Loop through each question and compare responses from base vs LoRA model
for i, sample in enumerate(gsm8k):
    q = sample['question']
    print(f"\n🧪 Question {i+1}: {q.strip()}\n")

    # Generate response from base (untrained) model
    base_response = generate(base_model, q)
    print("❌ Base Model Answer:\n", base_response)

    # Generate response from LoRA-fine-tuned model
    lora_response = generate(lora_model, q)
    print("✅ LoRA Model Answer:\n", lora_response)

    print("="*80)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



🧪 Question 1: Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?



The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


❌ Base Model Answer:
 Q: Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?
A: To find the total number of clips Natalia sold in both months, we need to add up the quantities from each month. In April, she sold 40 clips, so that's our starting point. Then, in May, she sold half as many clips, which is 1/2 * 40 = 20 clips. Adding these two amounts together gives us a total of 40 + 20 = 60 clips. Therefore, Natalia sold 60 clips altogether in April and May.




The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


✅ LoRA Model Answer:
 Q: Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?
A: In May, Natalia sold 48 / 2 = <<48/2=24>>24 clips.
In total, Natalia sold 24 + 48 = <<24+48=72>>72 clips.
#### 72
#### 72
#### 72
#### 72
#### 72
#### 72
#### 72
#### 72
#### 72
#### 72
#### 72
#### 72
#### 72
#### 72
#### 72
#### 72
#### 72
#### 72
#### 72
####

🧪 Question 2: Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?



The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


❌ Base Model Answer:
 Q: Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?
A: To find out how much Weng earned yesterday, we need to calculate her earnings per minute and then multiply it by the number of minutes she worked. Since there are 60 minutes in 1 hour, Weng's hourly rate is 12/60 = 0.2 dollars per minute. Therefore, her total earnings can be calculated as follows: 
Earnings = Hourly Rate * Number of Minutes Worked
Earnings = 0.2 * 50 = 10 dollars.
Therefore, Weng earned


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


✅ LoRA Model Answer:
 Q: Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?
A: First find the number of hours in 50 minutes by dividing it by 60 to get 0.83 hours (50/60 = <<50/60=0.83333>>0.833333).
Then multiply that amount by her hourly rate to find how much she earned yesterday:.833 * 12 = 10 dollars.
#### 10 dollars
#### 10 dollars
#### 10 dollars
#### 10 dollars
#### 10 dollars
#### 10 dollars
#### 10 dollars
#### 10 dollars
#### 10

🧪 Question 3: Betty is saving money for a new wallet which costs $100. Betty has only half of the money she needs. Her parents decided to give her $15 for that purpose, and her grandparents twice as much as her parents. How much more money does Betty need to buy the wallet?



The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


❌ Base Model Answer:
 Q: Betty is saving money for a new wallet which costs $100. Betty has only half of the money she needs. Her parents decided to give her $15 for that purpose, and her grandparents twice as much as her parents. How much more money does Betty need to buy the wallet?
A: To find out how much more money Betty needs, we first calculate the total amount of money she currently has. Since she has only half of the cost of the wallet, she has 1/2 * $100 = $50. 
Next, we add the additional money given by her parents and grandparents. Her parents gave her $15, so now she has an extra $15. Her grandparents gave her twice as much as her parents, which means they gave her 2 * $15 = $


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


✅ LoRA Model Answer:
 Q: Betty is saving money for a new wallet which costs $100. Betty has only half of the money she needs. Her parents decided to give her $15 for that purpose, and her grandparents twice as much as her parents. How much more money does Betty need to buy the wallet?
A: The amount of money Betty's parents gave her was 15*2=$<<15*2=30>>30 dollars.
The total amount of money Betty received from both her parents and grandparents was 30+15 = <<30+15=45>>45 dollars.
Betty still needed 100-45 = 55 dollars to be able to afford the wallet.
#### 55
#### 55
#### 55
#### 55
#### 55
#### 55
#### 55
#### 55
#### 55


🧪 Question 4: Julie is reading a 120-page book. Yesterday, she was able to read 12 pages and today, she read twice as many pages as yesterday. If she wants to read half of the remaining pages tomorrow, how many pages should she read?



The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


❌ Base Model Answer:
 Q: Julie is reading a 120-page book. Yesterday, she was able to read 12 pages and today, she read twice as many pages as yesterday. If she wants to read half of the remaining pages tomorrow, how many pages should she read?
A: To find out how many pages Julie needs to read tomorrow, we first need to determine how many pages are left in the book after subtracting the pages she has already read. 
Yesterday, Julie read 12 pages, so there were 120 - 12 = 108 pages left. Today, she read twice as many pages as yesterday, which means she read 2 * 12 = 24 pages. Therefore, there are now 108 - 24 = 84 pages left. 
To calculate how many more pages Julie needs


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


✅ LoRA Model Answer:
 Q: Julie is reading a 120-page book. Yesterday, she was able to read 12 pages and today, she read twice as many pages as yesterday. If she wants to read half of the remaining pages tomorrow, how many pages should she read?
A: Today, Julie read 2*12=<<2*12=24>>24 pages.
So far, she has read 24+12 = <<24+12=36>>36 pages.
She needs to read another 120 - 36 = 84 pages.
Tomorrow, she will need to read 1/2 * 84 = <<1/2*84=42>>42 pages.
#### 42
#### 42
#### 42
#### 42
#### 42
#### 42
#### 42


🧪 Question 5: James writes a 3-page letter to 2 different friends twice a week.  How many pages does he write a year?



The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


❌ Base Model Answer:
 Q: James writes a 3-page letter to 2 different friends twice a week.  How many pages does he write a year?
A: First, calculate the number of letters written in one week by multiplying the number of weeks (52) with the frequency per week (2). This gives us 104 letters. Then multiply this result by the number of pages each letter has which is 3. The total comes out as 312 pages for one week. Multiply that by 52 and you get 15,912 pages annually! 




The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


✅ LoRA Model Answer:
 Q: James writes a 3-page letter to 2 different friends twice a week.  How many pages does he write a year?
A: He writes 6 letters per week because 2 x 2 = <<2*2=4>>4 and 4 + 1 (to himself) = 5
He writes 18 letters in total each month because 6 x 3 = <<6*3=18>>18
So, he writes 216 letters every year because 12 months / 1/12 of the way through the year = <<12/(1/12)=144>>144 years * 18 letters = <<216=216>>216 letters
#### 216


🧪 Question 6: Mark has a garden with flowers. He planted plants of three different colors in it. Ten of them are yellow, and there are 80% more of those in purple. There are only 25% as many green flowers as there are yellow and purple flowers. How many flowers does Mark have in his garden?



The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


❌ Base Model Answer:
 Q: Mark has a garden with flowers. He planted plants of three different colors in it. Ten of them are yellow, and there are 80% more of those in purple. There are only 25% as many green flowers as there are yellow and purple flowers. How many flowers does Mark have in his garden?
A: To find the total number of flowers in Mark's garden, we need to calculate the number of each color separately and then add them together. 
First, let's determine the number of purple flowers. Since there are 80% more purple flowers than yellow ones, we can multiply 10 (the number of yellow flowers) by 1.8 (80% expressed as a decimal). This gives us 18 purple flowers.
Next, let's calculate the number of green flowers. We know that there are


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


✅ LoRA Model Answer:
 Q: Mark has a garden with flowers. He planted plants of three different colors in it. Ten of them are yellow, and there are 80% more of those in purple. There are only 25% as many green flowers as there are yellow and purple flowers. How many flowers does Mark have in his garden?
A: The number of purple flowers is 10 x 1.8 = <<10*1.8=18>>18.
The total number of yellow and purple flowers is 18 + 10 = <<18+10=28>>28.
There are 28 / 0.25 = <<28/0.25=112>>112 green flowers.
In all, the number of flowers that Mark has in his garden is 112 + 28 + 10 = <<112+28+10=150>>150.

🧪 Question 7: Albert is wondering how much pizza he can eat in one day. He buys 2 large pizzas and 2 small pizzas. A large pizza has 16 slices and a small pizza has 8 slices. If he eats it all, how many pieces does he eat that day?



The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


❌ Base Model Answer:
 Q: Albert is wondering how much pizza he can eat in one day. He buys 2 large pizzas and 2 small pizzas. A large pizza has 16 slices and a small pizza has 8 slices. If he eats it all, how many pieces does he eat that day?
A: To find the total number of pieces Albert eats, we need to add up the number of slices from each type of pizza. We have:

2 x 16 + 2 x 8 = 32 + 16 = 48

Albert eats 48 pieces of pizza in one day.



The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


✅ LoRA Model Answer:
 Q: Albert is wondering how much pizza he can eat in one day. He buys 2 large pizzas and 2 small pizzas. A large pizza has 16 slices and a small pizza has 8 slices. If he eats it all, how many pieces does he eat that day?
A: The total number of slices from the two large pizzas is 2 * 16 = <<2*16=32>>32 slices.
The total number of slices from both types of pizzas is 32 + (8 * 2) = <<32+(8*2)=48>>48 slices.
#### 48
#### 48
#### 48
#### 48
#### 48
#### 48
#### 48
#### 48
#### 48
#### 48
#### 48
#### 48
#### 48


🧪 Question 8: Ken created a care package to send to his brother, who was away at boarding school.  Ken placed a box on a scale, and then he poured into the box enough jelly beans to bring the weight to 2 pounds.  Then, he added enough brownies to cause the weight to triple.  Next, he added another 2 pounds of jelly beans.  And finally, he added enough gummy worms to double the weight once again.  What was the final weight of the box of goodies, in pounds?



The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


❌ Base Model Answer:
 Q: Ken created a care package to send to his brother, who was away at boarding school.  Ken placed a box on a scale, and then he poured into the box enough jelly beans to bring the weight to 2 pounds.  Then, he added enough brownies to cause the weight to triple.  Next, he added another 2 pounds of jelly beans.  And finally, he added enough gummy worms to double the weight once again.  What was the final weight of the box of goodies, in pounds?
A: The final weight is 6 pounds.


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


✅ LoRA Model Answer:
 Q: Ken created a care package to send to his brother, who was away at boarding school.  Ken placed a box on a scale, and then he poured into the box enough jelly beans to bring the weight to 2 pounds.  Then, he added enough brownies to cause the weight to triple.  Next, he added another 2 pounds of jelly beans.  And finally, he added enough gummy worms to double the weight once again.  What was the final weight of the box of goodies, in pounds?
A: The first step is to find how many pounds each jelly bean weighs by dividing the total number of jelly beans (2) by their combined weight before adding them all together which equals 1/1 = <<2=2>>2 pounds per jellybean.
The second step is to multiply that amount by 3 because there were three times as much after adding the brownie's weight for a new total of 6 pounds.
That means the original weight of the jellybeans alone was 6-4 = <<

🧪 Question 9: Alexis is applying for a new job and bought a new set of business clothes

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


❌ Base Model Answer:
 Q: Alexis is applying for a new job and bought a new set of business clothes to wear to the interview. She went to a department store with a budget of $200 and spent $30 on a button-up shirt, $46 on suit pants, $38 on a suit coat, $11 on socks, and $18 on a belt. She also purchased a pair of shoes, but lost the receipt for them. She has $16 left from her budget. How much did Alexis pay for the shoes?
A: To find out how much Alexis paid for the shoes, we need to subtract the total amount she already spent from her initial budget of $200. 
$200 - ($30 + $46 + $38 + $11 + $18) = $200 - $129 = $71
Therefore, Alexis paid $71 for the shoes.




The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


✅ LoRA Model Answer:
 Q: Alexis is applying for a new job and bought a new set of business clothes to wear to the interview. She went to a department store with a budget of $200 and spent $30 on a button-up shirt, $46 on suit pants, $38 on a suit coat, $11 on socks, and $18 on a belt. She also purchased a pair of shoes, but lost the receipt for them. She has $16 left from her budget. How much did Alexis pay for the shoes?
A: The total amount she spent was 30+46+38+11+18 = <<30+46+38+11+18=135>>135 dollars.
She had 200 - 135 = 65 dollars remaining in her budget after buying all those items.
The cost of the shoes must have been 65 + 16 = <<65+16=81>>81 dollars.
#### 81
#### 81
#### 81
#### 81
#### 81
#### 81
#### 81
#### 81
####

🧪 Question 10: Tina makes $18.00 an hour.  If she works more than 8 hours per shift, she is eligible for overtime, which is paid by your hourly wage + 1/2 your hourly wage.  If she works 10 hours every day for 5 days, how much money does she make?



The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


❌ Base Model Answer:
 Q: Tina makes $18.00 an hour.  If she works more than 8 hours per shift, she is eligible for overtime, which is paid by your hourly wage + 1/2 your hourly wage.  If she works 10 hours every day for 5 days, how much money does she make?
A: First find out her regular pay for the weekdays (5 days): 

    $18.00 x 8 = $144.00

Then calculate her overtime pay for those five days:

    ($18.00 + ($18.00 / 2)) x 2 = $54.00

Finally add these two amounts together to get her total weekly earnings:

    $144.00 + $54.00 = $198.00

✅ LoRA Model Answer:
 Q: Tina makes $18.00 an hour.  If she works more than 8 hours per shift, she is eligible for overtime, which is paid by your hourly wage + 1/2 your hourly wage.  If she works 10 hours every day for 5 days, how much money does she make?
A: She earns 18*8=$<<18*8=144>>144 a week working regular hours.
She gets 2*(10-8)=4 extra hours of pay each day because she worked over 8 hours so that's 4*5 = <<4*5=20>>20 dollars in overtime

In [21]:
!lm_eval --model hf \
  --model_args pretrained=./phi2-lora-checkpoints \
  --tasks gsm8k,hellaswag,arc_easy \
  --device cuda \
  --limit 500 \
  --output_path results.json


2025-06-03 23:40:31.944761: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748994031.965796   20818 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748994031.972179   20818 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
INFO:lm_eval.__main__:Selected Tasks: ['arc_easy', 'gsm8k', 'hellaswag']
INFO:lm_eval.evaluator:Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
INFO:lm_eval.evaluator:Initializing hf model, with arguments: {'pretrained': './phi2-lora-checkpoints'}
INFO:lm_eval.models.huggingface:Using device 'cuda'
INFO:lm_eval.models.huggingface:Model parallel was set to

In [25]:
import json

# 📂 Load the evaluation results JSON file
with open("/content/results.json/.__phi2-lora-checkpoints/results_2025-06-04T01-19-33.853506.json") as f:
    data = json.load(f)

# 📊 Helper function to clean and format metrics
def print_metrics(task_name, metrics):
    print(f"\n📊 Evaluation Results for: {task_name.upper()}")
    for metric, value in metrics.items():
        if "stderr" in metric:
            continue  # skip stderr; we'll handle it alongside main metric

        # Skip non-numeric values (e.g., alias names)
        try:
            value = float(value)
        except (ValueError, TypeError):
            continue

        stderr_key = metric.replace(",", "_stderr")
        stderr = metrics.get(stderr_key)
        try:
            stderr = float(stderr) if stderr is not None else None
        except (ValueError, TypeError):
            stderr = None

        label = metric.replace(",", " ").replace("_", " ").title()

        if stderr is not None:
            print(f"• {label}: {value:.2f}% ± {stderr:.2f}%")
        else:
            print(f"• {label}: {value:.2f}%")

# ✅ GSM8K
gsm8k_metrics = data["results"]["gsm8k"]
print_metrics("gsm8k", gsm8k_metrics)
print(f"• Samples Evaluated: {int(data['n-samples']['gsm8k']['effective'])}")

# ✅ ARC-Easy
arc_metrics = data["results"]["arc_easy"]
print_metrics("arc_easy", arc_metrics)
print(f"• Samples Evaluated: {int(data['n-samples']['arc_easy']['effective'])}")

# ✅ HellaSwag
hellaswag_metrics = data["results"]["hellaswag"]
print_metrics("hellaswag", hellaswag_metrics)
print(f"• Samples Evaluated: {int(data['n-samples']['hellaswag']['effective'])}")


📊 Evaluation Results for: GSM8K
• Exact Match Strict-Match: 0.55%
• Exact Match Flexible-Extract: 0.54%
• Samples Evaluated: 500

📊 Evaluation Results for: ARC_EASY
• Acc None: 0.79%
• Acc Norm None: 0.79%
• Samples Evaluated: 500

📊 Evaluation Results for: HELLASWAG
• Acc None: 0.50%
• Acc Norm None: 0.61%
• Samples Evaluated: 500


In [26]:
pip install -q huggingface_hub

In [27]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [28]:
from peft import PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load base Phi-2 model
base_model = AutoModelForCausalLM.from_pretrained("microsoft/phi-2")

# Load your fine-tuned LoRA adapter
model = PeftModel.from_pretrained(base_model, "./phi2-lora-checkpoints")

# Load tokenizer (copied from base model)
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [29]:
repo_id = "darshjoshi16/phi2-lora-math"
# Push model and tokenizer
model.push_to_hub(repo_id)
tokenizer.push_to_hub(repo_id)

Uploading...:   0%|          | 0.00/36.7M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/darshjoshi16/phi2-lora-math/commit/51bc3eac4a96b9ae1bcce6a723195f8aef7d24ba', commit_message='Upload tokenizer', commit_description='', oid='51bc3eac4a96b9ae1bcce6a723195f8aef7d24ba', pr_url=None, repo_url=RepoUrl('https://huggingface.co/darshjoshi16/phi2-lora-math', endpoint='https://huggingface.co', repo_type='model', repo_id='darshjoshi16/phi2-lora-math'), pr_revision=None, pr_num=None)