In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Work from the project folder so the relative paths used below (e.g. config.json) resolve.
%cd /content/drive/MyDrive/fine_tune_llms/llama3.2_python/

from huggingface_hub import login
import json

# Read the Hugging Face access token from the local config file ...
with open("config.json", "r") as config_file:
    config = json.load(config_file)

# ... and authenticate once the file has been closed.
access_token = config["HF_ACCESS_TOKEN"]
login(token=access_token)

In [None]:
!pip install -r requirements.txt
!pip install --upgrade bitsandbytes

## 1. Download test data 

In [None]:
from datasets import load_dataset

# Download and save the test data
def download_data(num_train_samples=16800, data_file="data-test.json"):
    """Stream the instruction dataset and save its held-out tail as JSON.

    The first ``num_train_samples`` records of the "train" split are skipped;
    every remaining record is collected and written to ``data_file``.

    Args:
        num_train_samples (int): Number of leading records to skip. Per the
            original note, the dataset has ~18.6k samples, of which the first
            16.8k (~90%) were used for training and the remaining ~1.8k are
            held out.
        data_file (str): Path of the JSON file to write.
    """
    ds = load_dataset(
        "iamtarun/python_code_instructions_18k_alpaca", streaming=True, split="train"
    )

    # Streaming datasets are consumed sequentially, so skip by position.
    ds_test = [
        sample for position, sample in enumerate(ds) if position >= num_train_samples
    ]

    with open(data_file, "w") as test_f:
        json.dump(ds_test, test_f, indent=4)
    print(f"Test data saved to {data_file} ({len(ds_test)} samples)")


download_data()

In [11]:
import torch

def format_sample(sample):
    """Format a single instruction sample as a chat-style prompt string.

    The prompt consists of a user header, a fixed preamble, the instruction,
    an optional "### Input" section, and an empty "### Response" section
    followed by the assistant header so the model continues with its answer.

    Args:
        sample (dict): Must contain "instruction". "input" is optional; a
            missing, ``None`` or empty value omits the "### Input" section.

    Returns:
        str: The formatted prompt.
    """
    instruction = sample['instruction']
    input_text = sample.get('input')

    # Only the optional input section differs between the two prompt variants.
    has_input = input_text is not None and input_text != ""
    input_section = f"### Input:\n{input_text}\n\n" if has_input else ""

    return (
        f"<|start_header_id|>user<|end_header_id|>\n\n"
        f"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n"
        f"### Instruction:\n{instruction}\n\n"
        f"{input_section}"
        f"### Response:\n<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
    )


def generate_ft(model, sample, tokenizer, max_new_tokens, context_size=256, temperature=0.0, top_k=1, eos_id=(128001, 128009)):
    """
    Generate a response for one sample by decoding one token at a time.

    The prompt is built with ``format_sample``, encoded with ``tokenizer`` and
    extended token by token until ``max_new_tokens`` tokens were produced or an
    end-of-sequence id is predicted.

    Args:
        model: Causal LM called as ``model(input_ids=..., use_cache=False)``
            whose output exposes ``.logits``.
        sample (dict): Sample passed to ``format_sample``.
        tokenizer: Object providing ``encode(str)`` and ``decode(ids)``.
        max_new_tokens (int): Upper bound on the number of generated tokens.
        context_size (int): Only the last ``context_size`` tokens are fed to
            the model at every step.
        temperature (float): ``> 0`` samples from the scaled distribution;
            otherwise the most likely token is picked (greedy).
        top_k (int | None): If positive, logits below the k-th largest are masked out.
        eos_id (int | iterable of int | None): Token id(s) that stop
            generation; ``None`` disables early stopping.

    Returns:
        str: Decoded text of the generated tokens only (prompt and EOS excluded).
    """
    # Inputs must live on the same device as the model weights
    model_device = next(model.parameters()).device

    # Normalise eos_id so a single id, any iterable of ids, or None all work
    if eos_id is None:
        stop_ids = frozenset()
    elif isinstance(eos_id, int):
        stop_ids = frozenset((eos_id,))
    else:
        stop_ids = frozenset(eos_id)

    formatted_prompt = format_sample(sample)

    # Encode and prepare input with a leading batch dimension
    idx = tokenizer.encode(formatted_prompt)
    idx = torch.tensor(idx, dtype=torch.long, device=model_device).unsqueeze(0)
    num_tokens = idx.shape[1]

    # Generation loop (no gradients are needed for inference)
    with torch.no_grad():
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -context_size:]   # conditioning context

            # Forward pass; keep only the logits of the last time step
            outputs = model(input_ids=idx_cond, use_cache=False)
            logits = outputs.logits[:, -1, :]

            # Apply top-k filtering. The mask is applied on the logits tensor
            # itself so the fill value always matches the logits' dtype/device,
            # independent of the dtype of the model's parameters.
            if top_k is not None and top_k > 0:
                top_logits, _ = torch.topk(logits, top_k)
                min_val = top_logits[:, [-1]]
                logits = logits.masked_fill(logits < min_val, float('-inf'))

            # Apply temperature and sample, or fall back to greedy decoding
            if temperature > 0.0:
                logits = logits / temperature
                probs = torch.softmax(logits, dim=-1)
                idx_next = torch.multinomial(probs, num_samples=1)
            else:
                idx_next = torch.argmax(logits, dim=-1, keepdim=True)

            # Stop before appending an end-of-sequence token
            if idx_next.item() in stop_ids:
                break

            # Append new token
            idx = torch.cat((idx, idx_next), dim=1)

    # Decode only the newly generated part
    generated_ids = idx.squeeze(0)[num_tokens:]
    generated_text = tokenizer.decode(generated_ids)

    return generated_text


## 2. Load fine-tuned model and generate response

In [None]:
from peft import PeftModel, LoraConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

def load_fine_tune_model(base_model_id, saved_weights):
    """Load the base model, wrap it with LoRA adapters and load saved weights.

    Args:
        base_model_id (str): Model id passed to ``from_pretrained`` for both
            the tokenizer and the base model.
        saved_weights (str): Path to a state dict saved with ``torch.save``.

    Returns:
        tuple: ``(lora_model, tokenizer)`` with the model in evaluation mode.

    Warns:
        UserWarning: If some (or all) saved keys do not match any parameter of
            the wrapped model. With ``strict=False`` such keys are ignored, so
            without this check a naming mismatch would silently evaluate the
            un-tuned base model.
    """
    import warnings

    # Load tokenizer and base model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer = AutoTokenizer.from_pretrained(base_model_id)
    base_model = AutoModelForCausalLM.from_pretrained(base_model_id)
    base_model.to(device)

    # Create LoRA config - make sure these parameters match your training configuration
    peft_config = LoraConfig(
        lora_alpha=16,
        r=8,
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=['q_proj','k_proj','v_proj'],
    )

    # Initialize PeftModel
    lora_model = PeftModel(base_model, peft_config)

    # Load the saved weights.
    # NOTE(review): torch.load unpickles the file; only load checkpoints from a
    # trusted source (consider weights_only=True if the torch version supports it).
    state_dict = torch.load(saved_weights,map_location=device)

    # Saved keys are expected to start with "model"; prefixing "base_" maps
    # them onto the "base_model..." names used by the wrapped model.
    new_state_dict = {f"base_{key}": value for key, value in state_dict.items()}

    # strict=False allows partial loading, so inspect what was actually matched
    load_result = lora_model.load_state_dict(new_state_dict, strict=False)
    unexpected = list(load_result.unexpected_keys)
    if unexpected:
        matched = len(new_state_dict) - len(unexpected)
        warnings.warn(
            f"{len(unexpected)} of {len(new_state_dict)} saved tensors did not match "
            f"any model parameter ({matched} loaded); first unmatched key: {unexpected[0]!r}. "
            f"Check the LoRA config and key prefix."
        )

    # Set to evaluation mode
    lora_model = lora_model.eval()

    return lora_model, tokenizer

# Base checkpoint id and the file holding the saved fine-tuned weights
base_model_id = "meta-llama/Llama-3.2-1B-Instruct"
lora_weights = "LLAMA32_ft_python_code.pth"

# Build the model and tokenizer
print("Loading fine-tuned model ...")
model_ft, tokenizer = load_fine_tune_model(base_model_id, lora_weights)

# Tally all parameters and the trainable subset in a single pass
total_params = 0
trainable_params = 0
for param in model_ft.parameters():
    count = param.numel()
    total_params += count
    if param.requires_grad:
        trainable_params += count

print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")

In [None]:
from tqdm import tqdm

# Read the test samples saved earlier
test_data_path = "data-test.json"
with open(test_data_path, "r") as f:
    test_data = json.load(f)

# Attach the fine-tuned model's answer to every sample (the dicts are updated in place)
for sample in tqdm(test_data, total=len(test_data)):
    sample["model response"] = generate_ft(model_ft, sample, tokenizer, max_new_tokens=100)

# Persist the samples together with their responses
test_data_path = "test-data-with-response.json"

with open(test_data_path, "w") as file:
    json.dump(test_data, file, indent=4)
print(f"Response saved as {test_data_path}")

## 3. Evaluate responses with a Llama judge model (`llama3.2` by default)

In [None]:
import urllib.request
import json


def query_model(prompt, model="llama3.2", url="http://localhost:11434/api/chat",
                seed=123, temperature=0, num_ctx=2048):
    """Send a single-turn chat request and return the full response text.

    The endpoint is expected to stream one JSON object per line, each carrying
    a piece of the answer under ``["message"]["content"]`` (assumed to be an
    Ollama-style chat API, judging by the default URL — confirm).

    Args:
        prompt (str): User message to send.
        model (str): Model name forwarded to the server.
        url (str): Chat endpoint URL.
        seed (int): Sampling seed sent in ``options``.
        temperature (float): Sampling temperature sent in ``options``.
        num_ctx (int): Context window size sent in ``options``.

    Returns:
        str: Concatenation of all streamed content pieces.

    Raises:
        RuntimeError: If a streamed object contains an ``"error"`` field.
    """
    # Create the data payload as a dictionary
    data = {
        "model": model,
        "messages": [
            {
                "role": "user",
                "content": prompt
            }
        ],
        "options": {     # The defaults below are chosen for deterministic responses
            "seed": seed,
            "temperature": temperature,
            "num_ctx": num_ctx
        }
    }

    # Convert the dictionary to a JSON formatted string and encode it to bytes
    payload = json.dumps(data).encode("utf-8")

    # Create a request object, setting the method to POST and adding necessary headers
    request = urllib.request.Request(url, data=payload, method="POST")
    request.add_header("Content-Type", "application/json")

    # Send the request and collect the streamed pieces
    pieces = []
    with urllib.request.urlopen(request) as response:
        for raw_line in response:
            line = raw_line.decode("utf-8").strip()
            if not line:
                # Blank separator lines carry no JSON; parsing them would fail
                continue
            response_json = json.loads(line)
            if "error" in response_json:
                # Surface server-side failures instead of an opaque KeyError
                raise RuntimeError(f"Model server returned an error: {response_json['error']}")
            pieces.append(response_json["message"]["content"])

    return "".join(pieces)


# Smoke test: sends one prompt to the default model at the default URL and prints the reply.
result = query_model("What do Llamas eat?")
print(result)

In [None]:
# Load the test entries together with the saved model responses
json_file = "test-data-with-response.json"

with open(json_file, "r") as file:
    json_data = json.load(file)

num_entries = len(json_data)
print("Number of entries:", num_entries)
print("First entry:")
json_data[0]

In [None]:
def format_input(entry):
    """Render an entry's instruction (and optional input) as prompt text.

    Args:
        entry (dict): Must contain "instruction". "input" is optional; a
            missing or empty value omits the "### Input" section.

    Returns:
        str: The preamble and instruction, followed by the input section if present.
    """
    instruction_text = (
        f"Below is an instruction that describes a task. Write a response that "
        f"appropriately completes the request."
        f"\n\n### Instruction:\n{entry['instruction']}"
    )

    entry_input = entry.get("input")
    input_text = f"\n\n### Input:\n{entry_input}" if entry_input else ""

    return instruction_text + input_text

# Preview the judge's free-form verdict for the first three entries
for entry in json_data[:3]:
    reference = entry['output']
    candidate = entry["model response"]
    prompt = (
        f"Given the input `{format_input(entry)}` "
        f"and correct output `{reference}`, "
        f"score the model response `{candidate}`"
        f" on a scale from 0 to 100, where 100 is the best score. "
    )
    print("\nDataset response:")
    print(">>", reference)
    print("\nModel response:")
    print(">>", candidate)
    print("\nScore:")
    print(">>", query_model(prompt))
    print("\n-------------------------")

In [None]:
import random
from tqdm import tqdm


def generate_model_scores(json_data, json_key, sample_size=None, seed=None):
    """
    Generates model scores based on a subset or the full dataset.

    Each entry is turned into a grading prompt and sent to ``query_model``;
    replies that parse as an integer are collected, others are skipped and
    counted so the caller can see how many entries were dropped.

    Args:
        json_data (list): The input data as a list of dictionaries.
        json_key (str): The key in the dictionary to evaluate.
        sample_size (int, optional): Number of random samples to evaluate.
                                     If None, all entries in `json_data` are evaluated.
        seed (int, optional): Seed for the random subset. If None, the global
                              `random` state is used (not reproducible).

    Returns: A list of scores for the evaluated entries.
    """
    # If sample_size is None, evaluate the full dataset
    if sample_size is None:
        sampled_data = json_data
    else:
        # A dedicated generator keeps a seeded draw independent of global state
        rng = random if seed is None else random.Random(seed)
        sampled_data = rng.sample(json_data, min(sample_size, len(json_data)))

    scores = []
    skipped = 0
    for entry in tqdm(sampled_data, desc="Scoring entries"):
        prompt = (
            f"Given the input `{format_input(entry)}` "
            f"and correct output `{entry['output']}`, "
            f"score the model response `{entry[json_key]}`"
            f" on a scale from 0 to 100, where 100 is the best score. "
            f"Respond with the integer number only."
        )
        score = query_model(prompt)
        try:
            scores.append(int(score))
        except ValueError:
            # The judge did not answer with a bare integer; drop this entry
            skipped += 1

    if skipped:
        print(f"Skipped {skipped} of {len(sampled_data)} entries: reply was not an integer score.")

    return scores

scores = generate_model_scores(json_data, "model response", sample_size=100)
print(f"Number of scores: {len(scores)} of {len(json_data)}")
# Guard against an empty score list (e.g. no reply could be parsed as an integer)
if scores:
    print(f"Average score: {sum(scores)/len(scores):.2f}\n")
else:
    print("Average score: n/a (no valid scores)\n")


# # Optionally save the scores
# save_path = Path("scores") / f"llama3.2-1b-model-reponse.json"

# # Create the parent directory if it doesn't exist
# save_path.parent.mkdir(parents=True, exist_ok=True)

# with open(save_path, "w") as file:
#     json.dump(scores, file)
# print(f"Responses saved at {save_path}")