## Llama

In [None]:
import pandas as pd
import torch
from datasets import Dataset, DatasetDict, load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from sklearn.model_selection import train_test_split
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    logging,
)
from trl import SFTConfig, SFTTrainer

# ---------------------------------------------------------------
# Data preparation: read CSV, build prompts, split by conversation
# ---------------------------------------------------------------

# Rows without a response are discarded before any text is assembled.
df = pd.read_csv("./data/mrbench_v3_devset_train_data.csv")
df = df.dropna(subset=["response"])

# One training string per row: the dialogue, the response, then a trailing
# "Prediction: " slot.
# NOTE(review): nothing follows "Prediction: " in this text, so no target label
# is part of the training string — confirm this is intended.
df["text"] = (
    "Conversation:\n"
    + df["conversation_history"]
    + "\n\nResponse:\n"
    + df["response"]
    + "\n\n\nPrediction: "
)

# Split on conversation ids (not on rows) so that all rows of one conversation
# end up on the same side of the 80/20 split.
conversation_ids = df["conversation_id"].unique()
train_ids, test_ids = train_test_split(
    conversation_ids,
    test_size=0.2,
    random_state=42,
)

in_train = df["conversation_id"].isin(train_ids)
in_test = df["conversation_id"].isin(test_ids)
train_df = df.loc[in_train].reset_index(drop=True)
test_df = df.loc[in_test].reset_index(drop=True)
print(train_df.head())

# Hand both frames to Hugging Face `datasets` and bundle them by split name.
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)
dataset_dict = DatasetDict({"train": train_dataset, "test": test_dataset})

# -----------------------
# Tokenizer and Model Setup
# -----------------------
model_name = "meta-llama/Llama-2-7b-hf"

# Llama is implemented natively in transformers, so there is no reason to opt
# in to executing repository-supplied code through `trust_remote_code`.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the EOS token for padding and pad on the right-hand side.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Quantization config: 4-bit NF4 weights, float16 compute, no double quantization.
compute_dtype = torch.float16
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)

# Load the quantized base model and let `device_map="auto"` place it.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quant_config,
    device_map="auto",
)
# Disable the generation cache for training.
model.config.use_cache = False
model.config.pretraining_tp = 1

# -----------------------
# PEFT Configuration (LoRA)
# -----------------------
# Rank-8 adapters on the attention query/value projections only.
# NOTE(review): lora_dropout=0.5 is much higher than commonly used LoRA
# dropout values — confirm this is intentional.
lora_config = LoraConfig(
    r=8,
    lora_alpha=8,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.5,
    bias="none",
    task_type="CAUSAL_LM",
)

# The base model was loaded in 4-bit; run PEFT's k-bit preparation step before
# injecting adapters, as the PEFT documentation prescribes for quantized training.
model = prepare_model_for_kbit_training(model)
peft_model = get_peft_model(model, lora_config)

# -----------------------
# Fine-Tuning
# -----------------------
# `eval_strategy` is the current name of the option formerly spelled
# `evaluation_strategy`; the old spelling is deprecated in transformers.
training_args = SFTConfig(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    fp16=True,
    save_total_limit=2,
    logging_steps=100,
    save_steps=500,
    report_to="none",
    dataset_text_field="text",
    max_seq_length=512,
    packing=False,
)

# `peft_model` is already a LoRA-wrapped model, so no `peft_config` is passed:
# handing the trainer both a PEFT model and a config asks it to apply LoRA twice.
trainer = SFTTrainer(
    model=peft_model,
    train_dataset=dataset_dict["train"],
    eval_dataset=dataset_dict["test"],
    processing_class=tokenizer,
    args=training_args,
)

# Release cached GPU blocks before the training run starts.
torch.cuda.empty_cache()

trainer.train()

# -----------------------
# Persist the adapter and the tokenizer
# -----------------------
# Both artifacts are written to the same directory, adapter first.
for artifact in (peft_model, tokenizer):
    artifact.save_pretrained("./fine-tuned-llama2")

## Inference

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel, PeftConfig
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    logging,
)
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer, SFTConfig
from datasets import load_dataset, Dataset, DatasetDict
from sklearn.model_selection import train_test_split
import pandas as pd
import os

# Restrict the visible GPUs and configure the CUDA caching allocator.
# NOTE(review): these are set after `import torch`; presumably they must be in
# place before CUDA is first initialised to take effect — confirm.
os.environ.update(
    {
        "CUDA_VISIBLE_DEVICES": "1,2",
        "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
    }
)

def compute_metrics(eval_preds):
    """Compute sequence-level exact-match accuracy for a causal LM evaluation.

    Relies on the module-level ``tokenizer`` for decoding.

    Args:
        eval_preds: ``(predictions, label_ids)`` pair as handed over by the
            trainer. ``predictions`` may be raw logits (one more axis than the
            labels), token ids, or a tuple whose first element is either.

    Returns:
        dict: ``{"accuracy": value}`` where ``value`` is the fraction of
        sequences whose decoded prediction equals the decoded label after
        stripping surrounding whitespace; ``0.0`` when there is nothing to score.
    """
    import numpy as np  # local import: numpy is not imported at the top of this cell

    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    preds = np.asarray(preds)
    labels = np.asarray(labels)

    # Raw logits carry an extra vocabulary axis; reduce them to token ids.
    if preds.ndim == labels.ndim + 1:
        preds = preds.argmax(axis=-1)

    # Causal LM alignment: the output at position i predicts the token at
    # position i + 1, so drop the last prediction and the first label.
    preds = preds[:, :-1]
    labels = labels[:, 1:]

    # -100 marks ignored positions and is not a decodable token id. Replace
    # those positions (in both arrays) with the pad id so decoding skips them.
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = tokenizer.eos_token_id
    if pad_id is None:
        pad_id = 0
    ignored = (labels == -100) | (preds < 0)
    preds = np.where(ignored, pad_id, preds)
    labels = np.where(ignored, pad_id, labels)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    if not decoded_labels:
        return {"accuracy": 0.0}

    correct = sum(
        pred.strip() == label.strip()
        for pred, label in zip(decoded_preds, decoded_labels)
    )
    return {"accuracy": correct / len(decoded_labels)}


# Rebuild the same prompt frame and conversation-level split for this cell.
# Rows without a response are discarded before any text is assembled.
df = pd.read_csv("./data/mrbench_v3_devset_train_data.csv")
df = df.dropna(subset=["response"])

# One string per row: the dialogue, the response, then a trailing "Prediction: " slot.
df["text"] = (
    "Conversation:\n"
    + df["conversation_history"]
    + "\n\nResponse:\n"
    + df["response"]
    + "\n\n\nPrediction: "
)

# Split on conversation ids (not on rows) so that all rows of one conversation
# end up on the same side of the 80/20 split; the fixed seed keeps it repeatable.
conversation_ids = df["conversation_id"].unique()
train_ids, test_ids = train_test_split(
    conversation_ids,
    test_size=0.2,
    random_state=42,
)

in_train = df["conversation_id"].isin(train_ids)
in_test = df["conversation_id"].isin(test_ids)
train_df = df.loc[in_train].reset_index(drop=True)
test_df = df.loc[in_test].reset_index(drop=True)
print(train_df.head())

# Hand both frames to Hugging Face `datasets` and bundle them by split name.
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)
dataset_dict = DatasetDict({"train": train_dataset, "test": test_dataset})

# Directory holding both the saved tokenizer and the saved LoRA adapter.
adapter_dir = "./fine-tuned-llama2"

# Tokenizer as saved next to the adapter.
tokenizer = AutoTokenizer.from_pretrained(adapter_dir, use_fast=True)

# Base checkpoint in float16 — it must be the same base the adapter was trained on.
base_model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    device_map="auto",
    torch_dtype=torch.float16,
)

# Attach the saved LoRA weights to the base model.
model = PeftModel.from_pretrained(base_model, adapter_dir)

# -----------------------
# PEFT Configuration (LoRA)
# -----------------------
# Same LoRA hyper-parameters as the training cell; the name `lora_config` is
# kept defined because code further down in this cell refers to it.
lora_config = LoraConfig(
    r=8,
    lora_alpha=8,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.5,
    bias="none",
    task_type="CAUSAL_LM",
)

# `model` already carries the fine-tuned adapter loaded by
# `PeftModel.from_pretrained`. Calling `get_peft_model` on it would wrap it a
# second time with a freshly initialised, untrained adapter, so reuse it as is.
peft_model = model

# -----------------------
# Evaluation
# -----------------------
# The trainer is built here only to run `evaluate()` on the held-out split.
# `eval_strategy` is the current name of the option formerly spelled
# `evaluation_strategy`; the old spelling is deprecated in transformers.
training_args = SFTConfig(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=3,
    fp16=True,
    save_total_limit=2,
    logging_steps=100,
    save_steps=500,
    report_to="none",
    dataset_text_field="text",
    max_seq_length=512,
    packing=False,
)

# `peft_model` is already a PEFT model, so no `peft_config` is passed: handing
# the trainer both a PEFT model and a config asks it to apply LoRA again.
trainer = SFTTrainer(
    model=peft_model,
    train_dataset=dataset_dict["train"],
    eval_dataset=dataset_dict["test"],
    processing_class=tokenizer,
    args=training_args,
    compute_metrics=compute_metrics,
)

# Release cached GPU blocks before evaluating.
torch.cuda.empty_cache()
results = trainer.evaluate()

In [None]:
import numpy as np
from sklearn.metrics import classification_report

torch.cuda.empty_cache()


def tokenize(example):
    """Tokenize the ``text`` field, truncating and padding to 512 tokens."""
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=512)


tokenized_test_dataset = dataset_dict["test"].map(tokenize, batched=True)

# Evaluate the model
# NOTE(review): the predictions hold one score per vocabulary entry for every
# token position, which is presumably very large for the full test split.
predictions = trainer.predict(tokenized_test_dataset)
logits = predictions.predictions
if isinstance(logits, tuple):
    logits = logits[0]
token_labels = predictions.label_ids
if token_labels is None:
    raise ValueError("trainer.predict returned no label_ids; cannot build a classification report")

# Reduce logits to token ids, then align them with the labels: the output at
# position i predicts the token at position i + 1.
token_preds = np.argmax(logits, axis=-1)
preds = token_preds[:, :-1].reshape(-1)
true_labels = token_labels[:, 1:].reshape(-1)

# classification_report needs flat 1-D targets; drop positions the loss ignores (-100).
scored = true_labels != -100
preds = preds[scored]
true_labels = true_labels[scored]

# Generate detailed classification report (token-level, using integer token ids as labels)
print("Detailed Classification Report (Integer Labels):")
print(classification_report(true_labels, preds, digits=4))

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import torch

# Directory holding both the saved tokenizer and the saved LoRA adapter.
adapter_dir = "./fine-tuned-llama2"

# Tokenizer saved with the adapter, plus the float16 base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(adapter_dir, use_fast=True)
base_model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",  # or whatever base model you originally used
    device_map="auto",
    torch_dtype=torch.float16,
)

# Attach the LoRA fine-tuned weights and switch the result to eval mode.
model = PeftModel.from_pretrained(base_model, adapter_dir)
model.eval()

def generate_response(prompt, max_new_tokens=100, *, do_sample=False, temperature=0.7, top_p=0.9):
    """Generate a continuation for ``prompt`` with the module-level model.

    Relies on the module-level ``tokenizer`` and ``model``.

    Args:
        prompt: Text to feed to the model.
        max_new_tokens: Upper bound on the number of generated tokens.
        do_sample: When False (default) decoding is greedy; when True the
            ``temperature`` and ``top_p`` settings are applied.
        temperature: Sampling temperature, only used when ``do_sample`` is True.
        top_p: Nucleus-sampling threshold, only used when ``do_sample`` is True.

    Returns:
        str: The decoded first returned sequence with special tokens removed.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        "do_sample": do_sample,
        "num_return_sequences": 1,
    }
    # Sampling knobs have no effect under greedy decoding and make transformers
    # complain about unused generation flags, so forward them only when sampling.
    if do_sample:
        generation_kwargs["temperature"] = temperature
        generation_kwargs["top_p"] = top_p

    with torch.no_grad():
        outputs = model.generate(**inputs, **generation_kwargs)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example usage:
# The prompt uses the same "Conversation:\n...\n\nResponse:\n...\n\n\nPrediction: " layout
# that this file uses to build the `text` column, and ends on the empty "Prediction: " slot.
prompt = "Conversation:\nTutor: Hi, could you please provide a step-by-step solution for the question below? The question is: Tyson decided to make muffaletta sandwiches for the big game.\xa0\xa0Each sandwich required 1 pound each of meat and cheese and would serve 4 people.\xa0\xa0There would be 20 people in total watching the game.\xa0\xa0The meat cost $7.00 per pound and the cheese cost $3.00 per pound.\xa0\xa0How much money would he spend on the meat and cheese to make enough sandwiches to serve 20 people?\xa0\n\xa0Student: To serve 20 people, Tyson needs to make 20/4 = 5 sandwiches.\nEach sandwich requires 1+1 = 2 pounds of meat and cheese.\nFor 5 sandwiches, he needs a total of 2 x 5 = 10 pounds of meat and cheese.\nThe cost of 10 pounds of meat is 10 x $7.00 = $70.\nThe cost of 10 pounds of cheese is 10 x $3.00 = $30.\nThe total cost of meat and cheese is $70 + $30 = $100.\n\xa0100\xa0\n\xa0Tutor: How many pounds of meat are needed for each sandwich?\xa0\n\xa0Student: Each sandwich requires 1 pound of meat and 1 pound of cheese.\xa0\n\xa0Tutor: What is the cost of 1 pound of meat?\xa0\n\xa0Student: The cost of 1 pound of meat is $7.00.\n\nResponse:\nGreat, you've correctly identified the cost of the meat, now let's focus on calculating the total cost of meat for all the sandwiches needed.\n\n\nPrediction: "
# Print whatever text generate_response returns for this prompt.
print(generate_response(prompt))


In [None]:
import pandas as pd

# Rebuild the prompt column exactly as the training cell does.
df = pd.read_csv("./data/mrbench_v3_devset_train_data.csv").dropna(subset=["response"])
df["text"] = "Conversation:\n" + df["conversation_history"] + "\n\nResponse:\n" + df["response"] + "\n\n\nPrediction: "

# Show the first remaining row. `.iloc[0]` is positional, whereas `[0]` looks up
# the index label 0 and raises KeyError if that row was removed by `dropna`.
df["text"].iloc[0]