## Install necessary libraries if not already installed

In [None]:
!pip install transformers datasets evaluate peft bitsandbytes

## Imports

In [None]:
import torch
import numpy as np
from transformers import LlamaTokenizer, LlamaModel, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from datasets import load_dataset
from peft import LoraConfig, SFTTrainer, TaskType
from bitsandbytes import BitsAndBytesConfig
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

## Check device

In [None]:
# Prefer the GPU whenever PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

## Load Dataset

In [None]:
# Load every split as a DatasetDict. The Trainer cell below indexes
# stsb_dataset["train"] and stsb_dataset["validation"], which is only valid on
# a DatasetDict; passing split="train" would return a single Dataset instead.
stsb_dataset = load_dataset("sentence-transformers/stsb")

## Tokenizer and Model setup

In [None]:
# NOTE(review): confirm this repo id exists on the Hugging Face Hub (the 3B
# Llama 3.2 checkpoint, for example, is published as "meta-llama/Llama-3.2-3B").
model_name = "meta-llama/Llama-3B"
tokenizer = LlamaTokenizer.from_pretrained(model_name)

# Llama checkpoints typically ship without a padding token, yet every tokenizer
# call in this notebook uses padding=True. Reuse EOS so padding does not raise.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# STS-B targets are continuous similarity scores. A bare LlamaModel has no head
# and returns no loss, so Trainer cannot optimise it; a sequence-classification
# model with a single output is trained as a regressor (MSE loss) instead.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)

# The classification head pools the last non-padding token, which it locates
# through config.pad_token_id.
model.config.pad_token_id = tokenizer.pad_token_id

## Define the LoRA fine-tuning configuration

In [None]:
# LoRA hyper-parameters: adapter rank, scaling factor and adapter dropout.
lora_r, lora_alpha, lora_dropout = 4, 16, 0.1

# Adapter configuration; bias terms are left untouched.
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLASSIFICATION,  # change as per task
    bias="none",
    lora_dropout=lora_dropout,
    lora_alpha=lora_alpha,
    r=lora_r,
)

In [None]:
# 4-bit NF4 quantization settings with float16 compute.
# NOTE(review): bnb_config is not passed to any from_pretrained() call in the
# visible notebook, so no model is actually quantized by it — confirm intent.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# Move the model onto the device selected earlier.
model = model.to(device)

# Define the function for tokenization
def tokenize_batch(batch):
    """Tokenize a batch of sentence pairs with the module-level tokenizer.

    Args:
        batch: Mapping with "sentence1" and "sentence2" entries, as supplied by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the paired sentences, padded to the longest
        pair in the batch and truncated to at most 128 tokens.
    """
    first_sentences = batch["sentence1"]
    second_sentences = batch["sentence2"]
    return tokenizer(
        first_sentences,
        second_sentences,
        max_length=128,
        truncation=True,
        padding=True,
    )

# Tokenize the dataset
stsb_dataset = stsb_dataset.map(tokenize_batch, batched=True)

# Expose the gold similarity as "label". Trainer only forwards label columns it
# recognises ("label"/"labels"); a column called "score" would be dropped as
# unused, and the model would then return no loss during training.
stsb_dataset = stsb_dataset.rename_column("score", "label")

# Define the metric computation function for similarity
def compute_metrics(eval_pred):
    """Score regression predictions against gold similarity scores.

    The targets are continuous similarity scores, so classification metrics
    computed over ``argmax`` (accuracy, F1, ...) are not applicable; regression
    error and correlation are reported instead.

    Args:
        eval_pred: ``(predictions, labels)`` pair as handed over by ``Trainer``.
            Predictions may have shape ``(n,)`` or ``(n, 1)``; a tuple of
            arrays is accepted and its first element is used.

    Returns:
        dict with the keys ``"mse"``, ``"pearson"`` and ``"spearman"``.

    Raises:
        ValueError: if predictions and labels do not contain the same number
            of values (or fewer than two values are supplied, from scipy).
    """
    # Imported locally because the notebook only imports scipy in a later cell.
    from scipy.stats import pearsonr, spearmanr

    logits, labels = eval_pred
    if isinstance(logits, tuple):
        logits = logits[0]

    # Flatten (n, 1) regression outputs so they line up with the 1-D labels.
    predictions = np.asarray(logits, dtype=np.float64).reshape(-1)
    targets = np.asarray(labels, dtype=np.float64).reshape(-1)
    if predictions.shape != targets.shape:
        raise ValueError(
            f"predictions and labels differ in size: {predictions.shape} vs {targets.shape}"
        )

    mse = float(np.mean((predictions - targets) ** 2))
    # Both correlations are NaN when either input is constant.
    pearson = float(pearsonr(predictions, targets)[0])
    spearman = float(spearmanr(predictions, targets)[0])
    return {"mse": mse, "pearson": pearson, "spearman": spearman}

# Training arguments
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy`; switch the keyword if TrainingArguments rejects it.
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    logging_steps=10,
    load_best_model_at_end=True,
    save_strategy="epoch"
)

# Attach the LoRA adapters described by peft_config. Without this step the
# configuration is never used and every base-model weight would be trained.
# Imported locally so the import cell at the top of the notebook is untouched.
from peft import get_peft_model

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# Fine-tuning with Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=stsb_dataset["train"],
    eval_dataset=stsb_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Run training
trainer.train()

# Evaluate on test set. trainer.evaluate() without arguments would score the
# validation split again, so the test split is passed explicitly.
eval_results = trainer.evaluate(eval_dataset=stsb_dataset["test"])
print(f"Evaluation results: {eval_results}")

# Saving the fine-tuned model for later comparisons (for a PEFT-wrapped model
# this writes the adapter weights rather than a full copy of the base model)
model.save_pretrained("./fine_tuned_llama")

### Compare the performance of the fine-tuned Llama model with the non-fine-tuned version

In [None]:
# Fresh copy of the pretrained weights, used as the non-fine-tuned baseline
non_fine_tuned_model = LlamaModel.from_pretrained(model_name)
non_fine_tuned_model = non_fine_tuned_model.to(device)

# STS-B test split, tokenized with the same helper as the training data
stsb_test_dataset = load_dataset("sentence-transformers/stsb", split="test").map(
    tokenize_batch, batched=True
)

# Define an evaluation function to get similarity scores from models
def evaluate_model(model, dataset, batch_size=8):
    """Return one cosine-similarity score per sentence pair in ``dataset``.

    Each sentence is encoded on its own and mean-pooled over its non-padding
    tokens; the score for a pair is the cosine similarity of the two pooled
    vectors. Encoding the pair jointly and slicing alternating rows (as an
    interleaved layout would require) yields no scores for row-wise input, and
    the hidden state at position 0 of a causal model only ever sees the first
    token, so it cannot distinguish sentences.

    Args:
        model: A transformers model. ``output_hidden_states=True`` is requested
            so that both bare encoders and models with a task head expose the
            final hidden layer.
        dataset: Dataset with "sentence1" and "sentence2" columns that supports
            ``len()`` and slice indexing returning a dict of lists.
        batch_size: Number of pairs encoded per forward pass.

    Returns:
        list of float scores, in dataset order (empty for an empty dataset).
    """
    # Padding requires a pad token; Llama tokenizers usually have none.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    def _embed(sentences):
        """Mean-pool the final hidden layer over the real (non-pad) tokens."""
        inputs = tokenizer(
            sentences,
            padding=True,
            truncation=True,
            return_tensors="pt",
            max_length=128,
        ).to(device)
        outputs = model(**inputs, output_hidden_states=True)
        hidden = outputs.hidden_states[-1]
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1)
        return (hidden * mask).sum(dim=1) / token_counts

    model.eval()
    similarities = []
    with torch.no_grad():
        for start in range(0, len(dataset), batch_size):
            batch = dataset[start:start + batch_size]
            first = _embed(list(batch["sentence1"]))
            second = _embed(list(batch["sentence2"]))
            scores = torch.cosine_similarity(first, second, dim=1)
            # Cast to float32 first: numpy cannot represent bfloat16 tensors.
            similarities.extend(scores.float().cpu().numpy())

    return similarities

# Evaluate the fine-tuned model
print("Evaluating fine-tuned model...")
fine_tuned_similarities = evaluate_model(model, stsb_test_dataset)

# Evaluate the non-fine-tuned model
print("Evaluating non-fine-tuned model...")
non_fine_tuned_similarities = evaluate_model(non_fine_tuned_model, stsb_test_dataset)

# Compare the results with ground truth similarity scores from the dataset.
# sentence-transformers/stsb stores the human rating in the "score" column
# (alongside "sentence1" and "sentence2"); there is no "similarity_score".
ground_truth = stsb_test_dataset["score"]

# Compute correlation metrics (e.g., Spearman's correlation)
from scipy.stats import spearmanr

fine_tuned_correlation = spearmanr(fine_tuned_similarities, ground_truth).correlation
non_fine_tuned_correlation = spearmanr(non_fine_tuned_similarities, ground_truth).correlation

print(f"Spearman's correlation for the fine-tuned model: {fine_tuned_correlation:.4f}")
print(f"Spearman's correlation for the non-fine-tuned model: {non_fine_tuned_correlation:.4f}")

# Interpretation of results
if fine_tuned_correlation > non_fine_tuned_correlation:
    print("The fine-tuned model shows better alignment with human-rated scores, indicating successful fine-tuning.")
else:
    print("The non-fine-tuned model performs comparably or better, suggesting that further tuning strategies may be needed.")
