In [None]:
!pip install -q -U bitsandbytes transformers peft accelerate datasets evaluate rouge_score evaluate bert_score

Memory Metrics (on-disk size of each downloaded model)

In [None]:
!pip install -q -U bitsandbytes transformers peft accelerate datasets evaluate rouge_score evaluate bert_score

import os
import json
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel, PeftConfig
from huggingface_hub import login

# Models whose on-disk size is measured by main() below.
# Each entry is a pair: [model to measure, base model it derives from].
#   * When both names are equal, the plain base model is measured.
#   * The two "*-full-billsum" entries are loaded directly as complete models.
#   * Every other entry is loaded as the base model and then main() attempts to
#     attach a PEFT adapter with the first name.
# NOTE(review): "magnited" in the opt350m entry looks like a typo for
# "magnitude" — confirm it matches the actual Hub repo id before changing it.
MODEL_NAMES = [
    # [fine-tuned model name, base model name]
    ["facebook/opt-125m","facebook/opt-125m"],
    ["clee9/opt125m-full-billsum", "facebook/opt-125m"],
    ["clee9/opt350m-full-billsum", "facebook/opt-350m"],
    ["clee9/opt125m-lora-billsum-1","facebook/opt-125m"],
    ["clee9/opt125m-lora-pruned-wanda-2-4","facebook/opt-125m"],
    ["clee9/opt125m-lora-pruned-wanda-4-8","facebook/opt-125m"],
    ["clee9/opt125m-lora-pruned-wanda-unstructured","facebook/opt-125m"],
    ["saresri/opt125m-lora-pruned-magnitude-2-4", "facebook/opt-125m"],
    ["saresri/opt125m-lora-pruned-sparsegpt-2-4", "facebook/opt-125m"],
    ["facebook/opt-350m", "facebook/opt-350m"],
    ["clee9/opt350m-lora-billsum-1", "facebook/opt-350m"],
    ["clee9/opt350m-lora-pruned-wanda-2-4", "facebook/opt-350m"],
    ["saresri/opt350m-lora-pruned-magnited-2-4", "facebook/opt-350m"],
    ["saresri/opt350m-lora-pruned-sparsegpt-2-4", "facebook/opt-350m"],
]

# JSON file that main() writes the per-model disk sizes to.
RESULTS_FILE = "./model_sizes.json"


def get_directory_size(directory):
    """Return the total size, in bytes, of the distinct files under ``directory``.

    The tree is walked recursively. Symbolic links to files are followed, but
    every underlying file is counted only once, keyed by its resolved real
    path. This matters for Hugging Face cache directories, where each file is
    stored once under ``blobs/`` and exposed again through a symlink under
    ``snapshots/``; summing both would double the reported size.

    Args:
        directory: Path of the directory to measure.

    Returns:
        Size in bytes as an ``int``. A missing or empty directory yields ``0``.
    """
    total_size = 0
    seen_real_paths = set()
    for dirpath, _dirnames, filenames in os.walk(directory):
        for filename in filenames:
            file_path = os.path.join(dirpath, filename)
            if not os.path.isfile(file_path):
                # Broken symlink or non-regular file: nothing to count.
                continue
            real_path = os.path.realpath(file_path)
            if real_path in seen_real_paths:
                # Already counted through another link to the same file.
                continue
            seen_real_paths.add(real_path)
            total_size += os.path.getsize(real_path)
    return total_size

def main():
    """Download each model in MODEL_NAMES into its own cache and record its disk size.

    For every ``[model_name, base_model_name]`` pair a dedicated cache
    directory (``./cache_<model name with '/' replaced by '_'>``) is created
    and all downloads for that entry are directed into it:

    * the two fully fine-tuned checkpoints are loaded directly;
    * every other entry loads the base model in 8-bit and, when the names
      differ, additionally attempts to load a PEFT adapter.

    The adapter is downloaded with the same ``cache_dir`` as the base model so
    that its files are part of the measured directory (otherwise they would go
    to the default Hugging Face cache and never be counted).

    The size of each cache directory is written to ``RESULTS_FILE`` as JSON,
    keyed by the part of the model name after the last ``/``.
    """
    import gc  # local import: only used here to release models between entries

    # Checkpoints that are complete models rather than base + adapter.
    full_finetuned_models = {
        "clee9/opt125m-full-billsum",
        "clee9/opt350m-full-billsum",
    }
    all_results = {}

    for model_name, base_model_name in MODEL_NAMES:
        print(f"Processing model: {model_name}")

        # Create a unique cache directory for each model so we can measure its disk usage
        cache_dir = f"./cache_{model_name.replace('/', '_')}"
        os.makedirs(cache_dir, exist_ok=True)

        if model_name in full_finetuned_models:
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                cache_dir=cache_dir,
            )
        else:
            model = AutoModelForCausalLM.from_pretrained(
                base_model_name,
                return_dict=True,
                load_in_8bit=True,
                device_map='auto',
                cache_dir=cache_dir,
            )

            # Attempt to load a PEFT adapter if this model differs from the base.
            # A ValueError is treated as "no adapter available": proceed with the base model.
            if model_name != base_model_name:
                try:
                    # Both calls receive cache_dir so the adapter files are
                    # stored in (and measured with) this entry's directory.
                    PeftConfig.from_pretrained(model_name, cache_dir=cache_dir)
                    model = PeftModel.from_pretrained(model, model_name, cache_dir=cache_dir)
                except ValueError:
                    print(f"No PEFT adapters found for {model_name}. Proceeding without PEFT.")

        # Measure the size on disk.
        # Since we've forced caching in a known directory, we just measure that directory.
        model_size_bytes = get_directory_size(cache_dir)
        model_size_mb = model_size_bytes / (1024 * 1024)

        # Record results
        model_key = model_name.split("/")[-1]
        all_results[model_key] = {
            "model_name": model_name,
            "base_model": base_model_name,
            "disk_size_mb": round(model_size_mb, 2)
        }

        print(f"Model {model_name} disk size: {round(model_size_mb, 2)} MB")

        # The loaded weights are not needed once the files are on disk; drop
        # them so successive models do not accumulate in memory.
        del model
        gc.collect()

    # Save results
    with open(RESULTS_FILE, "w") as f:
        json.dump(all_results, f, indent=4)
    print("All results saved to model_sizes.json")

# Run the disk-size measurement for every entry in MODEL_NAMES and write RESULTS_FILE.
main()

Other Metrics (ROUGE, BERTScore, parameter count, and evaluation time)

In [None]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from tqdm import tqdm
import evaluate
import json
import time
from huggingface_hub import notebook_login, login
import os

In [None]:

# Define models to evaluate.
# Each entry is a pair: [model to evaluate, base model it derives from].
#   * When both names are equal, the plain base model is evaluated.
#   * The two "*-full-billsum" entries are loaded directly as complete models.
#   * Every other entry is loaded as the base model and then main() attempts to
#     attach a PEFT adapter with the first name.
# NOTE(review): "magnited" in the opt350m entry looks like a typo for
# "magnitude" — confirm it matches the actual Hub repo id before changing it.
MODEL_NAMES = [
    # [fine-tuned model name, base model name]
    ["facebook/opt-125m","facebook/opt-125m"],
    ["clee9/opt125m-full-billsum", "facebook/opt-125m"],
    ["clee9/opt350m-full-billsum", "facebook/opt-350m"],
    ["clee9/opt125m-lora-billsum-1","facebook/opt-125m"],
    ["clee9/opt125m-lora-pruned-wanda-2-4","facebook/opt-125m"],
    ["clee9/opt125m-lora-pruned-wanda-4-8","facebook/opt-125m"],
    ["clee9/opt125m-lora-pruned-wanda-unstructured","facebook/opt-125m"],
    ["saresri/opt125m-lora-pruned-magnitude-2-4", "facebook/opt-125m"],
    ["saresri/opt125m-lora-pruned-sparsegpt-2-4", "facebook/opt-125m"],
    ["facebook/opt-350m", "facebook/opt-350m"],
    ["clee9/opt350m-lora-billsum-1", "facebook/opt-350m"],
    ["clee9/opt350m-lora-pruned-wanda-2-4", "facebook/opt-350m"],
    ["saresri/opt350m-lora-pruned-magnited-2-4", "facebook/opt-350m"],
    ["saresri/opt350m-lora-pruned-sparsegpt-2-4", "facebook/opt-350m"],
    # Add further entries as full "owner/name" pairs, like the ones above.
]

# JSON file that main() writes the evaluation results to (rewritten after every model).
RESULTS_FILE = "./results.json"
# Metric objects loaded once at module level; compute_metrics() calls their .compute().
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")


In [None]:
def preprocess_test(sample, tokenizer):
    """Turn one dataset record into a summarization prompt plus a clipped reference.

    Args:
        sample: Mapping with at least the keys ``"text"`` and ``"summary"``.
            It is updated in place and also returned.
        tokenizer: Callable tokenizer returning a mapping with ``"input_ids"``
            and ``"attention_mask"``, and offering ``decode``.

    Returns:
        The same ``sample`` with:
        * ``"text"`` replaced by the formatted prompt (source text clipped to
          1024 tokens before formatting);
        * ``"input_ids"`` / ``"attention_mask"`` of the prompt (max 2048 tokens);
        * ``"summary"`` clipped to 128 tokens and decoded back to text.
    """
    intro_line = "Below is an instruction that describes a task. Write a response that appropriately completes the request."
    instruction_line = "### Instruct: Summarize the text below."
    response_header = "### Output:"

    # Clip the source document by round-tripping it through the tokenizer.
    clipped_ids = tokenizer(sample["text"], truncation=True, max_length=1024)["input_ids"]
    document = tokenizer.decode(clipped_ids, skip_special_tokens=True)

    # Assemble the prompt; an empty document contributes no section at all.
    sections = ["\n" + intro_line, instruction_line]
    if document:
        sections.append(document)
    sections.append(response_header + "\n")
    prompt = "\n\n".join(sections)
    sample["text"] = prompt

    # Encode the finished prompt for generation.
    encoded_prompt = tokenizer(prompt, truncation=True, max_length=2048, padding=True)
    sample["input_ids"] = encoded_prompt["input_ids"]
    sample["attention_mask"] = encoded_prompt["attention_mask"]

    # Clip the reference summary the same way as the document.
    reference_ids = tokenizer(sample["summary"], truncation=True, max_length=128)["input_ids"]
    sample["summary"] = tokenizer.decode(reference_ids, skip_special_tokens=True)

    return sample

def extract_output_section(res):
    """Pull the generated answer out of a decoded generation.

    Args:
        res: Sequence whose first element is the decoded text (prompt followed
            by the model's continuation).

    Returns:
        The text between the first ``"### Output:\\n"`` marker and whichever
        comes first of: the next such marker, a ``"### End"`` marker, or the
        end of the string. If the output marker is absent, the first element
        is returned unchanged.
    """
    generated = res[0]
    output_marker = "### Output:\n"
    if output_marker not in generated:
        return generated
    # Second piece = text after the first marker, up to any repeated marker.
    answer = generated.split(output_marker, 2)[1]
    # Cut at the end marker when present; otherwise this keeps the whole answer.
    return answer.split("### End", 1)[0]

def evaluate_model(model, samples, tokenizer, device):
    """Generate a continuation for every sample, one sample at a time.

    Args:
        model: Model exposing ``eval()`` and ``generate(...)``.
        samples: Iterable of records with ``"input_ids"``, ``"attention_mask"``
            and ``"summary"`` entries.
        tokenizer: Tokenizer used to decode the generated ids (``batch_decode``).
        device: Device the input tensors are moved to before generation.

    Returns:
        ``(predictions, references)`` where ``predictions[i]`` is the list
        returned by ``batch_decode`` for sample ``i`` (a batch of size one) and
        ``references[i]`` is that sample's ``"summary"``.
    """
    model.eval()
    predictions = []
    references = []

    for example in tqdm(samples, desc="Evaluating..."):
        # Wrap the single example in a batch dimension of size one.
        generation_inputs = {
            "input_ids": torch.tensor([example["input_ids"]]).to(device),
            "attention_mask": torch.tensor([example["attention_mask"]]).to(device),
            "max_new_tokens": 128,
        }

        with torch.no_grad():
            generated_ids = model.generate(**generation_inputs)

        predictions.append(tokenizer.batch_decode(generated_ids, skip_special_tokens=True))
        references.append(example["summary"])

    return predictions, references

def compute_metrics(predictions, labels):
    """Score generated summaries against references with ROUGE and BERTScore.

    Args:
        predictions: One entry per sample, each in the form accepted by
            ``extract_output_section`` (a sequence whose first element is the
            decoded generation).
        labels: Reference summaries, aligned with ``predictions``.

    Returns:
        ``{"rouge": {...}, "bertscore_f1": float}`` where the ROUGE values and
        the mean BERTScore F1 are rounded to four decimal places.
    """
    # Keep only the answer part of each generation before scoring.
    cleaned_predictions = list(map(extract_output_section, predictions))

    rouge_result = rouge.compute(
        predictions=cleaned_predictions,
        references=labels,
        use_stemmer=True,
    )
    f1_per_sample = bertscore.compute(
        predictions=cleaned_predictions,
        references=labels,
        lang="en",
    )["f1"]
    mean_f1 = sum(f1_per_sample) / len(f1_per_sample)

    rounded_rouge = {}
    for metric_name, metric_value in rouge_result.items():
        rounded_rouge[metric_name] = round(metric_value, 4)

    return {
        "rouge": rounded_rouge,
        "bertscore_f1": round(mean_f1, 4),
    }

def get_model_parameters(model):
    """Return the summed ``numel()`` of everything yielded by ``model.parameters()``."""
    element_count = 0
    for parameter in model.parameters():
        element_count += parameter.numel()
    return element_count

def get_directory_size(directory):
    """Return the total size, in bytes, of the distinct files under ``directory``.

    The tree is walked recursively. Symbolic links to files are followed, but
    every underlying file is counted only once, keyed by its resolved real
    path. This matters for Hugging Face cache directories, where each file is
    stored once under ``blobs/`` and exposed again through a symlink under
    ``snapshots/``; summing both would double the reported size.

    Args:
        directory: Path of the directory to measure.

    Returns:
        Size in bytes as an ``int``. A missing or empty directory yields ``0``.
    """
    total_size = 0
    seen_real_paths = set()
    for dirpath, _dirnames, filenames in os.walk(directory):
        for filename in filenames:
            file_path = os.path.join(dirpath, filename)
            if not os.path.isfile(file_path):
                # Broken symlink or non-regular file: nothing to count.
                continue
            real_path = os.path.realpath(file_path)
            if real_path in seen_real_paths:
                # Already counted through another link to the same file.
                continue
            seen_real_paths.add(real_path)
            total_size += os.path.getsize(real_path)
    return total_size

def _local_snapshot_dir(repo_id_or_path):
    """Return a local directory holding the files of ``repo_id_or_path``, or ``None``.

    An existing local directory is returned as is. Otherwise the value is
    treated as a Hub repo id and looked up in the local Hugging Face cache
    only (nothing is downloaded). ``None`` means no cached copy was found.
    """
    if os.path.isdir(repo_id_or_path):
        return repo_id_or_path

    from huggingface_hub import snapshot_download

    try:
        return snapshot_download(repo_id_or_path, local_files_only=True)
    except (OSError, ValueError):
        # Raised when the repo is not present in the local cache.
        return None


def main():
    """Evaluate every model in MODEL_NAMES on BillSum and write the results to RESULTS_FILE.

    For each ``[model_name, base_model_name]`` pair:

    1. Load the model. The two fully fine-tuned checkpoints are loaded
       directly and moved to the evaluation device; every other entry loads
       the base model in 8-bit with ``device_map='auto'`` and, when the names
       differ, attempts to attach a PEFT adapter.
    2. Count parameters and measure the on-disk size of the cached files of
       every repo that was actually loaded (base/full model plus adapter).
    3. Generate summaries for the first 100 records of the ``ca_test`` split
       (reported as "validation") and of the ``test`` split, and score them.
    4. Store the metrics under the part of the model name after the last
       ``/`` and rewrite RESULTS_FILE, so partial results survive a crash.
    """
    # Checkpoints that are complete models rather than base + adapter.
    full_finetuned_models = {
        "clee9/opt125m-full-billsum",
        "clee9/opt350m-full-billsum",
    }
    all_results = {}
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Load dataset
    billsum_validation = load_dataset("billsum", split="ca_test").select(range(100))
    billsum_test = load_dataset("billsum", split="test").select(range(100))

    for model_name, base_model_name in MODEL_NAMES:
        print(f"\nEvaluating model: {model_name}")
        start_time = time.time()

        # Repos whose files make up the loaded model; used for the size on disk.
        loaded_repos = []

        if model_name in full_finetuned_models:
            # Loaded without device_map, so the weights start on CPU; move them
            # to the device the inputs will be placed on in evaluate_model().
            model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
            loaded_repos.append(model_name)
        else:
            # NOTE(review): newer transformers releases prefer passing
            # quantization_config=BitsAndBytesConfig(load_in_8bit=True) —
            # confirm against the installed version.
            model = AutoModelForCausalLM.from_pretrained(
                base_model_name,
                return_dict=True,
                load_in_8bit=True,
                device_map='auto',
            )
            loaded_repos.append(base_model_name)

            # Attempt to load a PEFT adapter if this model differs from the base.
            # A ValueError is treated as "no adapter available": proceed with the base model.
            if model_name != base_model_name:
                try:
                    PeftConfig.from_pretrained(model_name)
                    model = PeftModel.from_pretrained(model, model_name)
                    loaded_repos.append(model_name)
                except ValueError:
                    print(f"No PEFT adapters found for {model_name}. Proceeding without PEFT.")

        tokenizer = AutoTokenizer.from_pretrained(
            base_model_name,
            padding_side="left",
            add_eos_token=True,
            add_bos_token=True,
        )
        tokenizer.pad_token = tokenizer.eos_token

        model_parameters = get_model_parameters(model)

        # Compute model size on disk. The name stored in the model config is a
        # Hub repo id, not a directory, so each loaded repo is first resolved
        # to its locally cached snapshot directory.
        model_size_bytes = 0
        for repo in loaded_repos:
            local_dir = _local_snapshot_dir(repo)
            if local_dir is None:
                print(f"Could not locate cached files for {repo}; its size is not counted.")
            else:
                model_size_bytes += get_directory_size(local_dir)
        model_size_mb = model_size_bytes / (1024 * 1024)

        # Preprocess datasets
        billsum_validation_trunc = billsum_validation.map(lambda x: preprocess_test(x, tokenizer), batched=False)
        billsum_test_trunc = billsum_test.map(lambda x: preprocess_test(x, tokenizer), batched=False)

        # Evaluate on validation set
        predictions_val, labels_val = evaluate_model(model, billsum_validation_trunc, tokenizer, device)
        val_metrics = compute_metrics(predictions_val, labels_val)

        # Evaluate on test set
        predictions_test, labels_test = evaluate_model(model, billsum_test_trunc, tokenizer, device)
        test_metrics = compute_metrics(predictions_test, labels_test)

        model_name_main = model_name.split("/")[-1]
        # Store results
        all_results[model_name_main] = {
            "validation": val_metrics,
            "test": test_metrics,
            "total_time": round(time.time() - start_time, 2),
            "parameters": model_parameters,
            "model_size_mb": round(model_size_mb, 2)
        }

        print(f"Model {model_name} Results: {all_results[model_name_main]}")

        # Save intermediate results
        with open(RESULTS_FILE, "w") as f:
            json.dump(all_results, f, indent=4)

        print(f"Model {model_name} completed. Results stored.")

        # Release this model before the next one is loaded so that models do
        # not pile up in (GPU) memory across the loop.
        del model
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    print("\nEvaluation complete. Results saved in results.json")

In [None]:
# Run the full evaluation over MODEL_NAMES; results are written to RESULTS_FILE after each model.
main()

In [None]:
# Colab-only helper: offer the results file in the working directory as a browser download.
from google.colab import files

files.download('results.json')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>