In [None]:
! pip install -q evaluate torch tqdm datasets peft transformers rouge_score
! pip install -U bitsandbytes

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m84.1/84.1 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
Collecting bitsandbytes
  Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl (59.4 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m59.4/59.4 MB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.48.2


In [None]:
import os
import yaml
import json
import torch
import evaluate
from tqdm import tqdm
from datasets import load_dataset, load_from_disk
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline


In [None]:
# Root folder of the on-disk dataset cache; load_and_prepare_dataset() creates it
# if needed and stores each dataset in a sub-folder named after the dataset.
DATASETS_DIR = "./datasets"

In [None]:
# Per-family prompt-format registry (mirrors Data_Preprocessing_final.ipynb).
# All families share the same instruction text, user template and title flag;
# only the hub path and the "supports_system" flag differ, so the entries are
# generated from a compact (family, path, supports_system) table.
MODEL_CONFIGS = {
    family: {
        "path": hub_path,
        "supports_system": has_system_role,
        "system_message": "You will generate one cooking recipe. List all necessary ingredients and give detailed steps.",
        "user_message_template": "Include ingredients: {ner}",
        "include_title_in_user": False,
    }
    for family, hub_path, has_system_role in (
        ("llama", "meta-llama/Llama-3.2-1B-Instruct", True),
        ("mistral", "mistralai/Mistral-7B-Instruct-v0.3", False),
        ("gemma", "google/gemma-2-9b-it", False),
        ("qwen", "Qwen/Qwen2.5-7B-Instruct", True),
        ("olmo", "allenai/OLMoE-1B-7B-0924-Instruct", False),
    )
}

def get_model_config_from_path(model_path: str):
    """
    Look up the prompt-format configuration for a model path.

    The path is lower-cased and probed for each known family name as a
    substring, in the fixed order llama, mistral, gemma, qwen, olmo; the first
    hit wins. When nothing matches, a warning is printed and the Llama entry
    is used.

    Args:
        model_path: Full model path (e.g., "meta-llama/Llama-3.2-1B-Instruct")

    Returns:
        A shallow copy of the matching MODEL_CONFIGS entry.
    """
    lowered = model_path.lower()

    for family in ("llama", "mistral", "gemma", "qwen", "olmo"):
        if family in lowered:
            return MODEL_CONFIGS[family].copy()

    # No known family name in the path: fall back to the Llama format.
    print(f"[WARNING] Unknown model path: {model_path}. Using Llama format as default.")
    return MODEL_CONFIGS["llama"].copy()


In [None]:
def select_subset(dataset, n_samples, seed=42):
    """
    Return either the whole dataset or a seeded random sample of it.

    Args:
        dataset: Object supporting ``len()``, ``.shuffle(seed=...)`` and ``.select(...)``.
        n_samples: ``"all"`` / ``None`` to keep everything, otherwise the number
            of examples to draw.
        seed: Seed forwarded to ``dataset.shuffle``.

    Returns:
        ``dataset`` itself when everything is requested or when more samples
        are requested than exist (a warning is printed in that case);
        otherwise the first ``n_samples`` rows of the shuffled dataset.
    """
    keep_everything = n_samples == "all" or n_samples is None
    if keep_everything:
        return dataset

    available = len(dataset)
    if n_samples > available:
        print(f"[WARNING] Requested {n_samples} samples but only {available} available. Using all samples.")
        return dataset

    shuffled = dataset.shuffle(seed=seed)
    return shuffled.select(range(n_samples))


def load_and_prepare_dataset(cfg):
    """
    Build the train / validation / test subsets described by ``cfg``.

    The complete dataset is cached under ``DATASETS_DIR`` the first time it is
    downloaded (the cache is written before any filtering). Filtering, the
    optional validation carve-out and the per-split sub-sampling are redone on
    every call.

    Two config layouts are understood:
      * new style: ``cfg["dataset"]`` holding ``name`` plus optional ``splits``
        (``train`` / ``validation`` / ``test`` sizes) and ``seed``;
      * old style: ``cfg["datasets"]`` as a list whose first entry holds
        ``path``, with ``train_samples`` / ``val_samples`` / ``test_samples`` /
        ``seed`` as top-level keys.
    In both layouts an optional ``val_size`` (default 0.05) is read from the
    dataset entry and used when no validation split exists.

    Returns:
        tuple: ``(train, val, test)`` splits after filtering and sub-sampling.

    Raises:
        KeyError: If neither a ``"dataset"`` key nor a list under ``"datasets"`` is present.
    """
    # -----------------------------------------------------------------------
    # 1. Dataset name, requested split sizes and seed
    # -----------------------------------------------------------------------
    if "dataset" in cfg:
        cfg_dataset = cfg["dataset"]
        dataset_name = cfg_dataset["name"]
        requested_sizes = cfg_dataset.get("splits", {})
        n_train, n_val, n_test = (
            requested_sizes.get(split, "all") for split in ("train", "validation", "test")
        )
        seed = cfg_dataset.get("seed", 42)
    elif "datasets" in cfg and isinstance(cfg["datasets"], list):
        cfg_dataset = cfg["datasets"][0]
        dataset_name = cfg_dataset["path"]
        n_train, n_val, n_test = (
            cfg.get(key, "all") for key in ("train_samples", "val_samples", "test_samples")
        )
        seed = cfg.get("seed", 42)
    else:
        raise KeyError("Dataset configuration not found. Expected 'dataset' or 'datasets' key.")

    # -----------------------------------------------------------------------
    # 2. Full dataset: download and cache on first use, reuse the cache after
    # -----------------------------------------------------------------------
    os.makedirs(DATASETS_DIR, exist_ok=True)
    cache_dir = os.path.join(DATASETS_DIR, dataset_name.replace("/", "_"))

    if not os.path.exists(cache_dir):
        print(f"[INFO] Downloading dataset from Hugging Face: {dataset_name}")
        dataset = load_dataset(dataset_name)
        dataset.save_to_disk(cache_dir)
        print(f"[INFO] Full dataset saved locally to: {cache_dir}")
    else:
        print(f"[INFO] Loading dataset from local cache: {cache_dir}")
        dataset = load_from_disk(cache_dir)

    # -----------------------------------------------------------------------
    # 3. Drop rows that miss a required field or the [INST] prompt markers
    # -----------------------------------------------------------------------
    def is_valid(sample):
        """Truthy only if every required field is non-blank and the prompt has both [INST] markers."""
        def filled(field):
            return sample.get(field) is not None and str(sample.get(field, '')).strip()

        prompt_text = str(sample.get('prompt', ''))
        return (
            filled('title') and filled('ingredients') and
            filled('directions') and filled('prompt') and
            '[INST]' in prompt_text and '[/INST]' in prompt_text
        )

    print("\n[INFO] Filtering Invalid Samples:")
    for split_name in dataset.keys():
        before = len(dataset[split_name])
        dataset[split_name] = dataset[split_name].filter(is_valid)
        after = len(dataset[split_name])
        print(f"  {split_name}: kept {after:,} / {before:,} (removed {before - after:,})")

    # -----------------------------------------------------------------------
    # 4. Carve a validation split out of train when none is provided
    # -----------------------------------------------------------------------
    has_validation = "validation" in dataset or "val" in dataset
    if not has_validation:
        val_size = cfg_dataset.get("val_size", 0.05)
        print(f"\n[INFO] Creating Validation Split ({val_size*100:.1f}% of train)")
        parts = dataset['train'].train_test_split(test_size=val_size, seed=seed)
        dataset['train'], dataset['validation'] = parts['train'], parts['test']
        print(f"[INFO] Created validation split: {len(dataset['validation']):,} samples")

    # -----------------------------------------------------------------------
    # 5. Sub-sample each split (validation may be stored as "validation" or "val")
    # -----------------------------------------------------------------------
    val_key = "validation" if "validation" in dataset else "val"
    train, val, test = (
        select_subset(dataset[key], size, seed=seed)
        for key, size in (("train", n_train), (val_key, n_val), ("test", n_test))
    )

    print(f"\n[INFO] Loaded {len(train)} train / {len(val)} val / {len(test)} test samples (from full cache).")
    return train, val, test


In [None]:
def setup_model_and_tokenizer(cfg, use_4bit: bool = None, use_lora: bool = None):
    """
    Load the causal LM named by ``cfg["base_model"]`` together with its
    tokenizer, optionally 4-bit quantized and/or wrapped with LoRA adapters.

    Args:
        cfg (dict): Configuration dictionary. Keys read here:
            - ``base_model`` (required): model name or path.
            - ``load_in_4bit``, ``bnb_4bit_quant_type``,
              ``bnb_4bit_use_double_quant``, ``bnb_4bit_compute_dtype``:
              quantization settings.
            - ``lora_r``, ``lora_alpha``, ``target_modules``, ``lora_dropout``:
              LoRA settings; the presence of ``lora_r`` enables LoRA by default.
            - ``bf16``: load in bfloat16 when CUDA is available (default True),
              otherwise float32.
        use_4bit (bool, optional): Override whether to load in 4-bit mode.
        use_lora (bool, optional): Override whether to apply LoRA adapters.

    Returns:
        tuple: (model, tokenizer)
    """
    model_name = cfg["base_model"]
    print(f"\nLoading model: {model_name}")

    # ------------------------------
    # Tokenizer setup
    # ------------------------------
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Borrow EOS as the pad token only when the tokenizer has none of its own,
    # so an existing dedicated pad token is never overwritten.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    # Left padding keeps each prompt adjacent to its generated continuation.
    tokenizer.padding_side = "left"

    # Explicit arguments win over the config; LoRA defaults to "on" iff lora_r is configured.
    load_in_4bit = use_4bit if use_4bit is not None else cfg.get("load_in_4bit", False)
    apply_lora = use_lora if use_lora is not None else ("lora_r" in cfg)

    # ------------------------------
    # Quantization setup (optional)
    # ------------------------------
    quant_cfg = None
    if load_in_4bit:
        print("[INFO] Enabling 4-bit quantization...")
        quant_cfg = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type=cfg.get("bnb_4bit_quant_type", "nf4"),
            bnb_4bit_use_double_quant=cfg.get("bnb_4bit_use_double_quant", True),
            # The config stores the dtype by name; resolve it to the torch dtype object.
            bnb_4bit_compute_dtype=getattr(
                torch, cfg.get("bnb_4bit_compute_dtype", "bfloat16")
            ),
        )
    else:
        print("[INFO] Loading model in full precision (no quantization).")

    # ------------------------------
    # Model loading
    # ------------------------------
    use_bf16 = cfg.get("bf16", True) and torch.cuda.is_available()
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=quant_cfg,
        device_map="auto",
        dtype=torch.bfloat16 if use_bf16 else torch.float32,
    )

    # ------------------------------
    # LoRA setup (optional)
    # ------------------------------
    if apply_lora:
        print("[INFO] Applying LoRA configuration...")
        # prepare_model_for_kbit_training is meant for quantized (k-bit) models.
        # Per the PEFT documentation it also upcasts remaining half-precision
        # weights to fp32, which is unwanted for an unquantized model, so it is
        # only applied when the model was actually loaded quantized.
        if quant_cfg is not None:
            model = prepare_model_for_kbit_training(model)
        lora_cfg = LoraConfig(
            r=cfg.get("lora_r", 8),
            lora_alpha=cfg.get("lora_alpha", 16),
            target_modules=cfg.get("target_modules", ["q_proj", "v_proj"]),
            lora_dropout=cfg.get("lora_dropout", 0.05),
            bias="none",
            task_type="CAUSAL_LM",
        )
        model = get_peft_model(model, lora_cfg)
        model.print_trainable_parameters()
    else:
        print("[INFO] Skipping LoRA setup - using base model only.")

    return model, tokenizer

In [None]:
CONFIG_FILE_PATH = "./config.yaml"

def load_config(config_path: str = CONFIG_FILE_PATH):
    """
    Read a YAML configuration file.

    Args:
        config_path (str): Location of the YAML file; defaults to CONFIG_FILE_PATH.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file (a dict for a
        mapping-style config).
    """
    with open(config_path, mode="r", encoding="utf-8") as stream:
        return yaml.safe_load(stream)


In [None]:
def generate_predictions(
    model,
    tokenizer,
    dataset,
    task_instruction,
    cfg=None,
    num_samples=None,
    batch_size=8,
    max_new_tokens=256,
):
    """
    Generate model predictions for a dataset (recipe directions).
    Uses the same message format as the preprocessing notebook.

    Args:
        model: The loaded model (base or fine-tuned).
        tokenizer: Corresponding tokenizer.
        dataset: Hugging Face dataset split containing recipe fields (NER, title, ingredients, directions).
        task_instruction (str): Instruction prefix (kept for compatibility, not used).
        cfg (dict, optional): Configuration dictionary; ``dataset.field_map.input``
            names the ingredient field (default "NER") and ``base_model`` selects
            the message format.
        num_samples (int, optional): Evaluate only the first ``num_samples`` rows.
        batch_size (int): Number of prompts sent to the pipeline per call and
            processed together in one batch.
        max_new_tokens (int): Max tokens to generate per sample.

    Returns:
        list[str]: Generated recipe responses, whitespace-stripped, one per sample.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # range() would raise an opaque error for 0 and silently yield nothing for negatives.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    if num_samples is not None and num_samples < len(dataset):
        dataset = dataset.select(range(num_samples))

    # Field name and model come from the config when one is given.
    if cfg is not None:
        field_map = cfg.get("dataset", {}).get("field_map", {})
        input_field = field_map.get("input", "NER")  # Default to NER for recipes
        base_model = cfg.get("base_model", "")
    else:
        input_field = "NER"  # Default fallback
        base_model = ""

    # The model family decides whether a separate system message is used.
    model_config = get_model_config_from_path(base_model)

    # Render one chat-template prompt per sample, ending with the generation prompt.
    prompts = [
        tokenizer.apply_chat_template(
            _build_chat_messages(model_config, sample.get(input_field, '')),
            tokenize=False,
            add_generation_prompt=True,
        )
        for sample in dataset
    ]

    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        dtype="auto",
        do_sample=False,
    )

    # Without an explicit batch_size the pipeline handles list inputs one prompt
    # at a time. Batching requires a pad token, so fall back to 1 when absent.
    can_batch = getattr(tokenizer, "pad_token_id", None) is not None

    preds = []
    for start in tqdm(range(0, len(prompts), batch_size), desc="Generating recipes"):
        batch = prompts[start : start + batch_size]
        outputs = pipe(
            batch,
            batch_size=len(batch) if can_batch else 1,
            max_new_tokens=max_new_tokens,
            return_full_text=False,
        )
        # For a list input each element is a list of candidates; keep the first.
        preds.extend(o[0]["generated_text"].strip() for o in outputs)

    return preds


def _build_chat_messages(model_config, ner):
    """Build the chat message list for one sample from its ingredient value."""
    user_content = model_config['user_message_template'].format(ner=ner)

    if model_config['supports_system']:
        # Separate system and user turns.
        return [
            {"role": "system", "content": model_config['system_message']},
            {"role": "user", "content": user_content},
        ]

    # No system role: fold the instruction into the user turn. The empty middle
    # element is deliberate so the joined text keeps its existing spacing.
    merged = "\n\n".join([model_config['system_message'], "", user_content])
    return [{"role": "user", "content": merged}]


def compute_rouge(predictions, samples, cfg=None):
    """
    Score predictions with ROUGE against references rebuilt from the dataset rows.

    Each reference is the full recipe text assembled from the row's title,
    ingredients and directions, in the same layout the preprocessing uses.

    Args:
        predictions (list[str]): Model-generated outputs (full recipe format).
        samples: Iterable of rows exposing ``.get`` for ``title``, ``ingredients``
            and ``directions``.
        cfg (dict, optional): Accepted for compatibility; not used.

    Returns:
        The result of the ``rouge`` metric's ``compute`` call.
    """
    def as_reference(row):
        # Missing fields fall back to "Recipe" (title) or an empty string.
        title = row.get('title', 'Recipe')
        ingredients = row.get('ingredients', '')
        directions = row.get('directions', '')
        return "".join((
            "Certainly! Here's a delicious recipe for:\n",
            f"[ {title} ]\n\n",
            f"[ INGREDIENTS ]\n{ingredients}\n\n",
            f"[ DIRECTIONS ]\n{directions}",
        ))

    references = [as_reference(row) for row in samples]

    metric = evaluate.load("rouge")
    return metric.compute(predictions=predictions, references=references)


In [None]:
"""
evaluate_baseline.py
Evaluate the base (unfine-tuned) model on the recipe generation dataset to establish baseline ROUGE scores.
"""

# Parsed once when this cell runs, from load_config()'s default path
# (CONFIG_FILE_PATH); evaluate_baseline() below reads this module-level cfg.
cfg = load_config()

def evaluate_baseline():
    """
    Run baseline evaluation on the recipe generation dataset using the base model.

    Reads the module-level ``cfg``. Loads the validation split, loads the model
    without quantization or LoRA, generates predictions, scores them with
    ROUGE, and writes two files to the current working directory:
    ``eval_results.json`` (summary scores) and ``predictions.jsonl`` (one JSON
    record per sample).

    Returns:
        tuple: ``(scores, preds)`` - the ROUGE result dict and the list of predictions.
    """
    # Only the validation split is needed here.
    _, val_data, _ = load_and_prepare_dataset(cfg)
    print(f"[INFO] Loaded {len(val_data)} validation samples.")

    # Base model only: no quantization, no LoRA.
    model, tokenizer = setup_model_and_tokenizer(
        cfg=cfg,
        use_4bit=False,
        use_lora=False,
    )

    preds = generate_predictions(
        model=model,
        tokenizer=tokenizer,
        dataset=val_data,
        # generate_predictions does not use this value, so a config without the
        # key must not abort the run.
        task_instruction=cfg.get("task_instruction", ""),
        cfg=cfg,
        batch_size=4,
    )

    scores = compute_rouge(preds, val_data, cfg=cfg)

    # -----------------------------------------------------------------------
    # Save outputs
    # -----------------------------------------------------------------------
    results = {
        "model_name": cfg["base_model"],
        "num_samples": len(val_data),
        "rouge1": scores["rouge1"],
        "rouge2": scores["rouge2"],
        "rougeL": scores["rougeL"],
    }

    results_path = "eval_results.json"
    preds_path = "predictions.jsonl"

    with open(results_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)

    with open(preds_path, "w", encoding="utf-8") as f:
        # Walk rows and predictions together so each row is fetched only once.
        for row, pred in zip(val_data, preds):
            # Reference text in the same layout compute_rouge scores against.
            full_reference = (
                f"Certainly! Here's a delicious recipe for:\n"
                f"[ {row.get('title', 'Recipe')} ]\n\n"
                f"[ INGREDIENTS ]\n{row.get('ingredients', '')}\n\n"
                f"[ DIRECTIONS ]\n{row.get('directions', '')}"
            )

            json.dump(
                {
                    "title": row.get("title", ""),
                    "NER": row.get("NER", ""),
                    "ingredients": row.get("ingredients", ""),
                    "directions": row.get("directions", ""),
                    "reference_full": full_reference,
                    "prediction": pred,
                },
                f,
            )
            f.write("\n")
    print(f"\n[INFO] Saved results to: {results_path}")
    print(f"[INFO] Saved predictions to: {preds_path}")

    return scores, preds


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Run the full baseline evaluation, then report the scores and one sample output.
    print("[INFO] Starting baseline evaluation...")
    rouge_scores, predictions = evaluate_baseline()
    print("\n[INFO] Evaluation complete.")

    # Headline metrics as percentages, one per line.
    print("\n[INFO] Baseline ROUGE Results:")
    print(
        f"  ROUGE-1: {rouge_scores['rouge1']:.2%}",
        f"  ROUGE-2: {rouge_scores['rouge2']:.2%}",
        f"  ROUGE-L: {rouge_scores['rougeL']:.2%}",
        sep="\n",
    )

    # First generated recipe as a qualitative sanity check.
    print("\nExample prediction:\n")
    print(predictions[0])

    # Raw score dictionary exactly as returned by the metric.
    print("\nRouge scores:\n")
    print(rouge_scores)


🚀 Starting baseline evaluation...
⬇️  Downloading dataset from Hugging Face: knkarthick/samsum


README.md: 0.00B [00:00, ?B/s]

train.csv: 0.00B [00:00, ?B/s]

validation.csv: 0.00B [00:00, ?B/s]

test.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/14731 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/818 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/819 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/14731 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/818 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/819 [00:00<?, ? examples/s]

✅ Full dataset saved locally to: ./datasets/knkarthick_samsum
📊 Loaded 14731 train / 200 val / 200 test samples (from full cache).
📊 Loaded 200 validation samples.

Loading model: meta-llama/Llama-3.2-1B-Instruct


tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

⚙️  Loading model in full precision (no quantization).


config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔹 Skipping LoRA setup — using base model only.


Generating summaries:  20%|██        | 10/50 [00:48<02:54,  4.36s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Generating summaries: 100%|██████████| 50/50 [03:37<00:00,  4.35s/it]


Downloading builder script: 0.00B [00:00, ?B/s]


💾 Saved results to: eval_results.json
💾 Saved predictions to: predictions.jsonl

✅ Evaluation complete.

📈 Baseline ROUGE Results:
  ROUGE-1: 35.00%
  ROUGE-2: 10.99%
  ROUGE-L: 26.23%

Example prediction:

Victoria and Magda commiserated about their financial struggles, with Victoria expressing frustration about overspending and Magda jokingly commiserating about her car insurance being paid for the rest of the year.

Rouge scores:

{'rouge1': np.float64(0.3499520689690201), 'rouge2': np.float64(0.10991887222706245), 'rougeL': np.float64(0.2623150873557639), 'rougeLsum': np.float64(0.2626694023550035)}


In [None]:
# Colab-only: mount Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# List the contents of /content.
!ls /content


sample_data


In [None]:
# Colab-only: upload files through files.upload(); its return value is kept in `uploaded`.
# NOTE(review): load_config() reads ./config.yaml in an earlier cell, so on a
# fresh runtime the config file has to be uploaded before that cell is run.
from google.colab import files
uploaded = files.upload()

Saving config.yaml to config.yaml


In [None]:
# List /content again to check that the uploaded file is present.
!ls /content


config.yaml  sample_data
