# 1. Data preparation
In this section, the data will first be prepared for model training.

Install required external Python libraries for training and evaluation of the Flan-T5 model.

In [None]:
%pip install evaluate
%pip install -U transformers
%pip install rouge_score
%pip install bert_score

## 1.1 Load necessary libraries and dataset

In [None]:
import os

from google.colab import drive

# Mount Google Drive so that checkpoints are written outside the Colab VM.
drive.mount('/content/drive')

# Allocator setting for PyTorch's CUDA memory management; it is set here,
# before torch is imported in the next cell.
os.environ.update({'PYTORCH_CUDA_ALLOC_CONF': 'expandable_segments:True'})

# Directory on Drive that receives the training checkpoints.
OUTPUT_DIR = "/content/drive/MyDrive/recipe_model/checkpoints_test"
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import torch
import evaluate
import ast
import time
from datasets import load_dataset, Dataset
from transformers import DataCollatorForSeq2Seq, T5Tokenizer, T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
from google.colab import drive
from sklearn.model_selection import train_test_split
from peft import LoraConfig, get_peft_model, TaskType, PeftModel, PeftConfig
from tqdm import tqdm
from bert_score import score as bert_score
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances
from sklearn.preprocessing import normalize

In [None]:
# Release cached GPU memory before loading data
torch.cuda.empty_cache()

# Unicode line/paragraph separators that are replaced by a plain newline
_UNICODE_LINE_BREAKS = ('\u2028', '\u2029')


def _normalise_cell(value):
    """Replace unusual line terminators, trim and lowercase string cells; pass other values through."""
    if not isinstance(value, str):
        return value
    text = str(value)
    for separator in _UNICODE_LINE_BREAKS:
        text = text.replace(separator, '\n')
    return text.strip().lower()


df = pd.read_csv(
    "recipes_w_search_terms.csv",
    encoding='utf-8',
    engine='python',
).map(_normalise_cell)

df.info()
df.head()

## 1.2 Data cleaning and preprocessing
The raw dataset has to be cleaned and reformatted into a structure suitable for modelling and text-to-text generation.

Many of the fields in this dataset are stored as string representations of Python lists, so they need to be safely parsed into actual lists. After parsing, the data is normalised by converting all text to lowercase and removing unnecessary whitespace. This keeps the format consistent and avoids introducing noise into the training process. Rows that lack essential information, namely `ingredients` or `steps`, are removed.

Each recipe is transformed into an input-output pair that aligns with the model's objective. The model's input is formed by joining the cleaned ingredient names into a single comma-separated string, while the target output will be a structured text block consisting of the recipe title, the expanded ingredient list, and the instructions. These are stored in the `input_text` and `target_text` columns.

In [None]:
# Safely parse list-like strings
def safe_parse(x):
    """Parse the string representation of a Python list into a real list.

    Args:
        x: A cell value. Expected to be a string such as "['a', 'b']", but
            already-parsed lists/tuples and missing values are tolerated.

    Returns:
        list: The parsed items. An empty list is returned when the value is
        missing, cannot be parsed, or parses to something that is not a
        list/tuple, so callers can always take `len()` of the result and
        iterate over it.
    """
    # Already a sequence (e.g. the cell was re-run): keep the data instead of
    # letting literal_eval reject it and silently discarding the row.
    if isinstance(x, (list, tuple)):
        return list(x)
    if not isinstance(x, str):
        return []
    try:
        parsed = ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # The failure modes of literal_eval on malformed text
        return []
    # A bare number or string literal is not a usable ingredient/step list
    return list(parsed) if isinstance(parsed, (list, tuple)) else []

# Normalise whitespace and lowercase
def clean_whitespace(x):
    """Collapse whitespace runs to a single space, trim and lowercase.

    A string is normalised directly. For a list, every string element is
    normalised and non-string elements are dropped. Any other value is
    returned unchanged.
    """
    def _tidy(text):
        return re.sub(r'\s+', ' ', text).strip().lower()

    if isinstance(x, list):
        return [_tidy(item) for item in x if isinstance(item, str)]
    if isinstance(x, str):
        return _tidy(x)
    return x

# Parse the stringified list columns, then normalise their contents
for _column in ("ingredients", "ingredients_raw_str", "steps"):
    df[_column] = df[_column].apply(safe_parse).apply(clean_whitespace)

# Drop incomplete rows: first those without ingredients, then those without steps
for _required in ("ingredients", "steps"):
    df = df[df[_required].map(len) > 0]

# Format examples for training
def format_training_example(row):
    """Build the (input, target) text pair for one recipe.

    Args:
        row: A mapping-like record (e.g. a DataFrame row) that supports
            `.get` and item access. It must provide "ingredients",
            "ingredients_raw_str" and "steps" as lists of strings, and may
            provide "name".

    Returns:
        tuple[str, str]: `(ingredients_in, recipe_text)` where
        `ingredients_in` is the comma-separated ingredient names and
        `recipe_text` is the title, ingredient list and steps joined into
        one block.
    """
    raw_title = row.get("name", "")
    # A missing name arrives as None or NaN; str(NaN) would otherwise put the
    # literal word "nan" into the training target as the recipe title.
    # (NaN is the only float that is not equal to itself.)
    title_missing = raw_title is None or (
        isinstance(raw_title, float) and raw_title != raw_title
    )
    title = "" if title_missing else str(raw_title).strip().lower()

    ingredients_in = ", ".join(row["ingredients"])
    ingredients_out = "\n".join(row["ingredients_raw_str"])
    steps = "\n".join(row["steps"])
    recipe_text = f"{title}\n\ningredients:\n{ingredients_out}\n\nsteps:\n{steps}"
    return ingredients_in, recipe_text

# Apply formatting
df["input_text"], df["target_text"] = zip(*df.apply(format_training_example, axis=1))

# Drop rows missing either field and reset index
df = df.dropna(subset=["input_text", "target_text"]).reset_index(drop=True)

# Keep the first MAIN_SET_SIZE rows for the train/test split; every remaining
# row forms the pool the validation set is sampled from. Both slices use the
# same boundary so that no row falls between them.
MAIN_SET_SIZE = 80000
sampling_df = df[MAIN_SET_SIZE:]
df = df[:MAIN_SET_SIZE]

# Show example
print(df.head(1)[["input_text", "target_text"]].to_string())


## 1.3 Split data
After cleaning and formatting the dataset, it is first split into train and test sets using a 90/10 split, with a fixed random seed to ensure reproducibility.

As we are training a Hugging Face transformer model, we need a validation set as well. However, we did not include that in our LSTM model training. As such, we created a separate validation set by sampling from the previously reserved `sampling_df`, providing an independent subset for tuning hyperparameters and monitoring model performance during training. \
This results in train, validation, and test datasets with an approximate 80/10/10 split (72,000 / 8,000 / 8,000 recipes).

Each of these subsets is then converted into Hugging Face `Dataset` objects, allowing efficient integration with the `Seq2SeqTrainer` and other utilities.

In [None]:
# Split the main frame 90/10 into train/test and draw the validation rows
# from the reserved pool, using the same fixed seed for both
train_df, test_df = train_test_split(df, random_state=42, test_size=0.1)
val_df = sampling_df.sample(random_state=42, n=8000)

# Wrap each split as a Hugging Face Dataset
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, val_df, test_df)
)

# 2. Model building and training (FLAN)

## 2.1 Tokenisation and input preparation
Unlike the LSTM workflow, the data for the Flan-T5 transformer model is tokenised after splitting and before model training. This is because the two models use different training paradigms.
- LSTM requires the data to be tokenised before splitting as the data structure (X, Y) depends on it.
- Flan-T5 tokenisation does not depend on X/Y shifting, so it happens after the dataset is split, which prevents data leakage.
- For reference, the LSTM's shifted pairs are X = tokens[0:n-1], Y = tokens[1:n].

After splitting the dataset into training, validation, and test datasets, the raw text fields are converted into token IDs that the Flan-T5 model can process.

The pretrained Flan-T5 tokeniser and model are loaded, and the maximum sequence lengths for both the input and the target output are defined. The ingredient list is prefixed with an instruction prompt so that the model understands the task and is consistent with its generation behaviour. Both inputs and target recipe texts are tokenised using fixed maximum lengths, with truncation and padding applied to maintain uniform tensor shapes.

Padding tokens are replaced with `-100` so that they are ignored during loss computation (standard practice for seq2seq models). The preprocessing function is applied to all datasets, producing tokenised datasets that are ready for training.

`DataCollatorForSeq2Seq` dynamically batches and pads examples during training in a way that is compatible with Flan-T5's encoder-decoder architecture.


In [None]:
# Checkpoint to fine-tune ("google/flan-t5-base" is the alternative that was considered)
model_name = "google/flan-t5-small"

model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Maximum number of tokens kept for the prompt and for the target recipe
max_input, max_target = 128, 256

def preprocess_function(examples):
    """Tokenise a batch of examples into encoder inputs and decoder labels.

    Args:
        examples: A batch from `Dataset.map(..., batched=True)` providing
            "input_text" and "target_text" as lists of strings.

    Returns:
        The tokenised prompts with an added "labels" entry. Prompts are
        truncated/padded to `max_input` tokens and targets to `max_target`
        tokens; padding positions in the labels are replaced with -100.
    """
    prompts = [
        "Generate a recipe with measurements using: " + ing
        for ing in examples["input_text"]
    ]
    model_inputs = tokenizer(
        prompts,
        max_length=max_input,
        truncation=True,
        padding="max_length",
    )

    # Tokenize targets (labels). `text_target` is the supported replacement
    # for the deprecated `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=examples["target_text"],
        max_length=max_target,
        truncation=True,
        padding="max_length",
    )

    # Mask padding so those positions are ignored by the loss
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [(token if token != pad_id else -100) for token in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs

# Tokenise the three splits with the same batched preprocessing
tokenized_train, tokenized_val, tokenized_test = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
)

# Collator that assembles seq2seq batches for the trainer
data_collator = DataCollatorForSeq2Seq(model=model, tokenizer=tokenizer)

## 2.2 LoRA configuration (parameter-efficient fine-tuning (PEFT))
Low-Rank Adaptation (LoRA) is applied to fine-tune and train Flan-T5 efficiently. This allows the model to adapt to the recipe-generation task while keeping most of the pretrained weights frozen, resulting in significantly lower memory usage and faster training.

LoRA hyperparameters are defined to balance training stability and model quality. Target modules corresponding to the key projection layers within the T5 attention blocks where LoRA adaptors should be injected are also specified.

The base model is then wrapped with the LoRA adaptors using `get_peft_model`.

In [None]:
# LoRA hyperparameters, collected in one place
lora_settings = {
    "task_type": TaskType.SEQ_2_SEQ_LM,
    "inference_mode": False,
    "r": 16,
    "lora_alpha": 32,
    "lora_dropout": 0.1,
    # adapters are injected into the q, v, k and o attention projections
    "target_modules": ["q", "v", "k", "o"],
    "bias": "none",
}
lora_config = LoraConfig(**lora_settings)

# Attach the LoRA adaptors to the base model
model = get_peft_model(model, lora_config)

# Report how many parameters are trainable to confirm PEFT is applied
model.print_trainable_parameters()

## 2.3 Training configuration and hyperparameters
The training hyperparameters are defined in this section.

In [None]:
# Per-device training batch size (the effective batch size is this value
# multiplied by gradient_accumulation_steps)
batch_size = 4

args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    # Evaluate and checkpoint once per epoch. The step-based `eval_steps` /
    # `save_steps` settings were dropped because they only take effect with
    # the "steps" strategy.
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=2,
    num_train_epochs=3,
    warmup_steps=500,
    weight_decay=0.01,
    save_total_limit=2,
    logging_steps=100,
    fp16=False,
    bf16=torch.cuda.is_bf16_supported(),
    report_to="none",
    max_grad_norm=1.0,
    logging_dir="./logs",
    load_best_model_at_end=True,
    predict_with_generate=True,
    dataloader_num_workers=1,
    optim="adamw_torch_fused",
    dataloader_pin_memory=False,
)

In [None]:
# Check memory status
if torch.cuda.is_available():
    bytes_per_gib = 1024**3
    allocated_bytes = torch.cuda.memory_allocated()
    reserved_bytes = torch.cuda.memory_reserved()
    total_bytes = torch.cuda.get_device_properties(0).total_memory

    print(f"GPU memory allocated: {allocated_bytes / bytes_per_gib:.2f} GB")
    print(f"GPU memory reserved: {reserved_bytes / bytes_per_gib:.2f} GB")
    # Subtract in bytes first and convert afterwards; dividing only the
    # allocated term mixed bytes with GB and printed a byte count as "GB".
    print(f"GPU memory free: {(total_bytes - allocated_bytes) / bytes_per_gib:.2f} GB")
else:
    print("CUDA is not available; skipping GPU memory report.")

## 2.4 Initialise the Trainer
This section initialises a Hugging Face `Seq2SeqTrainer`, which provides a high-level training loop tailored for encoder-decoder models like Flan-T5.

It handles batching, forward and backward passes, gradient updates, evaluation, and checkpointing, simplifying the fine-tuning workflow.

In [None]:
import inspect

# NOTE(review): no `compute_metrics` is defined anywhere earlier in this
# notebook, so the Trainer construction failed with a NameError on a fresh
# kernel. ROUGE is used here because `evaluate` is imported and `rouge_score`
# is installed but neither is otherwise used; confirm this is the intended
# validation metric.
rouge_metric = evaluate.load("rouge")


def compute_metrics(eval_pred):
    """Decode generated and reference token ids and score them with ROUGE.

    Args:
        eval_pred: The `(predictions, labels)` pair passed by the trainer.
            Predictions are generated token ids because
            `predict_with_generate=True` is set in the training arguments.

    Returns:
        dict: ROUGE scores rounded to four decimals, keyed by metric name.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 marks ignored positions and cannot be decoded; swap it for padding
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    scores = rouge_metric.compute(
        predictions=decoded_predictions,
        references=decoded_labels,
        use_stemmer=True,
    )
    return {name: round(float(value), 4) for name, value in scores.items()}


# Newer transformers releases take the tokenizer as `processing_class`; older
# ones only know `tokenizer`. Pick whichever the installed version accepts.
_trainer_parameters = inspect.signature(Seq2SeqTrainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_parameters else "tokenizer"

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer},
)

## 2.5 Train model
The model is trained in this section. During training, the model iteratively updates its LoRA adaptor weights based on the training data while periodically evaluating its performance on the validation set.

The trained model is saved to Google Drive such that it can be reloaded later for evaluation without retraining. The tokeniser is also saved to ensure that preprocessing and text generation remain consistent across current and future sessions.

Model was trained in separate sessions as T4 GPU time ran out.

In [None]:
# Run fine-tuning
trainer.train()

# Persist the trained model and the tokenizer side by side on Drive
save_path = "/content/drive/MyDrive/recipe_model/final_model"
for _persist in (trainer.save_model, tokenizer.save_pretrained):
    _persist(save_path)

print(f"Path model is saved to: {save_path}")

In [None]:
# Continue training from previous saved checkpoint
def find_latest_checkpoint(output_dir):
    """Return the path of the highest-numbered `checkpoint-<step>` entry.

    Args:
        output_dir: Directory that holds the trainer's checkpoints.

    Returns:
        str: `output_dir` joined with the checkpoint entry that has the
        largest step number.

    Raises:
        FileNotFoundError: If no entry named `checkpoint-<number>` exists.
    """
    step_pattern = re.compile(r"checkpoint-(\d+)")
    numbered = []
    for entry in os.listdir(output_dir):
        if not entry.startswith("checkpoint"):
            continue
        match = step_pattern.search(entry)
        # Skip look-alike entries without a numeric step instead of crashing
        if match:
            numbered.append((int(match.group(1)), entry))

    if not numbered:
        raise FileNotFoundError(
            f"No 'checkpoint-<step>' entries found in {output_dir}; nothing to resume from."
        )
    return os.path.join(output_dir, max(numbered)[1])


print(OUTPUT_DIR)
latest_checkpoint = find_latest_checkpoint(OUTPUT_DIR)
print(f"Resuming from: {latest_checkpoint}")

In [None]:
# Pick training back up from the checkpoint located in the previous cell
trainer.train(resume_from_checkpoint=latest_checkpoint)

# Persist the trained model and the tokenizer side by side on Drive
save_path = "/content/drive/MyDrive/recipe_model/final_model"
for _persist in (trainer.save_model, tokenizer.save_pretrained):
    _persist(save_path)

print(f"Path model is saved to: {save_path}")

# 3. Model evaluation (FLAN-T5)

## 3.1 Reloading model for evaluation

In [None]:
model_dir = "/content/drive/MyDrive/recipe_model/final_model"

# Decide device and precision once: half precision on GPU, full precision on CPU
cuda_ready = torch.cuda.is_available()
device = torch.device("cuda" if cuda_ready else "cpu")

# Base model the adapter was trained on
base_model = T5ForConditionalGeneration.from_pretrained(
    "google/flan-t5-small",
    torch_dtype=torch.float16 if cuda_ready else torch.float32,
)

# Attach the saved LoRA adapter, fold it into the base weights for faster
# inference, then move the merged model to the chosen device
model = PeftModel.from_pretrained(base_model, model_dir).merge_and_unload().to(device)
model.eval()

# Tokenizer saved alongside the adapter
tokenizer = T5Tokenizer.from_pretrained(model_dir)

## 3.2 Generating sample recipes for qualitative analysis

In [None]:
# Helper function to generate recipe from list of ingredients
def generate_recipe(ingredients, model, tokenizer, max_length=256, max_input_length=128):
    """Generate one recipe from a set of ingredients.

    Args:
        ingredients: A list/tuple of ingredient names, or an already
            comma-separated string.
        model: Model exposing `parameters()` and `generate(...)`.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        max_length: Maximum length passed to `model.generate`.
        max_input_length: Maximum number of prompt tokens kept (longer
            prompts are truncated).

    Returns:
        str: The decoded text of the first generated sequence.
    """
    # Join ingredients
    if isinstance(ingredients, (list, tuple)):
        ingredients_text = ", ".join(ingredients)
    else:
        ingredients_text = ingredients

    input_text = f"Generate a recipe with measurements using: {ingredients_text}"

    # Tokenise
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        max_length=max_input_length,
        truncation=True
    )

    # Move inputs to same device as model
    device = next(model.parameters()).device
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Generate with beam search. `temperature` is intentionally not passed:
    # sampling is not enabled in this call, so the value was unused and only
    # triggered generation-config warnings.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=max_length,
            num_beams=4,
            early_stopping=True,
            no_repeat_ngram_size=3,
        )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Quick check of the helper on a small ingredient list
ingredients = ["chicken breast", "garlic", "olive oil", "lemon", "pepper"]
recipe = generate_recipe(ingredients=ingredients, model=model, tokenizer=tokenizer)
print(recipe)

In [None]:
# Ingredient sets used for a qualitative look at the generations
example_ingredient_sets = [
    ["onion", "salt", "flour"],                      # Example 1
    ["chocolate", "salt"],                           # Example 2
    ["potato", "flour", "onion", "garlic", "oil"],   # Example 3
    ["egg", "leek"],                                 # Example 4
]

# Generate and print each recipe in turn
sample_recipes = []
for ingredients in example_ingredient_sets:
    sample_recipes.append(generate_recipe(ingredients, model, tokenizer))
    print(sample_recipes[-1])

recipe1, recipe2, recipe3, recipe4 = sample_recipes

## 3.3 Random sampling & batch generation
Instead of running inference on the full test set, a smaller representative sample is selected to reduce computation time. Recipe generation is then performed in batches, which allows the model to process multiple inputs simultaneously.

In [None]:
# Evaluate on a 500-recipe subset instead of the full 8000-recipe test split
SAMPLE_SIZE = 500
np.random.seed(42)
n_eval_samples = min(SAMPLE_SIZE, len(test_dataset))
sample_indices = np.random.choice(len(test_dataset), size=n_eval_samples, replace=False)

# Batch generation for speed
def generate_batch(input_texts, model, tokenizer, batch_size=8, max_length=256, max_input_length=128):
    """Generate recipes for many ingredient strings, several at a time.

    Args:
        input_texts: List of comma-separated ingredient strings.
        model: Model exposing `.device` and `generate(...)`.
        tokenizer: Tokenizer used to encode prompts and decode outputs.
        batch_size: Number of prompts generated per forward pass.
        max_length: Maximum length passed to `model.generate`.
        max_input_length: Maximum number of prompt tokens kept per input.

    Returns:
        list[str]: One decoded generation per input, in input order.
    """
    all_predictions = []

    with tqdm(total=len(input_texts), desc="Generating recipes", unit="recipe") as pbar:
        for start in range(0, len(input_texts), batch_size):
            batch = input_texts[start:start + batch_size]
            prompts = ["Generate a recipe with measurements using: " + ing for ing in batch]

            # Tokenise batch
            model_inputs = tokenizer(
                prompts,
                return_tensors="pt",
                max_length=max_input_length,
                truncation=True,
                padding=True
            )
            model_inputs = {k: v.to(model.device) for k, v in model_inputs.items()}

            # Generate for batch with beam search. `temperature` is not
            # passed: sampling is not enabled, so it was unused.
            with torch.no_grad():
                outputs = model.generate(
                    **model_inputs,
                    max_length=max_length,
                    num_beams=4,
                    early_stopping=True,
                    no_repeat_ngram_size=3,
                )

            # Decode batch
            all_predictions.extend(tokenizer.batch_decode(outputs, skip_special_tokens=True))

            # Update progress after every batch. The previous "multiple of
            # 50" check almost never matched when advancing in steps of
            # batch_size.
            pbar.update(len(batch))
            pbar.set_postfix({"completed": f"{len(all_predictions)}/{len(input_texts)}"})

    return all_predictions

# Collect the sampled test rows once, then split them into prompts and targets
print("Generating predictions with batching...")
sampled_rows = [test_dataset[int(i)] for i in sample_indices]
input_texts = [row["input_text"] for row in sampled_rows]
references = [row["target_text"] for row in sampled_rows]

# Run batched generation over the sampled prompts
predictions = generate_batch(input_texts, model, tokenizer, batch_size=8)
print(f"Generated {len(predictions)} predictions")

## 3.4 Run evaluation

In [None]:
# BERTScore between generated and reference recipes
P, R, F1 = bert_score(predictions, references, lang="en", verbose=True)
for _label, _values in (("Precision", P), ("Recall", R), ("F1", F1)):
    print(f"  {_label}: {_values.mean():.4f}")

In [None]:
# Helper function to extract ingredients from recipe text
def extract_ingredients(text):
    """Return the lowercased lines of a recipe's ingredient section.

    The section is the text after the first "ingredients:" marker, up to
    the next "ingredients:" or "steps:" marker if one follows. Blank lines
    are skipped and each line is stripped. An empty list is returned when
    the text has no "ingredients:" marker.
    """
    lowered = text.lower()
    if "ingredients:" not in lowered:
        return []

    section = lowered.split("ingredients:")[1].split("steps:")[0]
    stripped_lines = (line.strip() for line in section.strip().split('\n'))
    return [line for line in stripped_lines if line]

# Ingredient coverage: share of true ingredients that appear in the prediction
def ingredient_coverage(pred_ingredients, true_ingredients):
    """Fraction of true ingredients matched by the predicted ingredients.

    A true ingredient counts as matched when at least one of its
    whitespace-separated words occurs as a substring of the space-joined,
    lowercased predicted ingredients. Returns 0.0 when `true_ingredients`
    is empty.
    """
    if not true_ingredients:
        return 0.0

    haystack = " ".join(pred_ingredients).lower()
    hits = 0
    for ingredient in true_ingredients:
        for token in ingredient.lower().split():
            if token in haystack:
                hits += 1
                break
    return hits / len(true_ingredients)

# Coverage of each reference's ingredient section by the matching prediction
ingredient_coverages = [
    ingredient_coverage(extract_ingredients(pred), extract_ingredients(ref))
    for pred, ref in zip(predictions, references)
]

print(f"\n  Avg Ingredient Coverage: {np.mean(ingredient_coverages):.2%}")

In [None]:
# Print generated vs true recipes side by side for human evaluation
for i in range(min(30, len(predictions))):
    # The marker was stored as mis-decoded UTF-8 bytes; restored to the intended symbol
    print(f"\n🔹 Example {i+1}")
    print(f"Input: {input_texts[i]}")
    print(f"\nGenerated Recipe:\n{predictions[i]}")
    print(f"\nTrue Recipe:\n{references[i]}")
    print("-"*80)

# 4. Ablation Studies (FLAN-T5)



In [None]:
# Sample size for ablation (capped at the size of the test set)
ABLATION_SAMPLE_SIZE = 100
np.random.seed(42)
sample_indices = np.random.choice(
    len(test_dataset),
    size=min(ABLATION_SAMPLE_SIZE, len(test_dataset)),
    replace=False,
)

# Prompt templates to compare; each one must contain an "{ingredients}" placeholder
prompt_formats = {
    # The comma after this first entry was missing, which made the whole cell a SyntaxError
    "format_0": "Generate a recipe with measurements using: {ingredients}", # original
    "format_1": "Create a recipe with ingredient measurements using: {ingredients}", # synonym + measurement emphasis
    "format_2": "Given the ingredients {ingredients}, write a complete recipe including quantities and steps.", # explicit structure
    "format_3": "You are a chef. Develop a detailed recipe with exact measurements for: {ingredients}", # persona + precision
    "format_4": "From these ingredients: {ingredients}, plan and write a step-by-step recipe including measurements." # reasoning cue + step structure
}

# Generate recipes using specific prompt format
def generate_with_prompt(ingredients, prompt_template, model, tokenizer):
    """Generate one recipe using a custom prompt template.

    Args:
        ingredients: Ingredient text substituted into the template.
        prompt_template: Template containing an "{ingredients}" placeholder.
        model: Model exposing `parameters()` and `generate(...)`.
        tokenizer: Tokenizer used to encode the prompt and decode the output.

    Returns:
        str: The decoded text of the first generated sequence.
    """
    # Format the prompt
    prompt = prompt_template.format(ingredients=ingredients)

    # Tokenise
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        max_length=128,
        truncation=True
    )

    # Move inputs to same device as model
    device = next(model.parameters()).device
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Generate with beam search. `temperature` is intentionally not passed:
    # sampling is not enabled in this call, so the value was unused and only
    # triggered generation-config warnings.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=256,
            num_beams=4,
            early_stopping=True,
            no_repeat_ngram_size=3,
        )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

prompt_results = {}

for format_name, prompt_template in prompt_formats.items():
    print(f"\nTesting {format_name}: '{prompt_template}'")

    generations = []
    # Named separately so the `references` list built for the main
    # evaluation is not overwritten by this cell
    ablation_references = []
    coverages = []

    for idx in tqdm(sample_indices, desc=f"Generating with {format_name}"):
        example = test_dataset[int(idx)]
        input_text = example['input_text']
        reference = example['target_text']

        # Extract just ingredients
        if "using:" in input_text.lower():
            ingredients = input_text.split("using:")[-1].strip()
        else:
            ingredients = input_text

        # Generate
        generated = generate_with_prompt(ingredients, prompt_template, model, tokenizer)

        generations.append(generated)
        ablation_references.append(reference)

        # ingredient_coverage expects two lists of ingredient strings.
        # Passing the raw strings made it compare single characters, which
        # scored almost every generation as fully covered.
        requested_ingredients = [
            item.strip() for item in ingredients.split(",") if item.strip()
        ]
        coverage = ingredient_coverage(extract_ingredients(generated), requested_ingredients)
        coverages.append(coverage)

    # Calculate BERTScore
    print(f"  Computing BERTScore...")
    P, R, F1 = bert_score(generations, ablation_references, lang="en", verbose=False, batch_size=32)

    prompt_results[format_name] = {
        'prompt': prompt_template,
        'bertscore_f1': F1.mean().item(),
        'coverage': np.mean(coverages)
    }

    print(f"Results: BERTScore F1={F1.mean():.4f}, Ingredient Coverage={np.mean(coverages):.2%}")

# Create comparison table. Building it row-wise keeps the score columns
# numeric so that round() takes effect; transposing a column-wise frame made
# every column object-typed and left the values unrounded.
ablation_df = pd.DataFrame.from_dict(prompt_results, orient="index")
ablation_df = ablation_df.round(4)
print(ablation_df.to_string())

# 5. Model building (Retriever)
The fine-tuned T5 model (with its LoRA adaptor) is used as a feature extractor: its encoder produces the embeddings used for retrieval.



In [None]:
# Mount Google Drive at /content/drive. The same import and mount already
# appear in section 1.1; they are repeated here at the start of the retriever part.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Folder on Drive that holds the saved LoRA adapter and tokenizer
model_dir = "/content/drive/MyDrive/recipe_model/final_model"

# The adapter config records which base checkpoint the adapter belongs to
config = PeftConfig.from_pretrained(model_dir)
base_model = T5ForConditionalGeneration.from_pretrained(config.base_model_name_or_path)

# Tokenizer saved alongside the adapter
tokenizer = T5Tokenizer.from_pretrained(model_dir)

# Attach the fine-tuned adapter weights and move the model to the available device
device = "cuda" if torch.cuda.is_available() else "cpu"
model = PeftModel.from_pretrained(base_model, model_dir).to(device)
model.eval()

In [None]:
# Batch embedding function
def get_t5_embeddings(texts, model, tokenizer, batch_size=16, device="cuda" if torch.cuda.is_available() else "cpu"):
    """Embed texts with the model's encoder using mean pooling.

    Args:
        texts: A single string or an iterable of strings.
        model: Model whose `base_model.encoder` produces the hidden states.
        tokenizer: Tokenizer used to encode the texts.
        batch_size: Number of texts encoded per forward pass.
        device: Device the model and inputs are moved to.

    Returns:
        numpy.ndarray: One row per input text.

    Raises:
        ValueError: If `texts` is empty.

    Note:
        Pooling averages over real tokens only, so a text gets the same
        embedding whether it is encoded alone or padded inside a batch.
        Embeddings cached with the earlier unmasked pooling must be
        regenerated to stay comparable with new query embeddings.
    """
    # A bare string would otherwise be sliced into batch_size-character
    # chunks and embedded piece by piece
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)
    if not texts:
        raise ValueError("texts must contain at least one string")

    all_embeddings = []
    model = model.to(device)
    model.eval()

    for i in tqdm(range(0, len(texts), batch_size)):
        batch = texts[i:i+batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True).to(device)

        with torch.no_grad():
            hidden_states = model.base_model.encoder(**inputs).last_hidden_state
            # Mean pool across tokens, excluding padding positions
            token_mask = inputs["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
            summed = (hidden_states * token_mask).sum(dim=1)
            token_counts = token_mask.sum(dim=1).clamp(min=1)
            batch_embeddings = summed / token_counts
            all_embeddings.append(batch_embeddings.cpu().numpy())

    return np.vstack(all_embeddings)

In [None]:
# One text per recipe: the string form of its ingredient list
texts = df['ingredients'].astype(str).tolist()

# Embed every recipe, then store the matrix on Drive (np.save appends ".npy")
embeddings_path = "/content/drive/MyDrive/recipe_model/final_model/recipe_embeddings"
recipe_embeddings = get_t5_embeddings(texts, model, tokenizer, batch_size=16)
np.save(embeddings_path, recipe_embeddings)

print("Saved recipe embeddings:", recipe_embeddings.shape)
print("Saved recipe embeddings to:", model_dir)

In [None]:
# Load the cached embeddings from Drive, so the embedding cell directly above does not have to be re-run
recipe_embeddings = np.load("/content/drive/MyDrive/recipe_model/final_model/recipe_embeddings.npy")

In [None]:
# Retrieval function
def recommend_recipes(query, model, tokenizer, recipe_embeddings, recipe_texts, top_k=5):
    """Return the recipes whose embeddings are most similar to the query.

    Args:
        query: The query text (a single string). If a list of strings is
            given, only the first one is used for ranking.
        model: Model passed through to `get_t5_embeddings`.
        tokenizer: Tokenizer passed through to `get_t5_embeddings`.
        recipe_embeddings: Matrix with one embedding row per recipe.
        recipe_texts: Recipe entries aligned with `recipe_embeddings` rows.
        top_k: Number of results to return.

    Returns:
        list[tuple]: `(recipe_text, cosine_similarity)` pairs, most similar first.
    """
    # The embedder works on a list of texts; handing it a bare string made it
    # embed 16-character slices of the query instead of the whole query.
    queries = [query] if isinstance(query, str) else list(query)
    query_emb = get_t5_embeddings(queries, model, tokenizer)
    sims = cosine_similarity(query_emb, recipe_embeddings)[0]
    top_idx = np.argsort(sims)[::-1][:top_k]
    results = [(recipe_texts[i], sims[i]) for i in top_idx]
    return results

# Recipe texts aligned with the rows of recipe_embeddings
texts = df['ingredients'].astype(str).tolist()

query = "chicken, rice, soy sauce"
recommendations = recommend_recipes(query, model, tokenizer, recipe_embeddings, texts)

for recipe, score in recommendations:
    # The separator was stored as mis-decoded UTF-8 bytes; restored to a dash
    print(f"{score:.3f} – {recipe}")

In [None]:
# Retrieve top 5 similar recipes to the query ingredients.
# The query is wrapped in a list because the embedder works on a list of
# texts; a bare string was embedded in 16-character slices. The ranking is
# computed once and reused for both `top_recipes` and the prompt context.
query_emb = get_t5_embeddings([query], model, tokenizer)
sims = cosine_similarity(query_emb, recipe_embeddings)[0]
top_idx = np.argsort(sims)[::-1][:5] # numeric indices

ingredient_lists = df['ingredients'].tolist()
top_recipes = [(ingredient_lists[i], sims[i]) for i in top_idx]

# Build a prompt from the steps of the retrieved recipes
context = "\n\n".join(
    " ".join(df['steps'].iloc[idx]) if isinstance(df['steps'].iloc[idx], list)
    else str(df['steps'].iloc[idx])
    for idx in top_idx
)

prompt = f"""
Available ingredients: {query}.

Reference recipes:
{context}
"""

In [None]:
# Show the prompt assembled from the query and the retrieved recipes' steps
print(prompt)

# 6. Model evaluation (Retriever)

## 6.1 Test query sampling

In [None]:
# Index labels of the test rows in the original dataframe
test_df_indices = test_df.index.tolist()

# Limit evaluation to a subset of the test set for speed
np.random.seed(42)
SAMPLE_SIZE = 500
sample_size = min(SAMPLE_SIZE, len(test_dataset))

# One query per leading test example, keeping both its position in
# test_dataset and its index in the original dataframe
test_queries = [
    {
        'test_dataset_idx': position,
        'original_df_idx': test_df_indices[position],
        'query_ingredients': test_dataset[position]['input_text'],
        'true_recipe': test_dataset[position]['target_text'],
    }
    for position in range(sample_size)
]

## 6.2 Key metrics

In [None]:
# Top-1 similarity
top1_similarities, retrieval_times, all_retrievals = [], [], []

for test_item in tqdm(test_queries, desc="Testing retrieval"):
    query = test_item['query_ingredients']
    query_idx = test_item['original_df_idx']  # row of this query in the original df

    # Time embedding + ranking together
    start_time = time.time()

    query_emb = get_t5_embeddings([query], model, tokenizer)
    sims = cosine_similarity(query_emb, recipe_embeddings)[0]

    # Rank all recipes by similarity, drop the query's own row, keep the best five
    all_indices = np.argsort(sims)[::-1]
    top_idx = list(all_indices[all_indices != query_idx][:5])

    retrieval_time = time.time() - start_time

    best_idx = top_idx[0]
    top1_similarities.append(sims[best_idx])
    retrieval_times.append(retrieval_time)

    # Keep the full retrieval record for the later metrics
    all_retrievals.append({
        'query': query,
        'query_idx': query_idx,
        'top1_idx': best_idx,
        'top1_similarity': sims[best_idx],
        'top5_indices': top_idx,
        'top5_similarities': [sims[i] for i in top_idx],
        'true_recipe': test_item['true_recipe']
    })

# Results
avg_top1_sim = np.mean(top1_similarities)

print(f"Average Top-1 Similarity:{avg_top1_sim:.4f}")

In [None]:
# Precision@5
precision_scores = []

# A retrieved recipe counts as "relevant" when its similarity is >= 0.7
RELEVANCE_THRESHOLD = 0.7

for retrieval in tqdm(all_retrievals, desc="Calculating precision"):
    # (A leftover debug print of the whole retrieval record was removed here;
    # it flooded the output with one dictionary per query.)
    top5_sims = retrieval['top5_similarities']

    # Count how many of top-5 are relevant
    relevant_count = sum(1 for sim in top5_sims if sim >= RELEVANCE_THRESHOLD)

    precision = relevant_count / 5
    precision_scores.append(precision)

avg_precision = np.mean(precision_scores)
std_precision = np.std(precision_scores)

print(f"Average Precision@5:  {avg_precision:.2%}")

# 7. Ablation Studies (Retriever)

In [None]:
# Cosine vs Euclidean vs Manhattan
np.random.seed(42)

RETRIEVAL_SAMPLE_SIZE = 500
K_VALUES = [1, 3, 5, 7, 10]
RELEVANCE_THRESHOLD = 0.7

# Sample test queries and convert indices
retrieval_sample_indices = np.random.choice(
    len(test_dataset),
    size=min(RETRIEVAL_SAMPLE_SIZE, len(test_dataset)),
    replace=False
)
retrieval_sample_indices = [int(idx) for idx in retrieval_sample_indices]


def _distance_similarity(q, E, metric):
    """Convert distances from q to every row of E into similarities.

    Distances are divided by the largest distance and subtracted from 1, so
    the farthest row scores 0. The distance matrix is computed once per call.
    """
    distances = pairwise_distances(q, E, metric=metric)[0]
    largest = distances.max()
    if largest == 0:
        # Every row coincides with the query; avoid dividing by zero
        return np.ones_like(distances)
    return 1 - distances / largest


# Define similarity metrics
similarity_methods = {
    "cosine": lambda q, E: cosine_similarity(q, E)[0],
    "euclidean": lambda q, E: _distance_similarity(q, E, "euclidean"),
    "manhattan": lambda q, E: _distance_similarity(q, E, "manhattan"),
}

precision_at_k_metrics = {name: {k: [] for k in K_VALUES} for name in similarity_methods}
max_k = max(K_VALUES)

for i in tqdm(retrieval_sample_indices, desc="Retrieving recipes", ncols=100):
    query = test_dataset[i]['input_text']

    # The query's own row in recipe_embeddings is identified by its index
    # label in df, not by its position `i` in test_dataset. Comparing against
    # `i` excluded an unrelated recipe and left the self-match in the results.
    # Assumes recipe_embeddings was built from df in row order.
    self_idx = test_df.index[i]

    # Extract ingredients only
    if "using:" in query.lower():
        query_ingredients = query.split("using:")[-1].strip()
    else:
        query_ingredients = query.strip()

    # Compute query embedding
    query_emb = np.array(get_t5_embeddings([query_ingredients], model, tokenizer))

    # Compute Precision@K for each similarity method
    for name, func in similarity_methods.items():
        sims = func(query_emb, recipe_embeddings)
        # Best matches excluding the query itself; only the largest K is needed
        ranked = np.argsort(sims)[::-1]
        top_sims = sims[ranked[ranked != self_idx][:max_k]]

        for k in K_VALUES:
            relevant_count = int((top_sims[:k] >= RELEVANCE_THRESHOLD).sum())
            precision = relevant_count / k
            precision_at_k_metrics[name][k].append(precision)

mean_precision_at_k = {
    name: [np.mean(precision_at_k_metrics[name][k]) for k in K_VALUES]
    for name in similarity_methods
}

In [None]:
# One record per (similarity metric, K) pair with its mean precision
precision_table = [
    {
        "Metric": name,
        "K": k,
        "Mean Precision@K": round(np.mean(precision_at_k_metrics[name][k]), 3),
    }
    for name in similarity_methods
    for k in K_VALUES
]

precision_df = pd.DataFrame(precision_table)

# Show K as rows and metrics as columns
print(precision_df.pivot(index="K", columns="Metric", values="Mean Precision@K").round(3))

In [None]:
# Plot mean Precision@K against K for each similarity metric
fig, ax = plt.subplots(figsize=(9, 5))

colors = {"cosine": "steelblue", "euclidean": "orange", "manhattan": "green"}

for name in similarity_methods:
    ax.plot(
        K_VALUES,
        mean_precision_at_k[name],
        marker='o',
        linewidth=2,
        markersize=6,
        color=colors[name],
        label=f'{name} (mean)',
    )

# Horizontal reference line at the relevance threshold value
ax.axhline(RELEVANCE_THRESHOLD, color='red', linestyle='--', label=f'Threshold ({RELEVANCE_THRESHOLD})')

ax.set_xlabel('K')
ax.set_ylabel('Precision@K')
ax.set_title('Similarity Metric Comparison')
ax.set_xticks(K_VALUES)
ax.set_ylim(0, 1.05)
ax.grid(alpha=0.3)
ax.legend()
fig.tight_layout()
plt.show()