In [45]:
# 1. Install dependencies
!pip install --quiet transformers datasets accelerate evaluate

import os
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)

In [46]:
from datasets import load_dataset

dataset = load_dataset("BhavaishKumar112/Food_Recipe")
print("Dataset columns:", dataset['train'].column_names)
print("Example entry:\n", dataset['train'][0])  # inspect first example, then set the field names below

Dataset columns: ['name', 'description', 'cuisine', 'course', 'diet', 'ingredients_name', 'ingredients_quantity', 'prep_time (in mins)', 'cook_time (in mins)', 'instructions', 'image_url']
Example entry:
 {'name': 'Greek Style Broken Wheat Salad with Chickpeas, Fresh Mint & Feta - A Mediterranean Powerhouse', 'description': 'This Greek Style Broken Wheat Salad with Chickpeas, Fresh Mint & Feta is a refreshing and nutritious Mediterranean-inspired dish. The nutty flavor of broken wheat pairs perfectly with protein-rich chickpeas, fresh mint, and crumbly feta cheese, creating a balanced, fiber-packed meal. Lightly dressed with lemon juice and olive oil, this salad is a healthy option that bursts with flavor. It’s an ideal meal for lunch or dinner, or even as a side dish for a Mediterranean feast.', 'cuisine': 'Greek', 'course': 'World Breakfast', 'diet': 'Diabetic Friendly', 'ingredients_name': 'Broken Wheat (Dalia/ Godumai Rava), Kabuli Chana (White Chickpeas), Garlic, Onions, Carrot (G

In [48]:
print("Example entry:\n", dataset['train'][0][dataset['train'].column_names[6]])  # inspect first example, then set the field names below

Example entry:
 1 cup Broken Wheat (Dalia/ Godumai Rava)  1/2 cup Kabuli Chana (White Chickpeas) , soaked for 8 hours and boiled  4 cloves Garlic , finely chopped  2 Onions , thinly sliced  1 Carrot (Gajjar) , finely chopped  1 Red Bell pepper (Capsicum) , finely chopped  1/2 cup Broccoli , cut into tiny florets  1/8 teaspoon Cinnamon Powder (Dalchini)  3 tablespoons Lemon juice  1/2 teaspoon Caster Sugar  1/2 cup Feta Cheese , crumbled  1/4 cup Mint Leaves (Pudina) , chopped  3 tablespoon Extra Virgin Olive Oil Salt and Pepper , for seasoning


In [49]:
# === SET THESE BASED ON CHECK ABOVE ===
key_title = "name"                   # recipe title field
key_ing_name = "ingredients_name"    # ingredient names field
key_ing_qty  = "ingredients_quantity"# ingredient quantities field
key_inst = "instructions"            # instructions field
# ======================================

In [53]:
# 3. Split into train/validation and filter null entries
split = dataset["train"].train_test_split(test_size=0.1, seed=42)
train_ds = split["train"].filter(
    lambda x: x[key_ing_name] is not None and x[key_ing_qty] is not None and x[key_inst] is not None
)
eval_ds  = split["test"].filter(
    lambda x: x[key_ing_name] is not None and x[key_ing_qty] is not None and x[key_inst] is not None
)


In [55]:
# 4. Preprocess: build input-output pairs

def preprocess(example):
    title = example[key_title].strip()
    names = example[key_ing_name].strip()
    qtys  = example[key_ing_qty].strip()
    instr = example[key_inst].strip().replace("\n", " ")

    # Input prompt uses ingredient names only
    prompt = f"Suggest a detailed recipe given ingredients: {names}."
    # Target includes title, ingredients with quantities, and instructions
    #name_list = [n.strip() for n in names.split(",")]
    #qty_list  = [q.strip() for q in qtys.split(",")]
    # ensure same length
    #pairs = zip(name_list, qty_list) if len(name_list)==len(qty_list) else [(n, '') for n in name_list]
    #ingredients_info = "; ".join([f"{n}: {q}" for n, q in pairs])
    target = (
        f"Recipe name: {title}\n"
        f"Ingredients & quantities: {qtys}\n"
        f"Instructions: {instr}"
    )
    return {"input_text": prompt, "target_text": target}

# Apply preprocessing
train_ds = train_ds.map(preprocess, remove_columns=dataset['train'].column_names)
eval_ds  = eval_ds.map(preprocess,  remove_columns=dataset['train'].column_names)

Map:   0%|          | 0/6385 [00:00<?, ? examples/s]

Map:   0%|          | 0/711 [00:00<?, ? examples/s]

In [None]:
# 4. Preprocess: build input-output pairs

def preprocess(example):
    title = example[key_title].strip()
    qtys  = example[key_ing_qty].strip()
    instr = example[key_inst].strip().replace("\n", " ")

    # Input prompt uses ingredient names only
    prompt = f"Suggest a detailed recipe given ingredients: {names}."
    
    # Target includes title, ingredients with quantities, and instructions
    target = (
        f"Recipe name: {title}\n"
        f"Ingredients & quantities: {qtys}\n"
        f"Instructions: {instr}"
    )
    return {"input_text": prompt, "target_text": target}

# Apply preprocessing
train_ds = train_ds.map(preprocess, remove_columns=dataset['train'].column_names)
eval_ds  = eval_ds.map(preprocess,  remove_columns=dataset['train'].column_names)

In [57]:
# 5. Load tokenizer and model
model_name = "t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model     = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [58]:
# 6. Tokenize data
def tokenize_fn(batch):
    inputs  = tokenizer(batch["input_text"],  max_length=512, truncation=True)
    targets = tokenizer(batch["target_text"], max_length=512, truncation=True)
    inputs["labels"] = targets["input_ids"]
    return inputs

train_tok = train_ds.map(tokenize_fn, batched=True, remove_columns=["input_text","target_text"])
eval_tok  = eval_ds.map(tokenize_fn,  batched=True, remove_columns=["input_text","target_text"])

Map:   0%|          | 0/6385 [00:00<?, ? examples/s]

Map:   0%|          | 0/711 [00:00<?, ? examples/s]

In [61]:
# 7. Data collator

from transformers import DataCollatorForSeq2Seq
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [63]:
import wandb

# 初始化 W&B
wandb.init(project="recipes")


wandb: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
wandb: Currently logged in as: yangsunchengrui (yangsunchengrui-none). Use `wandb login --relogin` to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011111111111111112, max=1.0…

In [64]:
# 9. Training arguments
from transformers import Seq2SeqTrainingArguments
training_args = Seq2SeqTrainingArguments(
    output_dir="C:/Users/YSCR/Desktop/study/hkbu/semester2/7065/course project",
    eval_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=1000,
    logging_strategy="steps",
    logging_steps=200,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,
    predict_with_generate=True,
    report_to="wandb"
)

In [67]:
# 10. Trainer setup
from transformers import Seq2SeqTrainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=eval_tok,
    tokenizer=tokenizer,
    data_collator=data_collator
)


  trainer = Seq2SeqTrainer(


In [69]:
# 10. Train!
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss,Validation Loss
500,2.2523,1.88208
1000,2.0326,1.760895
1500,1.9037,1.69743
2000,1.8342,1.658525
2500,1.8226,1.632445
3000,1.7547,1.611899
3500,1.7667,1.600562
4000,1.7569,1.58934
4500,1.7576,1.584737


TrainOutput(global_step=4791, training_loss=1.9127677036859476, metrics={'train_runtime': 7695.9095, 'train_samples_per_second': 2.489, 'train_steps_per_second': 0.623, 'total_flos': 2689576703262720.0, 'train_loss': 1.9127677036859476, 'epoch': 3.0})

In [73]:
# 11. Save the fine-tuned model and tokenizer
model.save_pretrained("C:/Users/YSCR/Desktop/study/hkbu/semester2/7065/course project/recipe_model")
tokenizer.save_pretrained("C:/Users/YSCR/Desktop/study/hkbu/semester2/7065/course project/recipe_model")
print("Fine-tuned model saved to C:/Users/YSCR/Desktop/study/hkbu/semester2/7065/course project/recipe_model")

Fine-tuned model saved to C:/Users/YSCR/Desktop/study/hkbu/semester2/7065/course project/recipe_model


In [75]:
# 12. Test generation
def generate_recipe(ingredients_list, max_length=256):
    prompt = f"Suggest a detailed recipe given ingredients: {ingredients_list}."
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_length=max_length, num_beams=4, early_stopping=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [81]:
# Example usage
test_ing = "tomato, basil, garlic, olive oil, salt"
print(generate_recipe(test_ing))

Recipe name: Tomato Basil Pesto Recipe Ingredients & quantities: 1 tomato , finely chopped 1 basil , finely chopped 2 cloves garlic , finely chopped 1 tablespoon olive oil Salt , to taste Instructions: To begin making the Tomato Basil Pesto Recipe, heat olive oil in a heavy bottomed pan over medium heat. Add the tomatoes, basil, garlic and saute for a few seconds.Add the tomatoes and saute for a few seconds.Add the tomatoes and saute for a few seconds.Add the tomatoes and saute for a few seconds.Add the tomatoes and saute for a few seconds.Add the tomatoes and saute for a few seconds.Add the tomatoes and saute for a few seconds until the tomatoes are soft and translucent.Add the basil and saute for a few seconds.Add the tomatoes and saute for a few seconds.Add the tomatoes and saute for a few seconds until the tomatoes are soft and translucent.Once the tomatoes are soft, add the garlic and saute for a few seconds until the tomatoes are soft and translucent.Once the tomatoes are soft, a

In [79]:
#from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
#import torch

# 1. 定义模型路径（替换为你的实际路径）
#model_path = "C:/Users/YSCR/Desktop/study/hkbu/semester2/7065/course project/recipe_model"

# 2. 加载模型和分词器
#tokenizer = AutoTokenizer.from_pretrained(model_path)
#model = AutoModelForSeq2SeqLM.from_pretrained(model_path).to("cuda" if torch.cuda.is_available() else "cpu")

# 3. 定义生成函数
#def generate_recipe(ingredients_list):
    """
    输入食材列表，生成完整食谱
    Input: ingredients list (e.g., "tomato, onion, garlic")
    Output: Full recipe with name, ingredients, and instructions
    """
    #prompt = f"Suggest a detailed recipe given ingredients: {ingredients_list}."
    #inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    #outputs = model.generate(
        #**inputs,
        #max_length=512,          # 最大生成长度
        #num_beams=4,             # 束搜索参数
        #early_stopping=True      # 提前停止
    #)
    #return tokenizer.decode(outputs[0], skip_special_tokens=True)

# 4. 测试调用
#test_ingredients = "chicken, rice, soy sauce"
#print(generate_recipe(test_ingredients))


Recipe name: Soy Chicken Rice Recipe Ingredients & quantities: 2 cups chicken , cut into cubes 1 cup rice 1 teaspoon soy sauce Instructions: To begin making the Soy Chicken Rice Recipe, heat a pan with oil and add the chicken to the pan. Add the rice and soy sauce to the pan. Add the rice and soy sauce to the pan and cook until the rice is cooked through.Once the rice is cooked through, turn off the heat and allow the rice to cool down.Once the rice is cooked, add the rice and soy sauce to the pan and allow it to cool down.Serve Soy Chicken Rice Rice Recipe along with Soy Chicken Rice Recipe for a weeknight dinner.


In [77]:
# Example usage
test_ing = "tomato, basil, garlic, olive oil, salt"
print(generate_recipe(test_ing))

Recipe name: Tomato Basil Curry Recipe Ingredients & quantities: 1 tomato , sliced 1 teaspoon basil 1 teaspoon garlic 1 tablespoon olive oil salt , to taste Instructions: To begin making the Tomato Basil Curry Recipe, first prep all the ingredients and keep them ready. In a large bowl, combine the tomatoes, basil, garlic, olive oil and salt.Once the tomatoes are steamed, drain the water from the tomatoes and keep it aside.Place the tomatoes in a bowl and cover it with a lid and allow it to cool down.Heat olive oil in a small pan and add the tomatoes and garlic and sauté until the tomatoes are softened.Serve the Tomato Basil Curry Recipe as a tea time snack along with a cup of water for a weeknight dinner.


In [85]:
!pip install rouge_score absl-py

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting absl-py
  Using cached absl_py-2.2.2-py3-none-any.whl.metadata (2.6 kB)

In [99]:
# 导入评估库
import evaluate
import numpy as np
from tqdm import tqdm  # 进度条

# 加载指标
bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")

def evaluate_model(model, tokenizer, eval_dataset, num_samples=50):
    """
    评估模型生成质量（BLEU/ROUGE）和预测准确率
    Evaluate model generation quality (BLEU/ROUGE) and prediction accuracy
    """
    model.eval()  # 切换为评估模式
    
    # 随机选择部分评估数据
    eval_samples = eval_dataset.select(range(min(num_samples, len(eval_dataset))))
    
    # 存储结果
    predictions, references = [], []
    correct_predictions = 0
    
    for sample in tqdm(eval_samples, desc="Evaluating"):
        # 生成预测
        input_text = sample["input_text"]
        inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
        outputs = model.generate(**inputs, max_length=512)
        prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
        
        # 获取真实值
        target_text = sample["target_text"]
        
        # 记录结果
        predictions.append(prediction)
        references.append([target_text])  # BLEU需要reference为列表形式
        
        # 简单检查菜名是否预测正确（基础准确率）
        if "Recipe name:" in prediction and "Recipe name:" in target_text:
            pred_name = prediction.split("Recipe name:")[1].split("\n")[0].strip()
            true_name = target_text.split("Recipe name:")[1].split("\n")[0].strip()
            if pred_name == true_name:
                correct_predictions += 1
    
    # 计算指标
    bleu_results = bleu_metric.compute(predictions=predictions, references=references)
    rouge_results = rouge_metric.compute(predictions=predictions, references=references)
    accuracy = correct_predictions / len(eval_samples)
    
    return {
        "bleu": bleu_results["bleu"],
        "rouge1": rouge_results["rouge1"],
        "rouge2": rouge_results["rouge2"],
        "rougeL": rouge_results["rougeL"],
        "name_accuracy": accuracy
    }


In [115]:
# 运行评估
results = evaluate_model(model, tokenizer, eval_ds, num_samples=50)

Evaluating: 100%|██████████| 50/50 [06:56<00:00,  8.34s/it]


In [123]:
print("Evaluation result:",results)

Evaluation result: {'bleu': 0.39356997424306843, 'rouge1': 0.45960177281704033, 'rouge2': 0.32018395568410966, 'rougeL': 0.4106102389669763, 'name_accuracy': 0.73277075}
