In [2]:
pip install transformers torch accelerate




In [6]:
import pandas as pd

csv_path = "/content/3A2M_EXTENDED.csv"

df = pd.read_csv(
    csv_path,
    engine='python',
    on_bad_lines='skip',
    quotechar='"'
)

print("Dataset shape:", df.shape)
df.head()


Dataset shape: (159492, 6)


Unnamed: 0,title,NER,Extended_NER,genre,label,directions
0,\t Arugula Pomegranate Salad,"[""baby spinach"", ""baby arugula"", ""pomegranate ...","['alfalfa sprouts', 'baby spinach', 'baby arug...",vegetables,4,"[""Toss together spinach and arugula, then plac..."
1,\t Black Bean And Turkey Chili,"[""olive oil"", ""yellow onion"", ""garlic"", ""groun...","['one', 'yellow onion', 'tomato paste', 'about...",sides,8,"[""Dice the onion and mince the garlic. Add the..."
2,\t Finger Lickin' Tofu Nuggets,"[""extra firm"", ""almond flour"", ""nutritional ye...","['extra firm', '2', 'coconut oil', 'almond flo...",nonveg,3,"[""Wrap the tofu in a clean tea towel and press..."
3,\t Jerk Beef Stew With Carrots And Tomatoes,"[""olive oil"", ""boneless beef chuck"", ""onion"", ...","['boneless beef chuck', '2', 'Saute', 'onion',...",vegetables,4,"[""Preheat oven to 350 degrees F."", ""Heat the o..."
4,\t Pomegranate Couscous Salad,"[""pomegranate arils"", ""whole wheat couscous"", ...","['whole wheat couscous', '10 minutes', 'lemon ...",vegetables,4,"[""Place couscous in a bowl with 11/2 cups of h..."


In [7]:

def format_recipe(row):
    try:
        ingredients = ", ".join(eval(row['Extended_NER']))
        directions = " ".join(eval(row['directions']))
    except:
        ingredients = ""
        directions = ""
    return f"Title: {row['title'].strip()}\nIngredients: {ingredients}\nInstructions: {directions}\n"

texts = df.apply(format_recipe, axis=1).tolist()

print(texts[0])


Title: Arugula Pomegranate Salad
Ingredients: alfalfa sprouts, baby spinach, baby arugula, pomegranate arils, persimmon
Instructions: Toss together spinach and arugula, then place in your serving bowl. Remove the stem and leaves of the persimmon, then slice into thin wedges. Arrange the persimmon on top of the spinach and arugula. Garnish with pomegranate arils and alfalfa sprouts.



In [8]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

max_length = 512
encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_length)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [14]:
import torch
from torch.utils.data import Dataset

class RecipeDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = item['input_ids'].clone()
        return item

dataset = RecipeDataset(encodings)


In [15]:
from transformers import GPT2LMHeadModel

model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))


Embedding(50257, 768)

In [22]:
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm import tqdm


subset_size = 1000
subset_dataset = torch.utils.data.Subset(dataset, range(subset_size))

train_loader = DataLoader(subset_dataset, batch_size=4, shuffle=True)

device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
model.train()

optimizer = AdamW(model.parameters(), lr=5e-5)

for epoch in range(3):
    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}")
    for batch in loop:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        loop.set_postfix(loss=loss.item())


model.save_pretrained("recipe_gpt2_demo")
tokenizer.save_pretrained("recipe_gpt2_demo")

model.eval()
sample_prompts = [
    "Title: Spicy Chicken Curry\nIngredients: chicken, onions, garlic, tomatoes\nInstructions:",
    "Title: Vegan Chocolate Cake\nIngredients: flour, cocoa powder, sugar, almond milk\nInstructions:"
]

for prompt in sample_prompts:
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
    generated_ids = model.generate(
        input_ids,
        max_length=200,
        num_return_sequences=1,
        do_sample=True,
        top_k=50,
        top_p=0.95
    )
    recipe = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    print("\nGenerated Recipe:\n", recipe)


Epoch 1: 100%|██████████| 250/250 [01:03<00:00,  3.96it/s, loss=0.663]
Epoch 2: 100%|██████████| 250/250 [01:02<00:00,  3.98it/s, loss=1.32]
Epoch 3: 100%|██████████| 250/250 [01:02<00:00,  3.97it/s, loss=0.655]
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



Generated Recipe:
 Title: Spicy Chicken Curry
Ingredients: chicken, onions, garlic, tomatoes
Instructions: Place chicken in a large bowl. Add the rice, tomatoes and garlic. Whisk. Pour rice mixture over the chicken and toss. Transfer to a colander. When corned, add all remaining ingredients. Bring to a simmer and cook for 15 minutes, until the rice is done to desired doneness.


Generated Recipe:
 Title: Vegan Chocolate Cake
Ingredients: flour, cocoa powder, sugar, almond milk
Instructions: Heat oil and cook till browned. Add almonds and cook till cool. Add flour, cocoa powder, and salt, stir till smooth. Pour into greased loaf pan. Bake at 350° for 45 minutes.



In [23]:
import torch
from torch.utils.data import DataLoader
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from math import exp
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

model = GPT2LMHeadModel.from_pretrained("recipe_gpt2_demo")
tokenizer = GPT2Tokenizer.from_pretrained("recipe_gpt2_demo")
tokenizer.pad_token = tokenizer.eos_token
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
model.eval()

eval_subset_size = 100
eval_dataset = torch.utils.data.Subset(dataset, range(eval_subset_size))
eval_loader = DataLoader(eval_dataset, batch_size=2, shuffle=False)

total_loss = 0
total_tokens = 0
with torch.no_grad():
    for batch in eval_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item() * input_ids.size(1)
        total_tokens += input_ids.size(1)

perplexity = exp(total_loss / total_tokens)
print(f"Perplexity on eval subset: {perplexity:.2f}")

smooth = SmoothingFunction().method1
sample_prompts = [
    "Title: Spicy Chicken Curry\nIngredients: chicken, onions, garlic, tomatoes\nInstructions:",
    "Title: Vegan Chocolate Cake\nIngredients: flour, cocoa powder, sugar, almond milk\nInstructions:"
]

for prompt in sample_prompts:
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
    generated_ids = model.generate(
        input_ids,
        max_length=200,
        num_return_sequences=1,
        do_sample=True,
        top_k=50,
        top_p=0.95
    )
    generated_recipe = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    reference_recipe = prompt
    bleu = sentence_bleu([reference_recipe.split()], generated_recipe.split(), smoothing_function=smooth)
    print("\nPrompt:\n", prompt)
    print("Generated Recipe:\n", generated_recipe)
    print(f"BLEU score: {bleu:.4f}")


Perplexity on eval subset: 3.03

Prompt:
 Title: Spicy Chicken Curry
Ingredients: chicken, onions, garlic, tomatoes
Instructions:
Generated Recipe:
 Title: Spicy Chicken Curry
Ingredients: chicken, onions, garlic, tomatoes
Instructions: Stir all ingredients into pan. Cook over low heat for one hour. If desired, add extra sauce or water if desired.

BLEU score: 0.2858

Prompt:
 Title: Vegan Chocolate Cake
Ingredients: flour, cocoa powder, sugar, almond milk
Instructions:
Generated Recipe:
 Title: Vegan Chocolate Cake
Ingredients: flour, cocoa powder, sugar, almond milk
Instructions: Mix together. Pour into 9x13x2 inch pan. Bake at 350° until brown and crumbly, about 45 minutes.

BLEU score: 0.3800


In [24]:
prompt = "Title: Chocolate Chip Cookies\nIngredients: flour, sugar, butter, eggs, chocolate chips\nInstructions:"
input_ids = tokenizer.encode(prompt, return_tensors='pt').to(model.device)

output = model.generate(
    input_ids,
    max_length=200,
    num_return_sequences=1,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    temperature=0.8
)

generated_recipe = tokenizer.decode(output[0], skip_special_tokens=True)
print("Generated Recipe:\n", generated_recipe)


Generated Recipe:
 Title: Chocolate Chip Cookies
Ingredients: flour, sugar, butter, eggs, chocolate chips
Instructions: Mix together dry ingredients and add butter and eggs. Pour into greased and floured cookie sheets. Bake at 350° for 2 to 3 minutes.

