In [1]:
!pip install -U bitsandbytes

!pip install -U transformers

Collecting bitsandbytes
  Downloading bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-c

In [1]:
# Imports (a successful run also confirms the packages installed above can be loaded)
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig
)

from datasets import load_from_disk
from huggingface_hub import login

from google.colab import drive
import os

import pandas as pd
import re

In [2]:
# Mount point for Google Drive inside the Colab VM; later cells build data paths from `root`
root = '/content/drive'
drive.mount(root)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# SECURITY: an access token must never be written into the notebook source.
# Any token that has appeared in a saved/shared copy of this notebook should be
# treated as leaked and revoked at https://huggingface.co/settings/tokens.
# The token is read from the HF_TOKEN environment variable; if it is not set,
# it is requested interactively so it is never stored in the file.
from getpass import getpass

if not os.environ.get('HF_TOKEN'):
    os.environ['HF_TOKEN'] = getpass('Hugging Face token: ')
login(token=os.environ["HF_TOKEN"])

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [5]:
# Location of the RecipeNLG project folder on the mounted Drive (`root` is the Drive mount point)
recipe_nlg_data_path = f'{root}/MyDrive/NLP-266/Project/RecipeNLG'

# Load the preprocessed DatasetDict from disk
dataset = load_from_disk(f'file://{recipe_nlg_data_path}/processed_recipe_nlg_dataset')

# Show the available splits with their columns and row counts
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['title', 'ingredients', 'directions', 'source', 'NER', 'n_ingredients', 'n_steps', 'n_ner', 'domain', 'avg_step_length', 'total_step_length'],
        num_rows: 349677
    })
    validation: Dataset({
        features: ['title', 'ingredients', 'directions', 'source', 'NER', 'n_ingredients', 'n_steps', 'n_ner', 'domain', 'avg_step_length', 'total_step_length'],
        num_rows: 3670
    })
    test: Dataset({
        features: ['title', 'ingredients', 'directions', 'source', 'NER', 'n_ingredients', 'n_steps', 'n_ner', 'domain', 'avg_step_length', 'total_step_length'],
        num_rows: 3671
    })
    prompt: Dataset({
        features: ['title', 'ingredients', 'directions', 'source', 'NER', 'n_ingredients', 'n_steps', 'n_ner', 'domain', 'avg_step_length', 'total_step_length'],
        num_rows: 10000
    })
})


In [6]:
!pip install -U sentence-transformers


Collecting sentence-transformers
  Downloading sentence_transformers-5.0.0-py3-none-any.whl.metadata (16 kB)
Downloading sentence_transformers-5.0.0-py3-none-any.whl (470 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m470.2/470.2 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
  Attempting uninstall: sentence-transformers
    Found existing installation: sentence-transformers 4.1.0
    Uninstalling sentence-transformers-4.1.0:
      Successfully uninstalled sentence-transformers-4.1.0
Successfully installed sentence-transformers-5.0.0


In [18]:
from sentence_transformers import SentenceTransformer

model_id = 'multi-qa-MiniLM-L6-cos-v1'

# Load the sentence-embedding model (change `model_id` to try a different checkpoint)
model = SentenceTransformer(model_id)

# Smoke-test input: two identical sentences, so their cosine similarity should be 1.0
sentences = [
    "This is a simple sentence.",
    "This is a simple sentence."
]

# Encode to dense vectors, returned as a single torch tensor with one row per sentence.
# For this checkpoint the recorded shape is (2, 384), i.e. 384-dimensional embeddings.
embeddings = model.encode(sentences, convert_to_tensor=True)

# Display the shape so the embedding dimensionality is visible in the cell output
embeddings.shape



torch.Size([2, 384])


In [19]:
from torch.nn.functional import cosine_similarity

# Compare the two sentence vectors; each is 1-D, so the similarity is reduced over dim 0
first_vec, second_vec = embeddings[0], embeddings[1]
similarity = cosine_similarity(first_vec, second_vec, dim=0)
print(f"Cosine Similarity: {similarity.item():.4f}")


Cosine Similarity: 1.0000


In [27]:
# Hold out a fixed-size pool of training recipes ("prompt" split) for retrieval.
# An integer `test_size` is an absolute row count, which states the intent directly
# instead of a hand-computed fraction of the training set.
N_PROMPT_EXAMPLES = 10_000
SPLIT_SEED = 42

if 'prompt' in dataset:
    # The loaded dataset already carries the held-out pool (it is saved back to disk
    # below). Re-splitting would shrink `train` again on every re-run, so reuse it.
    train_data, prompt_data = dataset['train'], dataset['prompt']
else:
    _split = dataset['train'].train_test_split(
        test_size=N_PROMPT_EXAMPLES,
        shuffle=True,
        seed=SPLIT_SEED,
    )
    # Select by key rather than relying on the order of `.values()`
    train_data, prompt_data = _split['train'], _split['test']

In [28]:
# Display the held-out pool to check its columns and row count
prompt_data

Dataset({
    features: ['title', 'ingredients', 'directions', 'source', 'NER', 'n_ingredients', 'n_steps', 'n_ner', 'domain', 'avg_step_length', 'total_step_length'],
    num_rows: 10000
})

In [56]:
# Replace the train split with the reduced one and register the held-out pool as 'prompt'
dataset['train'] = train_data
dataset['prompt'] = prompt_data
# NOTE(review): this writes to the same directory `dataset` was loaded from, so the
# on-disk dataset is overwritten in place. Confirm that this is intended and that the
# installed `datasets` version allows a dataset to overwrite its own files.
dataset.save_to_disk(f'file://{recipe_nlg_data_path}/processed_recipe_nlg_dataset')

Saving the dataset (0/1 shards):   0%|          | 0/349677 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3670 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3671 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

In [57]:
# Work on the evaluation split and the held-out retrieval pool
test, prompt = dataset['test'], dataset['prompt']

# Give every row a stable integer id equal to its position within its split
test = test.add_column("test_id", list(range(test.num_rows)))
prompt = prompt.add_column("prompt_id", list(range(prompt.num_rows)))


  return cls._concat_blocks(pa_tables_to_concat_vertically, axis=0)


In [58]:
nl = '\n'


def _title_and_ingredients(ds):
    """Build the text that gets embedded for each row of `ds`.

    Returns a list with one string per row, formed as the row's title, a newline,
    and the row's ingredients field. Only the two required columns are read,
    instead of materialising every column of every row.
    """
    return [
        f"{title}{nl}{ingredients}"
        for title, ingredients in zip(ds['title'], ds['ingredients'])
    ]


# Same representation for both splits so their embeddings are comparable
test = test.add_column("repr", _title_and_ingredients(test))
prompt = prompt.add_column("repr", _title_and_ingredients(prompt))

In [59]:
def embed(example):
    """Encode the "repr" texts of a batch with the module-level `model`.

    `example` is the batch mapping passed by `Dataset.map(..., batched=True)`;
    only its "repr" entry is read. Returns a dict whose "embedding" entry is the
    encoder output converted to nested Python lists.
    """
    texts = example["repr"]
    with torch.no_grad():
        encoded = model.encode(texts)
    return dict(embedding=encoded.tolist())

# Add an "embedding" column to both splits, encoding the "repr" text 16 rows at a time
test = test.map(embed, batched=True, batch_size=16)
prompt = prompt.map(embed, batched=True, batch_size=16)

Map:   0%|          | 0/3671 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [61]:
from datasets import DatasetDict

# Bundle the embedded splits; the held-out pool is stored under the name 'retrieval'
embed_dataset = DatasetDict({'test': test, 'retrieval': prompt})
embed_dataset.save_to_disk(f'file://{recipe_nlg_data_path}/test_retrieval_embeddings')

Saving the dataset (0/1 shards):   0%|          | 0/3671 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

In [6]:
# Reload the embedded 'test' and 'retrieval' splits from disk so the encoding step need not be re-run
embed_dataset = load_from_disk(f'file://{recipe_nlg_data_path}/test_retrieval_embeddings')

In [46]:
# Goal: for each test example, find the 3 closest embeddings from retrieval and store their ids.
# As a first step, look at a couple of test rows. A `for` statement needs a body to be
# valid Python; printing the row matches the output recorded for this cell.
for i, test_example in enumerate(test.select([1, 2])):
    print(test_example)


{'title': 'Double-Duty Slow-Cooked Lemony Spring Veggies', 'ingredients': '["4 medium carrots, halved lengthwise and cut into 1-inch pieces", "1 large sweet onion, coarsely chopped", "1-1/2 pounds baby red potatoes, quartered", "3 tablespoons butter, melted", "3/4 teaspoon salt", "1/4 teaspoon pepper", "1 cup frozen peas (about 4 ounces)", "1 teaspoon grated lemon zest", "2 tablespoons minced fresh chives"]', 'directions': '["Place carrots and onion in a 4-qt. slow cooker; top with potatoes. Drizzle with melted butter; sprinkle with salt and pepper. Cook, covered, on low 4-5 hours or until vegetables are tender.", "Add peas to slow cooker. Cook, covered, on high 10-15 minutes or until heated through. Stir in lemon zest.", "Reserve 3 cups potato mixture for Double-Duty Hearty Chicken & Vegetable Soup. To serve remaining potato mixture, sprinkle with chives."]', 'source': 'Gathered', 'NER': '["carrots", "sweet onion", "baby red potatoes", "butter", "salt", "pepper", "frozen peas", "lemon

In [7]:
!pip install faiss-gpu-cu12




In [24]:
import numpy as np
import faiss

# Scale every test embedding to unit L2 norm so that inner product equals cosine similarity
test = embed_dataset['test']
embeddings_np = np.array(test["embedding"])
row_norms = np.linalg.norm(embeddings_np, axis=1, keepdims=True)
normed = embeddings_np / row_norms

# Swap the raw "embedding" column for the normalised one
test = test.remove_columns("embedding").add_column("embedding", normed.tolist())

# Attach an inner-product FAISS index on the column; the returned dataset is the cell's output
test.add_faiss_index(
    column="embedding",
    metric_type=faiss.METRIC_INNER_PRODUCT,
)


  return cls._concat_blocks(pa_tables_to_concat_vertically, axis=0)


  0%|          | 0/4 [00:00<?, ?it/s]

Dataset({
    features: ['title', 'ingredients', 'directions', 'source', 'NER', 'n_ingredients', 'n_steps', 'n_ner', 'domain', 'avg_step_length', 'total_step_length', 'test_id', 'repr', 'embedding'],
    num_rows: 3671
})

In [25]:
# Scale every retrieval embedding to unit L2 norm so that inner product equals cosine similarity
retrieval = embed_dataset['retrieval']
embeddings_np_r = np.array(retrieval["embedding"])
row_norms_r = np.linalg.norm(embeddings_np_r, axis=1, keepdims=True)
normed_r = embeddings_np_r / row_norms_r

# Swap the raw "embedding" column for the normalised one
retrieval = retrieval.remove_columns("embedding").add_column("embedding", normed_r.tolist())

# Attach an inner-product FAISS index on the column; the returned dataset is the cell's output
retrieval.add_faiss_index(
    column="embedding",
    metric_type=faiss.METRIC_INNER_PRODUCT,
)

  return cls._concat_blocks(pa_tables_to_concat_vertically, axis=0)


  0%|          | 0/10 [00:00<?, ?it/s]

Dataset({
    features: ['title', 'ingredients', 'directions', 'source', 'NER', 'n_ingredients', 'n_steps', 'n_ner', 'domain', 'avg_step_length', 'total_step_length', 'prompt_id', 'repr', 'embedding'],
    num_rows: 10000
})

In [34]:
from pprint import pprint

# Use the first test row as the query and rescale its embedding to unit length
reference_row = test[0]
query_vector = reference_row["embedding"]
query_vector = query_vector / np.linalg.norm(query_vector)

# Fetch the 3 most similar rows from the retrieval pool
scores, retrieved_examples = retrieval.get_nearest_examples(
    index_name="embedding", query=query_vector, k=3
)

pprint(f"Reference: {reference_row['repr']}")
pprint(f"Nearest Examples: {retrieved_examples['repr']}")  # texts of the retrieved rows
pprint(scores)  # inner-product scores, which equal cosine similarity for unit-length vectors


("Reference: Dinah'S Stuffed Mushrooms\n"
 '["20 fresh mushrooms, stems removed", "2 (6.5 ounce) cans minced clams, '
 'drained", "2 cloves garlic, peeled and minced", "1/2 cup grated Parmesan '
 'cheese", "1 small onion, finely chopped", "3/4 cup dry bread crumbs", "1/2 '
 'cup chopped green bell pepper", "2 tablespoons dried parsley", "2 '
 'tablespoons Italian-style seasoning", "ground black pepper to taste", "1 1/2 '
 'cups butter, melted", "1/2 cup shredded mozzarella cheese"]')
('Nearest Examples: [\'Stuffed Mushrooms (Gefullte Pilze)\\n["12 whole '
 'Mushrooms", "1 Tablespoon Olive Oil", "13 cups Onions, Finely Diced", "4 '
 'Tablespoons Plain Breadcrumbs", "4 Tablespoons Whipping Cream, Divided", "1 '
 'pinch Cayenne Pepper", "1 Tablespoon Fresh Parsley, Chopped", "1/2 teaspoons '
 'Salt", "1/4 teaspoons Black Pepper", "1/4 teaspoons Paprika", "2 Tablespoons '
 'Parmesan Cheese, Shredded"]\', \'Stuffed Mushrooms\\n["2 tsp oil", "1 None '
 'red onion, finely chopped", "2 cloves 

In [35]:
# `prompt_id` values of the rows returned by the query above
retrieved_examples['prompt_id']

[1773, 2235, 525]

In [42]:
def get_nearest_examples(test_example, k=3):
    """Look up the `k` nearest retrieval rows for a single test row.

    Reads `test_example["embedding"]`, rescales it to unit length and queries the
    "embedding" index of the module-level `retrieval` dataset.

    Returns a dict whose "retrieval_neighbors" entry holds the `prompt_id` values
    of the retrieved rows, in the order the index returned them.
    """
    raw_query = np.asarray(test_example["embedding"])
    unit_query = raw_query / np.linalg.norm(raw_query)
    _, neighbours = retrieval.get_nearest_examples(
        index_name="embedding",
        query=unit_query,
        k=k,
    )
    return {"retrieval_neighbors": neighbours['prompt_id']}

# Adds a "retrieval_neighbors" column, issuing one index query per test row (batched=False).
# Requires the "embedding" index to still be attached to `retrieval`; a later cell drops it.
test = test.map(get_nearest_examples, batched=False)

Map:   0%|          | 0/3671 [00:00<?, ? examples/s]

In [45]:
# Spot check: neighbour ids stored for test row 500
test['retrieval_neighbors'][500]

[3587, 9713, 1944]

In [47]:
# Title of the same test row, for comparison with its retrieved neighbours
test[500]['title']

'Yummy chocolate egg less mug cake'

In [51]:
# Detach the FAISS index from `retrieval`.
# NOTE(review): presumably needed because `select`/saving is not permitted while an index
# is attached — confirm. After this, nearest-neighbour queries on `retrieval` no longer work.
retrieval.drop_index("embedding")

In [54]:
# Titles of the neighbours retrieved for test row 500. `select` takes row positions, while the
# stored values are `prompt_id`s; these were assigned as range(len(prompt)).
# NOTE(review): this relies on the row order of `retrieval` being unchanged since then — confirm.
retrieval.select(test['retrieval_neighbors'][500])['title']

['Chocolate Cake ', 'Chocolate Cake Iii', 'Chocolate Cake']

In [57]:
from datasets import DatasetDict

# Persist the test split (now carrying "retrieval_neighbors") together with the retrieval pool
embed_dataset = DatasetDict({'test': test, 'retrieval': retrieval})
embed_dataset.save_to_disk(f'file://{recipe_nlg_data_path}/test_retrieval_embeddings')

Saving the dataset (0/1 shards):   0%|          | 0/3671 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]