In [1]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp310-cp310-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m48.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0


In [2]:
import ast
import time

import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

In [3]:
# Read the enriched recipe table from the Kaggle input directory, then leave
# the frame as the cell's last expression so the notebook renders it.
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/recipes/enriched_cleaned_recipes.csv',
)
df

Unnamed: 0.1,Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,n_steps,steps,description,...,rating_mean,rating_count,all_reviews,calories,total_fat_pdv,sugar_pdv,sodium_pdv,protein_pdv,saturated_fat_pdv,carbohydrates_pdv
0,0,arriba baked winter squash mexican style,137739.0,55.0,47892.0,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...",11.0,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,...,5.0,3.0,[' I used an acorn squash and recipe#137681 Sw...,51.5,0.0,13.0,0.0,2.0,0.0,4.0
1,130923,marinated greek feta,141627.0,5.0,197023.0,2005-10-17,"['15-minutes-or-less', 'time-to-make', 'course...",4.0,['cut the feta cheese into 2cm cubes and place...,"i found this simple, super-quick-to-make recip...",...,5.0,8.0,['Just delicious and a wonderful way to keep m...,280.9,18.0,39.0,23.0,36.0,22.0,9.0
2,130921,marinated goat cheese with fresh oregano,63342.0,65.0,56081.0,2003-05-28,"['weeknight', 'time-to-make', 'course', 'main-...",2.0,"['place oil , oregano and pepper in a narrow s...",another one from the atkins low carb cook book...,...,5.0,1.0,"[""This was really good! I wasn't sure if I li...",284.7,16.0,10.0,24.0,46.0,9.0,7.0
3,130920,marinated goat cheese spread dip,258115.0,195.0,444132.0,2007-10-10,"['time-to-make', 'course', 'main-ingredient', ...",6.0,['arrange cheese in a shallow dish in a single...,a decedent spread that wins raves at potlucks ...,...,5.0,3.0,"[""I'm not a big fan of goat cheese, but made t...",555.9,58.0,13.0,56.0,83.0,73.0,4.0
4,130917,marinated goat cheese,56514.0,2885.0,62043.0,2003-03-18,"['weeknight', 'time-to-make', 'course', 'main-...",4.0,['cut goat cheese into 1 cm rounds and place i...,this recipe is from the ottawa citizen. this i...,...,5.0,2.0,['This is a delicious appetizer to have on han...,325.6,13.0,10.0,27.0,13.0,27.0,18.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
160867,2673,affordable basic restaurant style tomato and m...,115199.0,15.0,118159.0,2005-04-04,"['15-minutes-or-less', 'time-to-make', 'course...",6.0,"['in a medium saucepan', 'bring salted water ,...",ever had this comfort soup in a restaurant? t...,...,3.9,10.0,"['hello Andre, I have finally come on to rate...",291.2,26.0,73.0,7.0,13.0,59.0,9.0
160868,174846,roast pork pernil puerto rican style,316839.0,255.0,812387.0,2008-07-31,"['main-ingredient', 'cuisine', 'preparation', ...",10.0,['the night before peel garlic and with a piln...,"tasty roast pork, puerto rican style, marine o...",...,3.9,10.0,"[""Foolproof recipe for making a flavorful and ...",1980.0,301.0,8.0,172.0,70.0,591.0,5.0
160869,132386,mean green juice for juicer,492690.0,17.0,442988.0,2012-12-31,"['30-minutes-or-less', 'time-to-make', 'course...",2.0,"['wash all ingredients thoroughly', 'run throu...","this is the recipe for ""mean green"" juice used...",...,3.9,10.0,['DH and I are juicing and this is a great go ...,499.2,32.0,17.0,28.0,37.0,61.0,20.0
160870,62781,creamy cocoa mix extra large batch,107668.0,15.0,154044.0,2005-01-05,"['15-minutes-or-less', 'time-to-make', 'course...",3.0,['mix all ingredients in a large bowl and blen...,"i like this cocoa for many reasons, only one o...",...,3.9,10.0,"['I am feeling under-the-weather today, and ne...",1511.8,140.0,210.0,232.0,177.0,151.0,26.0


In [4]:
def _list_field_to_text(value):
    """Render a list-like column value as a plain ``'a, b, c'`` string.

    Accepts either an actual list/tuple or the string form of one (e.g.
    ``"['a', 'b']"``, which is how list columns come back from a CSV).
    A string that is not a list/tuple literal is returned unchanged; any
    other value (NaN, None, ...) yields an empty string.
    """
    if isinstance(value, str):
        try:
            # literal_eval only accepts Python literals, so unlike eval() it
            # cannot execute code embedded in the data.
            parsed = ast.literal_eval(value)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a Python literal: treat it as already-plain text.
            return value
        if not isinstance(parsed, (list, tuple)):
            return value
        value = parsed
    if isinstance(value, (list, tuple)):
        return ', '.join(str(item) for item in value)
    return ''


def combine_fields(row):
    """Build the single text string describing one recipe row.

    Parameters
    ----------
    row : mapping-like (e.g. a ``pandas.Series`` or ``dict``)
        Must provide ``'name'`` and ``'description'``; ``'tags'`` and
        ``'ingredients'`` are optional and may be lists or their string form.

    Returns
    -------
    str
        ``"<name>. Tags: <tags>. Ingredientes: <ingredients>. Descrição: <description>"``
        where tags/ingredients are comma-separated and missing values become
        empty strings.
    """
    # .get() makes an absent column behave like a missing value instead of
    # raising, matching the best-effort behaviour of the original code.
    tags = _list_field_to_text(row.get('tags'))
    ingredients = _list_field_to_text(row.get('ingredients'))
    description = row['description'] if pd.notnull(row['description']) else ""
    return f"{row['name']}. Tags: {tags}. Ingredientes: {ingredients}. Descrição: {description}"

# Build one text document per recipe; this is the text that gets embedded.
df['combined_text'] = df.apply(combine_fields, axis=1)

# device=None lets sentence-transformers pick the GPU when one is available
# and fall back to CPU otherwise, instead of failing on a CPU-only machine.
model = SentenceTransformer('all-MiniLM-L6-v2', device=None)

start_time = time.time()
batch_size = 256
texts = df['combined_text'].tolist()

embeddings = model.encode(texts, batch_size=batch_size, show_progress_bar=True)
end_time = time.time()
print(f"Time for compute embeddings: {end_time - start_time:.2f} seconds")

# FAISS expects a C-contiguous float32 matrix; convert in a single step
# rather than copying twice via np.array(...).astype(...).
embeddings = np.ascontiguousarray(embeddings, dtype='float32')
embedding_dim = embeddings.shape[1]

# Exact (brute-force) L2 index over all recipe embeddings.
index = faiss.IndexFlatL2(embedding_dim)
index.add(embeddings)
# Row i of the index must correspond to row i of df for later lookups.
assert index.ntotal == len(df), "every recipe row should have been indexed"
print(f"Total recipes indexed: {index.ntotal}")

# Persist both artifacts so the expensive encode step need not be repeated.
np.save('embeddings.npy', embeddings)
faiss.write_index(index, 'faiss_index.index')

print("Embeddings and FAISS index exported!")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/629 [00:00<?, ?it/s]

Time for compute embeddings: 395.96 seconds
Total recipes indexed: 160872
Embeddings and FAISS index exported!
