<a href="https://colab.research.google.com/github/danielsaniam/ai-recipe-finder/blob/main/IamCooking.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install -U sentence-transformers
!pip install pinecone-client
!pip install gradio
!pip install datasets



In [4]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from ast import literal_eval
from datasets import load_dataset

In [13]:
dataset = load_dataset("somosnlp/recetas-cocina")

Downloading readme:   0%|          | 0.00/158 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/39.9M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [14]:
dataset

DatasetDict({
    train: Dataset({
        features: ['title', 'url', 'ingredients', 'steps', 'uuid'],
        num_rows: 28238
    })
})

In [15]:
df = pd.DataFrame(dataset['train'])

In [16]:
def concatenar_lista(lista):
    lista = literal_eval(lista)
    return ' '.join(lista)

In [18]:
def string_to_list(lista):
    lista = literal_eval(lista)
    return lista

In [21]:
df = df.fillna(' ')
df['text'] = df.apply(lambda x : x['title']+' '+x['ingredients']+' '+x['steps'], axis=1)

In [22]:
df.head(3)

Unnamed: 0,title,url,ingredients,steps,uuid,text
0,Arepas de Queso,https://www.mycolombianrecipes.com/es/arepas-d...,1 taza de harina de arepa blanca o amarilla\r\...,"Combine la harina de maíz, agua caliente, el q...",86af61e4-e16a-11ed-9591-a96d6180cd25,Arepas de Queso 1 taza de harina de arepa blan...
1,Sudado de Pollo,https://www.mycolombianrecipes.com/es/sudado-d...,8 muslos de pollo sin la piel\r\n1 cucharada d...,"En una olla grande, caliente el aceite vegetal...",86af61e5-e16a-11ed-abef-a96d6180cd25,Sudado de Pollo 8 muslos de pollo sin la piel\...
2,Sancocho Trifásico,https://www.mycolombianrecipes.com/es/sancocho...,1 taza de cebolla picada\r\n1 pimientón rojo f...,"Coloque la cebolla, el pimientón, el ajo y el ...",86af61e6-e16a-11ed-bcf5-a96d6180cd25,Sancocho Trifásico 1 taza de cebolla picada\r\...


In [23]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [24]:
embeddings = model.encode(df['text'],batch_size=64,show_progress_bar=True)

Batches:   0%|          | 0/442 [00:00<?, ?it/s]

In [25]:
df['embeddings'] = embeddings.tolist()
df['ids'] = df.index
df['ids'] = df['ids'].astype('str')

In [31]:
import pinecone
from getpass import getpass

In [32]:
pincone_api = getpass('Enter the secret value: ')

Enter the secret value: ··········


In [33]:
pinecone.init(api_key=pincone_api, environment="gcp-starter")

In [34]:
dimensions_embeddings = len(df['embeddings'][0])
index_name = 'recipes-embeddings'
all_index = pinecone.list_indexes()
if index_name in all_index:
    index = pinecone.Index(index_name)
else:
    pinecone.create_index(index_name, dimension=dimensions_embeddings, metric="cosine")
    index = pinecone.Index(index_name)

In [None]:
from tqdm.auto import tqdm

# we will use batches of 64
batch_size = 64

for i in tqdm(range(0, len(df), batch_size)):
    # find end of batch
    i_end = min(i+batch_size, len(df))
    # extract batch
    batch = df[i:i_end]
    # generate embeddings for batch
    ids = batch['ids']
    emb = batch['embeddings']
    metadata = batch.drop(['ids','embeddings','text','url'],axis=1).to_dict('records')

    # add all to upsert list
    to_upsert = list(zip(ids, emb,metadata))
    # upsert/insert these records to pinecone
    _ = index.upsert(vectors=to_upsert)

# check that we have all vectors in index
index.describe_index_stats()

  0%|          | 0/382 [00:00<?, ?it/s]



{'dimension': 384,
 'index_fullness': 0.1,
 'namespaces': {'': {'vector_count': 24402}},
 'total_vector_count': 24402}

In [36]:
query = 'receta pata el inicio del dia'
query_vector = model.encode(query).tolist()

responses = index.query(
  vector=query_vector,
  top_k=3,
  include_metadata=True,
  filter ={}
)

In [37]:
responses

{'matches': [{'id': '27006',
              'metadata': {'ingredients': '2 kilos de patatas 6 almendras 1 '
                                          'rebanada de pan 2 dientes de ajo 3 '
                                          'cucharadas de aceite 1 papel de '
                                          'azafrán (una carterita de '
                                          'colorante) Perejil Sal Pimienta',
                           'steps': '1 Se quita la piel a las patatas, se '
                                    'lavan y después se parten en trozos '
                                    'regulares y se colocan en una cacerola. '
                                    'En una sartén pequeña se echa el aceite, '
                                    'se calienta y se añaden los dientes de '
                                    'ajo, las almendras, la rebanada de pan y '
                                    'el perejil. 2 Bien dorado todo, se saca y '
                                    'se

In [39]:
def search(query, top_k):
    query_vector = model.encode(query).tolist()

    responses = index.query(
        vector=query_vector,
        top_k=top_k,
        include_metadata=True,
        filter={}
    )

    # Format the responses for better display
    response_data = []
    for response in responses['matches']:
        response_data.append({
            'Title': response['metadata']['title'],
            'Ingredients': response['metadata']['ingredients'],
            'Steps': response['metadata']['steps'],
            'URL': response['metadata']['url'],
            'Score': response['score'],
        })

    df = pd.DataFrame(response_data)
    return df



In [40]:
import gradio as gr

iface = gr.Interface(
    fn=search,
    inputs=[
        gr.Textbox(lines=5, placeholder="Escribe aquí tu consulta...", label="Consulta")

    ],
    outputs=gr.Dataframe(type="pandas", label="Resultados"),
    title="Buscador de recetas",
    description="Introduce tu consulta para buscar recetas.",
)

# Launch the interface
iface.launch()



Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://5aaddc1a69bb94e9cd.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


