In [None]:
import ast
import json
import math
import os
import re
import time
from pprint import pprint

import chromadb
import google.generativeai as genai
import numpy as np
import ollama
import pandas as pd
from chromadb import Documents, EmbeddingFunction, Embeddings, Settings
from ddg import Duckduckgo
from dotenv import load_dotenv
from IPython.display import Markdown
from ollama import chat
from ollama import embeddings
from ollama import EmbeddingsResponse
from ollama import ChatResponse
from PyPDF2 import PdfReader
from tqdm import tqdm

def extract_pages_from_pdf(file_path):
    """Extract the text of every page of a PDF.

    Args:
        file_path: Path (or file object) handed to ``PdfReader``.

    Returns:
        A list with one entry per page, in page order, holding whatever
        ``extract_text()`` returned for that page.
    """
    reader = PdfReader(file_path)
    return [page.extract_text() for page in reader.pages]





In [None]:
# Load the raw recipes table. The cells below read its "name", "description",
# "nutrition", "ingredients" and "steps" columns.
df = pd.read_csv("RAW_recipes.csv")

In [None]:
# Build one plain-text document per recipe; these documents are what gets
# embedded and stored in the vector database further down.
#
# The "nutrition", "ingredients" and "steps" columns hold Python list literals
# serialized as strings. They are parsed with ast.literal_eval, which accepts
# only literals, rather than eval, which would execute any code found in the CSV.
recipes = []
for _, row in df.iterrows():
    nutrition = ast.literal_eval(row["nutrition"])
    nutrition_facts = f"Nutritional facts:\n#Calories: {nutrition[0]}\n Total fat: {nutrition[1]}\n Sugar: {nutrition[2]}\n Sodium: {nutrition[3]}\n Protein: {nutrition[4]}\n Saturated fat: {nutrition[5]}"
    name = f"Name: {row['name']}"
    description = f"Description:\n {row['description']}"
    ingredients = "Ingredients:\n" + "\n".join(ast.literal_eval(row["ingredients"]))
    steps = "Steps:\n" + "\n".join(ast.literal_eval(row["steps"]))

    text =  f"{name} \n{description} \n{nutrition_facts} \n{ingredients} \n{steps}"
    recipes.append(text)

In [None]:
# Preview the first formatted recipe document.
recipes[0]

In [None]:
# Load environment variables (python-dotenv) before reading the API key below.
load_dotenv()

# The Gemini API key is read from the environment rather than hard-coded.
# NOTE(review): os.getenv returns None when GEMINI_API_KEY is unset — confirm
# how genai.configure behaves in that case.
api_key = os.getenv('GEMINI_API_KEY')
genai.configure(api_key=api_key)

# Name of the embedding model passed to ollama's embeddings() calls below.
#local_embedding_model = "nomic-embed-text"
local_embedding_model = "bge-m3"

# Chat model used to generate answers; no model name is passed here, so the
# library's default is used.
gemini_model = genai.GenerativeModel()

In [None]:
def get_chroma_db_without_embedding_function(name):
    """Open (creating it if needed) the collection ``name`` in the persistent
    Chroma store located at "database/".

    No embedding function is passed; callers in this notebook supply
    precomputed embeddings when adding and querying.

    Args:
        name: Collection name.

    Returns:
        The Chroma collection object.
    """
    client = chromadb.PersistentClient(path="database/")
    collection = client.get_or_create_collection(name=name)
    return collection

def delete_chroma_db(name):
    """Delete the collection ``name`` from the persistent store at "database/".

    The call is idempotent: deleting a collection that does not exist is a
    no-op instead of an error, so a "drop and rebuild" cell also works on a
    fresh, empty database directory.

    Args:
        name: Collection name.
    """
    chroma_client = chromadb.PersistentClient(path="database/")
    # delete_collection raises when the collection is missing, and the exception
    # type differs between chromadb releases. Ensuring the collection exists
    # first avoids depending on any particular exception class.
    chroma_client.get_or_create_collection(name=name)
    chroma_client.delete_collection(name=name)

def add_docs_to_db(docs, ids, embeds, db, batch_size=5000):
    """Add documents with precomputed embeddings to a collection, in batches.

    Args:
        docs: Sequence of document texts.
        ids: Sequence of ids, aligned with ``docs``.
        embeds: Sequence of embedding vectors, aligned with ``docs``.
        db: Object exposing ``add(documents=..., ids=..., embeddings=...)``.
        batch_size: Maximum number of items per ``db.add`` call. Defaults to
            5000 (presumably chosen to stay under Chroma's per-call batch
            limit — TODO confirm).

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    total = len(docs)
    for start in range(0, total, batch_size):
        # Clamp to len(docs) so all three slices cover exactly the same range
        # even if ids/embeds happen to be longer than docs.
        end = min(start + batch_size, total)
        db.add(
            documents=docs[start:end],
            ids=ids[start:end],
            embeddings=embeds[start:end])

def generate_blocs_db_items(initial_size, documents, embedding_model):
    """Embed each document and assign it a sequential string id.

    Args:
        initial_size: Integer id given to the first document; later documents
            get consecutive values.
        documents: Sized sequence of document texts.
        embedding_model: Model name passed to ollama's ``embeddings`` call.

    Returns:
        A tuple ``(docs, ids, embeds, next_index)``: the document texts, their
        ids as strings, their embedding vectors, and the first unused index.
    """
    docs, ids, embeds = [], [], []

    progress = tqdm(documents, total=len(documents), desc="Creating DB items")
    for next_id, text in enumerate(progress, start=initial_size):
        docs.append(text)
        ids.append(str(next_id))
        response = embeddings(model=embedding_model, prompt=text)
        embeds.append(response.embedding)

    return docs, ids, embeds, initial_size + len(docs)

def create_chroma_db(documents, embedding_model, db_name):
    """Embed ``documents`` and append them to the collection ``db_name``.

    The collection lives in the persistent store at "database/" and is created
    if it does not exist yet.

    Args:
        documents: Sized sequence of document texts.
        embedding_model: Model name used to compute the embeddings.
        db_name: Collection name.
    """
    client = chromadb.PersistentClient(path="database/")
    collection = client.get_or_create_collection(name=db_name)

    # New ids start at the current item count of the collection.
    first_id = collection.count()
    docs, ids, embeds, _ = generate_blocs_db_items(first_id, documents, embedding_model)
    add_docs_to_db(docs, ids, embeds, collection)
        


In [None]:
# Rebuild the "recipes" collection from scratch: drop it, embed and store the
# first 1000 recipe documents, then reopen the collection for querying.
delete_chroma_db("recipes")
create_chroma_db(recipes[:1000], local_embedding_model, "recipes")
chroma_db = get_chroma_db_without_embedding_function("recipes")

In [None]:
# Sanity check: number of items currently stored in the collection.
chroma_db.count()

In [None]:
def get_relevant_passages_with_embeddings_and_articles(query, chat_model, embedding_model, db, n_results=10):
    """Retrieve the stored passages closest to ``query``.

    The query is embedded with ollama's ``embeddings`` call and the resulting
    vector is used to query the collection.

    Args:
        query: User query text.
        chat_model: Unused by this function; kept so existing call sites keep
            working.
        embedding_model: Model name passed to ollama's ``embeddings`` call.
        db: Collection exposing ``query(query_embeddings=..., n_results=...)``.
        n_results: Number of passages to request. Defaults to 10.

    Returns:
        The result object returned by ``db.query``.
    """
    embeddings_response = embeddings(
        model=embedding_model,
        prompt=query)

    return db.query(
        query_embeddings=embeddings_response.embedding,
        n_results=n_results)

def convert_passages_to_list_updown(passages):
    """Concatenate retrieved passages into one context string, ordered by id.

    Ids that are decimal integers (the form produced when the collection is
    built in this notebook) are ordered numerically, so "2" comes before "10".
    Any other ids are placed after the numeric ones, in lexicographic order.

    Args:
        passages: Query result with ``"documents"`` and ``"ids"`` entries, each
            a list whose first element is the list for the (single) query.

    Returns:
        The passages joined into one string, each followed by a newline. An
        empty result yields an empty string.
    """
    documents = passages["documents"][0]
    ids = [str(doc_id) for doc_id in passages["ids"][0]]

    def _id_order(pair):
        # Sort key: numeric ids first by integer value, then the rest by text.
        doc_id = pair[0]
        if doc_id.isdecimal():
            return (0, int(doc_id), "")
        return (1, 0, doc_id)

    ordered = sorted(zip(ids, documents), key=_id_order)
    return "".join(doc + "\n" for _, doc in ordered)

def make_prompt_legal(query, passage):
    """Build the chat prompt for a recipe question.

    ``query`` and ``passage`` are inserted as given, with their newlines
    intact. The earlier ``.replace("\\n", "\\n")`` calls did nothing and have
    been removed.

    Args:
        query: User query text.
        passage: Retrieved context text.

    Returns:
        The full prompt string.
    """
    # This prompt is where you can specify any guidance on tone, or what topics the model should stick to, or avoid.
    # It is written as explicit fragments so the indentation and trailing
    # spaces that are part of the prompt text stay visible and survive editors
    # that strip trailing whitespace.
    # NOTE(review): "bellow" looks like a typo for "below"; left as is so the
    # text sent to the model does not change.
    prompt = (
        "You are a chatbot specialized in answering queries from users about recipes based on the CONTEXT bellow.\n"
        "        Your answer must be detailed and nicely formatted.\n"
        "        In your answer, highlight the ingredients contained in the user query. \n"
        "    \n"
        "    QUERY: \n"
        f"    {query}\n"
        "    \n"
        "    CONTEXT: \n"
        f"    {passage}\n"
        "    \n"
        "    "
    )

    return prompt

def answer_question_with_gemini(query, embedding_db, chat_model, embedding_model):
    """Answer ``query`` with retrieval-augmented generation.

    Retrieves passages for the query from ``embedding_db``, turns them into a
    context string, builds the prompt and asks ``chat_model`` to generate.

    Args:
        query: User query text.
        embedding_db: Collection to retrieve passages from.
        chat_model: Object exposing ``generate_content(prompt)``.
        embedding_model: Model name used to embed the query.

    Returns:
        The ``text`` attribute of the generated response.
    """
    retrieved = get_relevant_passages_with_embeddings_and_articles(
        query, chat_model, embedding_model, embedding_db)
    prompt = make_prompt_legal(query, convert_passages_to_list_updown(retrieved))
    response = chat_model.generate_content(prompt)
    return response.text

In [None]:
# Example question; the generated answer is rendered as Markdown.
query= "give me a recipe with bananas, eggs, lemon, orange and flour"
Markdown(answer_question_with_gemini(query, chroma_db, gemini_model, local_embedding_model))

In [None]:
# Run a DuckDuckGo search built from the same query, looking for related videos.
ddg_api = Duckduckgo()
#results = ddg_api.search(f"find youtube videos for a recipe similar to this one {recipes[2]}")
results = ddg_api.search(f"find youtube videos corresponding to this query: {query}")

In [None]:
# Print only the search hits whose URL contains the YouTube address.
for result in results['data']:
    url = result['url']
    if "https://www.youtube.com" not in url:
        continue
    print(url)
