In [1]:
from haystack import Document
from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import DensePassageRetriever, AnswerParser, PromptNode, PromptTemplate
from haystack import Pipeline
import os
import pandas as pd

In [2]:
from dotenv import load_dotenv
load_dotenv("../.env")
openai_key = os.getenv("OPENAI_API_KEY")

In [15]:
openai_key=''

In [3]:

def initialize_documents(file_path):
    """
    Casts recipes from prepared recipe_docs.csv file into document structure for Haystack.

    Args:
        file_path (str): location of recipe_docs.csv file
    Returns:
        documents ()
    """
    # Load data
    df = pd.read_csv(file_path)
    df= df[:100]

    if len(df.columns) != 2 or df.columns[0] != "name" or df.columns[1] != "full_recipe":
            raise ValueError("The CSV must contain two columns named 'name' and 'full_recipe'")
    
    df.fillna(value="", inplace=True)
    df["name"] = df["name"].apply(lambda x: x.strip())

    df = df.rename(columns={"name": "content"})
    df = df.rename(columns={"full_recipe": "answer"})
    docs_dicts = df.to_dict(orient="records")

    docs = []
    for dictionary in docs_dicts:            
        docs.append(Document.from_dict(dictionary))
    print(len(docs))
    return docs


In [4]:
def initialize_faiss_document_store(documents):
    """
    Initialize FAISS document store and retriever.
    Args:
        documents (list): List of documents to be stored in document store.
    Returns:
        document_store (FAISSDocumentStore): FAISS document store.
        retriever (DensePassageRetriever): Dense passage retriever
    """
    # Initialize DocumentStore
    document_store = FAISSDocumentStore(faiss_index_factory_str='Flat', return_embedding=True)

    # Initialize Retriever
    retriever = DensePassageRetriever(
        document_store=document_store,
        query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
        passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
        use_gpu=False,
        embed_title=True
        )

    # Delete existing documents in document store
    document_store.delete_documents()
    document_store.write_documents(documents)

    # Add documents embeddings to index
    document_store.update_embeddings(retriever=retriever)

    return document_store, retriever

In [5]:
documents = initialize_documents('data/recipe_docs.csv')


100


In [6]:
len(documents)

100

In [7]:
documents[0]

<Document: {'content': 'arriba   baked winter squash mexican style', 'content_type': 'text', 'score': None, 'meta': {'answer': 'Name: arriba   baked winter squash mexican style\n\nRating: 5.0/5\n\nCook Time: 55 minutes\n\nTags: 60-minutes-or-less, time-to-make, course, main-ingredient, cuisine, preparation, occasion, north-american, side-dishes, vegetables, mexican, easy, fall, holiday-event, vegetarian, winter, dietary, christmas, seasonal, squash\n\nDescription: autumn is my favorite time of year to cook! this recipe \r\ncan be prepared either spicy or sweet, your choice!\r\ntwo of my posted mexican-inspired seasoning mix recipes are offered as suggestions.\n\nNumber of ingredients: 7\n\nIngredients List: winter squash, mexican seasoning, mixed spice, honey, butter, olive oil, salt\n\nSteps:\n1 make a choice and proceed with recipe\n2 depending on size of squash , cut into half or fourths\n3 remove seeds\n4 for spicy squash , drizzle olive oil or melted butter over each cut squash pi

In [8]:
document_store, retriever = initialize_faiss_document_store(documents=documents)


  return self.fget.__get__(instance, owner)()
Writing Documents: 10000it [00:00, 76897.06it/s]          
Documents Processed: 10000 docs [00:18, 550.65 docs/s]         


In [16]:
def initialize_rag_pipeline(retriever, openai_key):
    """
    Initialize a pipeline for RAG-based chatbot.
    Args:
        retriever (DensePassageRetriever): Dense passage retriever.
        openai_key (str): API key for OpenAI.
    Returns:
        query_pipeline (Pipeline): Pipeline for RAG-based question answering.
    """
    prompt_template = PromptTemplate(prompt=""""Answer the following query based on the provided context. If the context does
                                                not include an answer, reply with 'The data does not contain information related to the question'.\n
                                                Query: {query}\n
                                                Documents: {join(documents)}
                                                Answer: 
                                            """,
                                            output_parser=AnswerParser())
    prompt_node = PromptNode(model_name_or_path="gpt-3.5-turbo",
                             api_key=openai_key,
                             default_prompt_template=prompt_template,
                             max_length=500,
                             model_kwargs={"stream": True})

    query_pipeline = Pipeline()
    query_pipeline.add_node(component=retriever, name="Retriever", inputs=["Query"])
    query_pipeline.add_node(component=prompt_node, name="PromptNode", inputs=["Retriever"])

    return query_pipeline

In [17]:
query_pipeline = initialize_rag_pipeline(retriever=retriever, openai_key=openai_key)


In [18]:
query_pipeline.run("eggs,bacon,cheese")

The data does not contain information related to the question.

{'answers': [<Answer {'answer': 'The data does not contain information related to the question.', 'type': 'generative', 'score': None, 'context': None, 'offsets_in_document': None, 'offsets_in_context': None, 'document_ids': ['92768dadbf7a0eeb3ec3e6303e3813f9', '123187ef16b64ddf0ef67bd48646cce1', '77235b76387b398c752a138c4e19761a', 'b2e9b71d88d11392fecce097a0f4a40b', 'ac58779334049a1e1d0f8cedcd7bf554', '7485683ce5c951f9b719d0830b1c0dbd', '4f9a280b1d3535d46895c898103a2986', '5813795ae104aa9539b7e69e84636e32', '87ec786e108ba1959adff0271aaa0101', '2a6e80a598f794936eff38f88bcb584d'], 'meta': {'prompt': "Answer the following query based on the provided context. If the context does\n                                                not include an answer, reply with 'The data does not contain information related to the question'.\n\n                                                Query: eggs,bacon,cheese\n\n                                                Documents: crispy crunchy  chicken stove