In [1]:
# !pip install PyPDF2

In [2]:
# !pip install spacy

In [3]:
# !python3 -m spacy download en_core_web_sm


In [4]:
from examples.example_config import SECRET_KEY

In [5]:
import pandas as pd
import PyPDF2

In [6]:
import spacy

nlp = spacy.load('en_core_web_sm')

In [7]:
with open('knowledge_book.pdf', 'rb') as pdf_file:
    # Create a PDF reader object
    pdf_reader = PyPDF2.PdfReader(pdf_file)

    # Get the number of pages in the PDF document
    num_pages = len(pdf_reader.pages)

    # Create an empty list to store the paragraphs
    paragraphs = []
    print(num_pages)
    # Loop over each page and extract the text
    for i in range(num_pages):
        # Get the page object
        page = pdf_reader.pages[i]

        # Extract the text from the page
        text = page.extract_text()

        # Split the text into sentences using spaCy
        doc = nlp(text)
        sentences = [sent.text for sent in doc.sents]

        # Merge adjacent sentences into paragraphs
        paragraph = ''
        for sentence in sentences:
            if len(paragraph) + len(sentence) > 500:
                paragraphs.append(paragraph)
                paragraph = sentence
            else:
                paragraph += ' ' + sentence
        paragraphs.append(paragraph)
    # Create a DataFrame with the paragraphs
    df = pd.DataFrame({'content': paragraphs})

20


In [8]:

# Open the PDF file in read binary mode
with open('cookbook.pdf', 'rb') as pdf_file:
    # Create a PDF reader object
    pdf_reader = PyPDF2.PdfReader(pdf_file)

    # Create an empty dataframe to store the text from each page
    df_cook = pd.DataFrame(columns=['page_number', 'content'])

    # Loop through each page of the PDF
    for page_num in range(len(pdf_reader.pages)):
        # Get the text of the current page
        page = pdf_reader.pages[page_num]
        text = page.extract_text()

        # Append the text to the dataframe
        df_cook = df_cook.append({'page_number': page_num + 1, 'content': text}, ignore_index=True)

In [9]:
# df.head(15)

In [10]:
df_cook.tail()

Unnamed: 0,page_number,content
91,92,Easy\tBroccoli\tSoup\nIngredients\n\t\n1\t-2\t...
92,93,approximately\t¼\thour.\nAdjust\tseasoning.\nY...
93,94,Golden\tLentil\tSoup\nIngredients\n\t\n1\tcup\...
94,95,Zucchini\tand\tBasil\tSoup\nIngredients\n\t\n2...
95,96,Blend\tin\tbasil.\tBlend\tin\tgarlic.\tallow\t...


In [11]:
# !pip install tiktoken

In [12]:
import numpy as np
import openai
import tiktoken
import time

In [13]:
COMPLETIONS_MODEL = "text-davinci-003"
EMBEDDING_MODEL = "text-embedding-ada-002"

In [14]:
openai.api_key = SECRET_KEY

In [15]:
def get_embedding(text: str, model: str = EMBEDDING_MODEL) -> list[float]:
    result = openai.Embedding.create(
        model=model,
        input=text
    )
    time.sleep(5)
    return result["data"][0]["embedding"]


def compute_doc_embeddings(df: pd.DataFrame) -> dict[tuple[str, str], list[float]]:
    """
    Create an embedding for each row in the dataframe using the OpenAI Embeddings API.
    
    Return a dictionary that maps between each embedding vector and the index of the row that it corresponds to.
    """
    return {
        idx: get_embedding(r.content) for idx, r in df.iterrows()
    }

In [16]:
def load_embeddings(fname: str) -> dict[tuple[str, str], list[float]]:
    """
    Read the document embeddings and their keys from a CSV.
    
    fname is the path to a CSV with exactly these named columns: 
        "title", "heading", "0", "1", ... up to the length of the embedding vectors.
    """

    df = pd.read_csv(fname, header=0)
    max_dim = max([int(c) for c in df.columns if c != "title" and c != "heading"])
    return {
        (r.title, r.heading): [r[str(i)] for i in range(max_dim + 1)] for _, r in df.iterrows()
    }

In [17]:
from sentence_transformers import SentenceTransformer
import pandas as pd
from sklearn.preprocessing import normalize

sentence_tf_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [18]:
def get_hf_embeddings(text: str, model) -> np.ndarray:
    sentence_embeddings = model.encode(text)
    sentence_embeddings = sentence_embeddings.reshape(1, -1)
    sentence_embeddings = normalize(sentence_embeddings)
    return sentence_embeddings[0]


def compute_doc_embeddings_hf(df: pd.DataFrame, model) -> dict[tuple[str, str], np.ndarray]:
    return {
        idx: get_hf_embeddings(r.content, model) for idx, r in df.iterrows()
    }


In [19]:
cook_embeddings_hf = compute_doc_embeddings_hf(df_cook, sentence_tf_model)

In [20]:
cook_embeddings_hf

{0: array([ 9.20526870e-03,  5.01278862e-02,  7.70692825e-02, -3.52503918e-02,
        -7.25457445e-02, -3.26514356e-02, -1.28457965e-02, -5.06264642e-02,
        -7.04279309e-03, -2.31342148e-02, -2.39928458e-02, -5.01954965e-02,
        -5.28357327e-02, -8.59963894e-02, -2.05070595e-03, -2.93122102e-02,
         5.15558608e-02, -1.62239671e-02,  2.41569486e-02, -2.01486554e-02,
        -2.07680054e-02, -7.76688978e-02,  1.89338196e-02,  2.11249068e-02,
         3.83102335e-02,  1.92841906e-02,  2.04933602e-02, -1.72151811e-02,
        -1.07565857e-01,  1.59651972e-02, -1.40538514e-02, -4.76436689e-03,
         6.95836768e-02,  1.03460262e-02, -4.26526591e-02,  4.67937775e-02,
         1.56807765e-01,  1.86432935e-02,  8.04452691e-03,  8.55341833e-03,
         8.64247698e-03,  6.18163403e-03,  3.86104025e-02, -7.11416975e-02,
         7.51693696e-02,  1.02930248e-01,  1.02828396e-02, -1.02090025e-02,
        -4.50131530e-03,  4.23712954e-02, -4.16087657e-02,  7.51390914e-03,
         

In [21]:
# document_embeddings
cook_embeddings

NameError: name 'cook_embeddings' is not defined

In [None]:
def vector_similarity(x: list[float], y: list[float]) -> float:
    """
    Returns the similarity between two vectors.
    
    Because OpenAI Embeddings are normalized to length 1, the cosine similarity is the same as the dot product.
    """
    return np.dot(np.array(x), np.array(y))


def order_document_sections_by_query_similarity(query: str, contexts: dict[(str, str), np.array]) -> list[
    (float, (str, str))]:
    """
    Find the query embedding for the supplied query, and compare it against all of the pre-calculated document embeddings
    to find the most relevant sections. 
    
    Return the list of document sections, sorted by relevance in descending order.
    """
    query_embedding = get_hf_embeddings(query, sentence_tf_model)

    document_similarities = sorted([
        (vector_similarity(query_embedding, doc_embedding), doc_index) for doc_index, doc_embedding in contexts.items()
    ], reverse=True)

    return document_similarities

In [None]:
order_document_sections_by_query_similarity("How to cook a golden lentil soup?", cook_embeddings_hf)[:5]



[(0.8725757, 93),
 (0.70219666, 21),
 (0.5159754, 91),
 (0.49693555, 82),
 (0.48673898, 32)]

In [None]:
MAX_SECTION_LEN = 500
SEPARATOR = "\n* "
ENCODING = "gpt2"  # encoding for text-davinci-003

encoding = tiktoken.get_encoding(ENCODING)
separator_len = len(encoding.encode(SEPARATOR))

f"Context separator contains {separator_len} tokens"

'Context separator contains 3 tokens'

In [None]:
def construct_prompt(question: str, context_embeddings: dict, df: pd.DataFrame) -> str:
    """
    Fetch relevant 
    """
    most_relevant_document_sections = order_document_sections_by_query_similarity(question, context_embeddings)

    chosen_sections = []
    chosen_sections_len = 0
    chosen_sections_indexes = []

    for _, section_index in most_relevant_document_sections:
        # Add contexts until we run out of space.        
        document_section = df.loc[section_index]
        document_tokens = len(encoding.encode(document_section.content))
        chosen_sections_len += document_tokens + separator_len
        if chosen_sections_len > MAX_SECTION_LEN:
            break

        chosen_sections.append(SEPARATOR + document_section.content.replace("\n", " "))
        chosen_sections_indexes.append(str(section_index))

    # Useful diagnostic information
    print(f"Selected {len(chosen_sections)} document sections:")
    print("\n".join(chosen_sections_indexes))

    header = """Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "I don't know."\n\nContext:\n"""

    return header + "".join(chosen_sections) + "\n\n Q: " + question + "\n A:"

In [None]:
prompt = construct_prompt(
    "How to cook a golden lentil soup?",
    cook_embeddings_hf,
    df_cook
)

print("===\n", prompt)

Selected 1 document sections:
93
===
 Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "I don't know."

Context:

* Golden	Lentil	Soup Ingredients 	 1	cup	red	lentil 6	cups	water 4	chicken	stock	cubes 4	carrots,	Chopped 1	onion,	Chopped 2	garlic	cloves,	Minced 2	teaspoons	curry	powder 	 cayenne	(	To	Taste)	(optional) 	 Directions Heat	all	of	the	items	to	boiling	in	pot. Then	lower	the	temperature. Allow	to	simmer	for	½	hour	or	till	lentils	become	soft. After	this,	puree	them	till	become	smooth. 	

 Q: How to cook a golden lentil soup?
 A:


In [None]:
COMPLETIONS_API_PARAMS = {
    # We use temperature of 0.0 because it gives the most predictable, factual answer.
    "temperature": 0.0,
    "max_tokens": 1000,
    "model": COMPLETIONS_MODEL,
}

In [55]:
def answer_query_with_context(
        query: str,
        df: pd.DataFrame,
        document_embeddings: dict[(str, str), np.array],
        show_prompt: bool = False
) -> str:
    prompt = construct_prompt(
        query,
        document_embeddings,
        df
    )

    if show_prompt:
        print(prompt)

    response = openai.Completion.create(
        prompt=prompt,
        **COMPLETIONS_API_PARAMS
    )

    return response["choices"][0]["text"].strip(" \n")

In [61]:
answer_query_with_context("How to cook a golden lentil soup? Explain the whole process from the ingredients to serve.",
                          df_cook, cook_embeddings_hf)


Selected 1 document sections:
93


'To cook a golden lentil soup, you will need 1 cup of red lentil, 6 cups of water, 4 chicken stock cubes, 4 carrots chopped, 1 onion chopped, 2 garlic cloves minced, 2 teaspoons of curry powder, and cayenne to taste (optional). Heat all of the items to boiling in a pot. Then lower the temperature and allow to simmer for ½ hour or until the lentils become soft. After this, puree them until they become smooth. Serve the soup when it is ready.'

In [38]:

manuel_res = openai.Completion.create(
    prompt="How to cook a golden lentil soup? Explain the whole process from the ingredients to serve.",
    **COMPLETIONS_API_PARAMS
)

manuel_res["choices"][0]["text"].strip(" \n")

'Ingredients:\n\n-1 tablespoon olive oil\n-1 onion, diced\n-2 cloves garlic, minced\n-1 teaspoon ground cumin\n-1 teaspoon ground coriander\n-1 teaspoon ground turmeric\n-1/2 teaspoon ground ginger\n-1/4 teaspoon ground cinnamon\n-1/4 teaspoon ground cardamom\n-1/4 teaspoon ground black pepper\n-1/4 teaspoon cayenne pepper\n-1 cup golden lentils, rinsed\n-4 cups vegetable broth\n-1 can (14.5 ounces) diced tomatoes\n-1/2 cup coconut milk\n-1/4 cup chopped fresh cilantro\n-Salt and pepper, to taste\n\nInstructions:\n\n1. Heat the olive oil in a large pot over medium heat. Add the onion and garlic and cook until softened, about 5 minutes.\n\n2. Add the cumin, coriander, turmeric, ginger, cinnamon, cardamom, black pepper, and cayenne pepper and cook for 1 minute, stirring constantly.\n\n3. Add the lentils and vegetable broth and bring to a boil. Reduce the heat to low and simmer for 20 minutes, stirring occasionally.\n\n4. Add the diced tomatoes and coconut milk and simmer for an additiona