In [32]:
!pip install -qU \
  datasets==2.14.6 \
  cohere==4.34

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [33]:
import os,sys
sys.path.append(os.path.abspath(os.path.join('../scripts')))
from chunk_manager import ChunkerManager
from vector_store_manager import VectorStoreManager
from retriever_manager import RetrieverManager
from rag_evaluation import RAG
from rag_utils import extract_questions_and_groundtruth

## Dataset Download

We're going to test with a more real world use-case, with messy, imperfect data. We will use the [`jamescalam/ai-arxiv-chunked`](https://huggingface.co/datasets/jamescalam/ai-arxiv-chunked) dataset.

In [34]:
chunker_manager = ChunkerManager()

In [35]:
file_path = "./../prompts/context.txt"
character_chunks = chunker_manager.character_splitting(file_path, 500, 50)

In [36]:
character_chunks

[Document(page_content='ADVISORY SERVICES AGREEMENT\nThis Advisory Services Agreement is entered into as of June 15th, 2023 (the “Effective Date”), by and between Cloud Investments Ltd., ID 51-426526-3, an Israeli company (the "Company"), and Mr. Jack Robinson, Passport Number 780055578, residing at 1 Rabin st, Tel Aviv, Israel, Email: jackrobinson@gmail.com ("Advisor").\n\nWhereas, Advisor has expertise and/or knowledge and/or relationships, which are relevant to the Company’s business and the Company has asked Advisor t', metadata={'source': './../prompts/context.txt'}),
 Document(page_content='any’s business and the Company has asked Advisor to provide it with certain Advisory services, as described in this Agreement; and\n\nWhereas, Advisor has agreed to provide the Company with such services, subject to the terms set forth in this Agreement.\n\nNOW THEREFORE THE PARTIES AGREE AS FOLLOWS:\n\nServices:\nAdvisor shall provide to the Company, as an independent contractor, software dev

First we define our embedding function.

In [37]:
import os
from getpass import getpass
import cohere

cohere_key = os.getenv("COHERE_API_KEY") or getpass("Cohere API key: ")
co = cohere.Client(cohere_key)

def embed(docs: list[str]) -> list[list[float]]:
    docs_text = [doc.page_content for doc in docs]  # Extract text content
    doc_embeds = co.embed(
        docs_text,  # Pass the text content
        input_type="search_document",
        model="embed-english-v3.0"
    )
    return doc_embeds.embeddings

Use this to build a Numpy array of cohere embedding vectors.

In [38]:
from flask import jsonify
from tqdm.auto import tqdm
import numpy as np

chunks = character_chunks
print(chunks)
batch_size = 128

for i in tqdm(range(0, len(chunks), batch_size)):
    i_end = min(len(chunks), i+batch_size)
    chunk_batch = chunks[i:i_end]
    # embed current batch
    embed_batch = embed(chunk_batch)
    # add to existing np array if exists (otherwise create)
    if i == 0:
        arr = np.array(embed_batch)
    else:
        arr = np.concatenate([arr, np.array(embed_batch)])

[Document(page_content='ADVISORY SERVICES AGREEMENT\nThis Advisory Services Agreement is entered into as of June 15th, 2023 (the “Effective Date”), by and between Cloud Investments Ltd., ID 51-426526-3, an Israeli company (the "Company"), and Mr. Jack Robinson, Passport Number 780055578, residing at 1 Rabin st, Tel Aviv, Israel, Email: jackrobinson@gmail.com ("Advisor").\n\nWhereas, Advisor has expertise and/or knowledge and/or relationships, which are relevant to the Company’s business and the Company has asked Advisor t', metadata={'source': './../prompts/context.txt'}), Document(page_content='any’s business and the Company has asked Advisor to provide it with certain Advisory services, as described in this Agreement; and\n\nWhereas, Advisor has agreed to provide the Company with such services, subject to the terms set forth in this Agreement.\n\nNOW THEREFORE THE PARTIES AGREE AS FOLLOWS:\n\nServices:\nAdvisor shall provide to the Company, as an independent contractor, software deve

100%|██████████| 1/1 [00:03<00:00,  3.34s/it]


In [39]:
xq = co.embed(
    [" What is the termination notice?"],
    input_type="search_query",
    model="embed-english-v3.0"
).embeddings
xq = np.array(xq[0])

In [40]:
sim = np.dot(arr, xq.T)
top_k=3
idx = np.argpartition(sim, -top_k)[-top_k:]
idx

array([19,  7,  8])

Now we need to create the query mechanism, this is simply a cosine similarity calculation between a query vector and our `arr` vectors.

In [41]:
from numpy.linalg import norm

# convert chunks list to array for easy indexing
chunk_arr = np.array(chunks)

def query(text: str, top_k: int=3) -> list[str]:
    # create query embedding
    xq = co.embed(
        [text],
        input_type="search_query",
        model="embed-english-v3.0"
    ).embeddings
    xq = np.array(xq[0])
    # calculate cosine similarities
    sim = np.dot(arr, xq.T)
    print(sim.shape)
    # get indices of top_k records
    idx = np.argpartition(sim, -top_k)[-top_k:]
    print(sim[idx])
    # get docs and print
    docs = chunk_arr[idx]
    print(docs.shape)
    for d in docs.tolist():
        print(d)
        print("----------")

In [44]:
query(" What is the termination notice?")

(32,)
[0.28626472 0.35888431 0.3893717 ]
(3,)
page_content='from or relating to this Agreement.\nNotices: Notices under this Agreement shall be delivered to the party’s email address as follows: Company: info@cloudcorp.com, Advisor: jackrobinson@gmail.com, or in any the other means with a proof of acceptance by the other party.\nIN WITNESS WHEREOF the parties have executed this Agreement as of the date first above written.\n\nCloud Investments Ltd. Advisor\n\nBy: ________________________ By:________________________\n\nName: Silvan Joseph Name: Jack Robinson\n\nTit' metadata={'source': './../prompts/context.txt'}
----------
page_content='g reasonable attorneys\' fees in connection with any breach by Advisor, of any obligations pursuant to a prior or existing engagement with any other third party, including without limitation other employers or clients.\nTerm: The term of this Agreement shall commence on the Effective Date and shall continue until terminated in accordance with the provis

In [43]:
query(" Who are the parties to the Agreement and what are their defined names?")

(32,)
[0.37819558 0.38847997 0.42007298]
(3,)
page_content='er with the Exhibits, which are attached hereto and incorporated herein, set forth the entire Agreement between the parties and shall supersede all previous communications and agreements between the parties, either oral or written. This Agreement may be modified only by a written amendment executed by both parties. This Agreement may not be assigned, sold, delegated or transferred in any manner by Advisor for any reason whatsoever. The Company may assign the Agreement to a successor of all or su' metadata={'source': './../prompts/context.txt'}
----------
page_content='ance or social benefits payable thereto, and marketing costs incurred in connection with the performance of obligations hereunder.\nConfidentiality, Non-Competition and IP Ownership Undertaking: In connection with the performance of Advisor’s obligations under this Agreement, the Advisor shall execute a Confidentiality, Non-Competition and IP Ownership Undertakin

---