In [1]:
from langchain_community.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter,SentenceTransformersTokenTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores.chroma import Chroma
import chromadb 
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
import os
import shutil
from langchain_community.document_loaders import Docx2txtLoader

from pypdf import PdfReader

import os
import openai
from openai import OpenAI
from dotenv import load_dotenv, find_dotenv


In [2]:

import sys
sys.path.append('../')
from src.utils import word_wrap

In [3]:
import os
import openai
from getpass import getpass

openai.api_key = getpass("Please provide your OpenAI Key: ")
os.environ["OPENAI_API_KEY"] = openai.api_key

In [4]:

import umap
import numpy as np
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
vectordb_path = '../data/chromadb/'
data_path = '../data/contract_data/'

In [6]:
reader = PdfReader(data_path+'Raptor.pdf')

In [7]:
pdf_texts = [p.extract_text().strip() for p in reader.pages]
pdf_texts = [text for text in pdf_texts if text]

In [8]:
print(word_wrap(pdf_texts[0]))

[R&G Draft 12.__.2021] STOCK PURCHASE AGREEMENT BY AND AMONG [BUYER],
[TARGET COMP ANY], THE SELLERS LISTED ON SCHEDULE I HERET O AND THE
SELLERS’ REPRESENT ATIVE NAMED HEREIN Dated as of [●] [This document
is intended solely to facilitate discussions among the parties
identified herein.  Neither this document nor such discussions are
intended to create, nor will either or both be  deemed to create, a
legally binding or enforceable offer or agreement of any type or
nature,  unless and until a definitive written agreement is executed
and delivered by each of the parties  hereto. This document shall be
kept confidential pursuant to the terms of the Confidentiality
Agreement entered into by the parties and, if applicable, its
affiliates with respect to the subject  matter hereof.] 112923184_5


In [9]:
character_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n","\n",".", " ",""],
        chunk_size=500,
        chunk_overlap=10,
    )
character_split_texts = character_splitter.split_text('\n\n'.join(pdf_texts))

In [10]:
print('Total chunk',len(character_split_texts))

Total chunk 490


In [11]:
token_splitter = SentenceTransformersTokenTextSplitter(chunk_overlap=0, tokens_per_chunk=256)

In [12]:
token_split_texts = []
for text in character_split_texts:
    token_split_texts += token_splitter.split_text(text)

print(word_wrap(token_split_texts[10]))
print(f"\nTotal chunks: {len(token_split_texts)}")

stock purchase agreement this stock purchase agreement ( this “
agreement ” ) is made and entered into as of [ ● ] by and among [
parent ], a [ ● ] corporation ( “ parent ” ) ], [ buyer ], a delaware
corporation and a wholly owned subsidiary of parent ( “ buyer ” ) 1, [
target comp any ], a delaware corporation ( the “ company ” ), each of
the holders of outstanding shares of capital stock of the company
listed on schedule i hereto ( respectively, the “ shareholders ” or
the “ sellers ” ) 2, and [ ● ], in

Total chunks: 490


In [13]:
embedding_function = SentenceTransformerEmbeddingFunction()
print(embedding_function([token_split_texts[10]]))

[[-0.010687097907066345, -0.02267240546643734, 0.058891016989946365, -0.026844708248972893, -0.016043977811932564, 0.007356878370046616, 0.03913184255361557, 0.02374662272632122, 0.04208064079284668, -0.04585960507392883, 0.07091499865055084, -0.020057646557688713, 0.007743957918137312, -0.020984595641493797, 0.061448659747838974, 0.007069782819598913, 0.00023805622186046094, 0.025507736951112747, 0.04417411983013153, -0.008094425313174725, -0.08181234449148178, -0.018982628360390663, 0.001050553168170154, 0.031059488654136658, 0.026162974536418915, 0.007653172127902508, 0.020977508276700974, 0.033604495227336884, 0.017255356535315514, -0.03667173162102699, -0.033191192895174026, -0.017380716279149055, 0.08637655526399612, 0.04638750106096268, -0.008988943882286549, 0.03413565829396248, 0.01322031207382679, -0.06021418794989586, 0.01641496829688549, -0.001064987969584763, 0.007121656555682421, 0.024552837014198303, 0.014552944339811802, -0.026268137618899345, 0.006666858214884996, 0.03

In [14]:
chroma_client = chromadb.Client()
chroma_collection = chroma_client.create_collection(name='contract', embedding_function=embedding_function)

In [15]:
ids = [str(i) for i in range(len(token_split_texts))]

chroma_collection.add(ids=ids, documents=token_split_texts)
chroma_collection.count()

490

In [16]:
query = "Is escrow amount greate than the Retention Amount ?"

results = chroma_collection.query(query_texts=[query], n_results=5)
retrieved_documents = results['documents'][0]

for document in retrieved_documents:
    print(word_wrap(document))
    print('\n')

gains of such person ( including stock appreciation, phantom stock,
profit participation or other similar rights ). “ escrow amount ”
means, $ 1, 000, 000 “ facilities ” means any buildings, plants,
improvements or structures located on the real property. “ family
member ” means, with respect to any individual, ( a ) such person ’ s
spouse, ( b ) each parent, brother, sister or child of such person or
such person ’ s spouse, ( c ) the spouse of any person described in
clause ( b ) above, ( d ) each child


in accordance with the escrow agreement. the escrow amount shall be
held and, subject to section 2. 07, released to the company
securityholders in accordance with the provisions of the escrow
agreement with the company securityholders being entitled to share in
such released amounts in accordance with their pro rata percentages.
from and after the closing, buyer and the sellers ’ representative
will direct the escrow agent to disburse payments from the escrow
account in accordance wi

In [17]:
openai_client = OpenAI()

In [18]:
def rag(query, retrieved_documents, model="gpt-3.5-turbo"):
    information = "\n\n".join(retrieved_documents)

    messages = [
        {
            "role": "system",
            "content": "You are a helpful expert financial research assistant. Your users are asking questions about information contained in an annual report."
            "You will be shown the user's question, and the relevant information from the annual report. Answer the user's question using only this information."
        },
        {"role": "user", "content": f"Question: {query}. \n Information: {information}"}
    ]
    
    response = openai_client.chat.completions.create(
        model=model,
        messages=messages,
    )
    content = response.choices[0].message.content
    return content

In [19]:
output = rag(query=query, retrieved_documents=retrieved_documents)

print(word_wrap(output))

Based on the information provided, it is not explicitly stated whether
the escrow amount is greater than the Retention Amount. The annual
report information focuses on the details and procedures related to
the escrow amount, including how it is held, released, and distributed
among the company securityholders. The Retention Amount is not
mentioned in the provided information.


In [20]:
embeddings = chroma_collection.get(include=['embeddings'])['embeddings']
umap_transform = umap.UMAP(random_state=0, transform_seed=0).fit(embeddings)

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


In [21]:
def project_embeddings(embeddings, umap_transform):
    umap_embeddings = np.empty((len(embeddings),2))
    for i, embedding in enumerate(tqdm(embeddings)): 
        umap_embeddings[i] = umap_transform.transform([embedding])
    return umap_embeddings

In [22]:
projected_dataset_embeddings = project_embeddings(embeddings, umap_transform)

100%|██████████| 490/490 [04:27<00:00,  1.83it/s]


In [24]:
import plotly.express as px
from umap import UMAP

# Load the Iris dataset


# Create a UMAP model and fit it to the data
umap_2d = UMAP(n_components=2, random_state=0)


# Visualize the UMAP projections
fig = px.scatter(projected_dataset_embeddings[:, 0], projected_dataset_embeddings[:, 1])
fig.show()


In [25]:
query = "How much is the escrow amount?"

results = chroma_collection.query(query_texts=query, n_results=5, include=['documents', 'embeddings'])

retrieved_documents = results['documents'][0]

for document in results['documents'][0]:
    print(word_wrap(document))
    print('')

gains of such person ( including stock appreciation, phantom stock,
profit participation or other similar rights ). “ escrow amount ”
means, $ 1, 000, 000 “ facilities ” means any buildings, plants,
improvements or structures located on the real property. “ family
member ” means, with respect to any individual, ( a ) such person ’ s
spouse, ( b ) each parent, brother, sister or child of such person or
such person ’ s spouse, ( c ) the spouse of any person described in
clause ( b ) above, ( d ) each child

in accordance with the escrow agreement. the escrow amount shall be
held and, subject to section 2. 07, released to the company
securityholders in accordance with the provisions of the escrow
agreement with the company securityholders being entitled to share in
such released amounts in accordance with their pro rata percentages.
from and after the closing, buyer and the sellers ’ representative
will direct the escrow agent to disburse payments from the escrow
account in accordance wit

In [26]:
query_embedding = embedding_function([query])[0]
retrieved_embeddings = results['embeddings'][0]

projected_query_embedding = project_embeddings([query_embedding], umap_transform)
projected_retrieved_embeddings = project_embeddings(retrieved_embeddings, umap_transform)

100%|██████████| 1/1 [00:00<00:00,  1.35it/s]
100%|██████████| 5/5 [00:02<00:00,  1.68it/s]


In [27]:
# Plot the projected query and retrieved documents in the embedding space
import plotly.graph_objects as go
trace1 = go.Scatter(x=projected_dataset_embeddings[:, 0], y=projected_dataset_embeddings[:, 1],mode="markers")
trace2 = go.Scatter(x=projected_query_embedding[:, 0], y=projected_query_embedding[:, 1],mode="markers")
trace3 = go.Scatter(x=projected_retrieved_embeddings[:, 0], y=projected_retrieved_embeddings[:, 1],mode="markers")

# Create a figure and add the traces
fig = go.Figure(data=[trace1, trace2,trace3])
fig.show()

In [28]:
from langchain.text_splitter import NLTKTextSplitter, SpacyTextSplitter

In [29]:
spliter = NLTKTextSplitter()

In [30]:
character_split_texts = spliter.split_text(pdf_texts[0])

In [31]:
character_split_texts

['[R&G\nDraft\n12.__.2021]\nSTOCK\nPURCHASE\nAGREEMENT\nBY\nAND\nAMONG\n[BUYER],\n[TARGET\nCOMP ANY],\nTHE\nSELLERS\nLISTED\nON\nSCHEDULE\nI\nHERET O\nAND\nTHE\nSELLERS’\nREPRESENT ATIVE\nNAMED\nHEREIN\nDated\nas\nof\n[●]\n[This\ndocument\nis\nintended\nsolely\nto\nfacilitate\ndiscussions\namong\nthe\nparties\nidentified\nherein.\n\nNeither\nthis\ndocument\nnor\nsuch\ndiscussions\nare\nintended\nto\ncreate,\nnor\nwill\neither\nor\nboth\nbe \ndeemed\nto\ncreate,\na\nlegally\nbinding\nor\nenforceable\noffer\nor\nagreement\nof\nany\ntype\nor\nnature, \nunless\nand\nuntil\na\ndefinitive\nwritten\nagreement\nis\nexecuted\nand\ndelivered\nby\neach\nof\nthe\nparties \nhereto.\n\nThis\ndocument\nshall\nbe\nkept\nconfidential\npursuant\nto\nthe\nterms\nof\nthe\nConfidentiality \nAgreement\nentered\ninto\nby\nthe\nparties\nand,\nif\napplicable,\nits\naffiliates\nwith\nrespect\nto\nthe\nsubject \nmatter\nhereof.]\n\n112923184_5']

In [38]:
from langchain.vectorstores import Chroma
#from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings



In [35]:
from langchain.document_loaders import UnstructuredWordDocumentLoader, UnstructuredFileLoader

In [36]:
docx_loader = DirectoryLoader('../data/contract_data/', glob="./*.docx", loader_cls=UnstructuredWordDocumentLoader)
document = docx_loader.load()

In [57]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)

docs = text_splitter.split_documents(document)

vectorstore = Chroma.from_documents(docs, OpenAIEmbeddings())

In [58]:
len(docs)

308

In [59]:
print(max([len(chunk.page_content) for chunk in docs]))

999


In [60]:
base_retriever = vectorstore.as_retriever(search_kwargs={"k" : 2})

In [61]:
relevant_docs = base_retriever.get_relevant_documents("How much is the escrow amount?")

In [62]:
len(relevant_docs)

2

In [63]:
from langchain.prompts import ChatPromptTemplate

template = """You are a legal expert tasked with acting as the best lawyer and contract analyzer. Your task is to thoroughly understand the provided context and answer questions related to legal matters, contracts, and relevant laws. You must provide accurate responses based solely on the information provided in the context. If the necessary information is not present in the context, respond with "I don't know.":

### CONTEXT
{context}

### QUESTION
Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

In [64]:
from operator import itemgetter

from langchain.chat_models import ChatOpenAI
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnableLambda, RunnablePassthrough

primary_qa_llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

retrieval_augmented_qa_chain = (
    # INVOKE CHAIN WITH: {"question" : "<<SOME USER QUESTION>>"}
    # "question" : populated by getting the value of the "question" key
    # "context"  : populated by getting the value of the "question" key and chaining it into the base_retriever
    {"context": itemgetter("question") | base_retriever, "question": itemgetter("question")}
    # "context"  : is assigned to a RunnablePassthrough object (will not be called or considered in the next step)
    #              by getting the value of the "context" key from the previous step
    | RunnablePassthrough.assign(context=itemgetter("context"))
    # "response" : the "context" and "question" values are used to format our prompt object and then piped
    #              into the LLM and stored in a key called "response"
    # "context"  : populated by getting the value of the "context" key from the previous step
    | {"response": prompt | primary_qa_llm, "context": itemgetter("context")}
)

In [65]:
question = "How much is the escrow amount?"

result = retrieval_augmented_qa_chain.invoke({"question" : question})

print(result)

{'response': AIMessage(content="I don't know."), 'context': [Document(page_content='Escrow.', metadata={'source': '../data/contract_data/Raptor_Contract.docx'}), Document(page_content='Escrow.', metadata={'source': '../data/contract_data/Raptor_Contract.docx'})]}


In [66]:
from langchain.output_parsers import ResponseSchema
from langchain.output_parsers import StructuredOutputParser

question_schema = ResponseSchema(
    name="question",
    description="a question about the context."
)

question_response_schemas = [
    question_schema,
]

In [67]:
question_output_parser = StructuredOutputParser.from_response_schemas(question_response_schemas)
format_instructions = question_output_parser.get_format_instructions()

In [68]:
question_generation_llm = ChatOpenAI(model="gpt-3.5-turbo-16k")

bare_prompt_template = "{content}"
bare_template = ChatPromptTemplate.from_template(template=bare_prompt_template)

In [69]:
from langchain.prompts import ChatPromptTemplate

qa_template = """\
You are a University Professor creating a test for advanced students. For each context, create a question that is specific to the context. Avoid creating generic or general questions.

question: a question about the context.

Format the output as JSON with the following keys:
question

context: {context}
"""

prompt_template = ChatPromptTemplate.from_template(template=qa_template)

messages = prompt_template.format_messages(
    context=docs[0],
    format_instructions=format_instructions
)

question_generation_chain = bare_template | question_generation_llm

response = question_generation_chain.invoke({"content" : messages})
output_dict = question_output_parser.parse(response.content)

In [70]:
for k, v in output_dict.items():
  print(k)
  print(v)

question
What is the purpose of the document?
context
STOCK PURCHASE AGREEMENT


In [71]:
!pip install -q -U tqdm

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [72]:
from tqdm import tqdm

qac_triples = []

for text in tqdm(docs[:10]):
  messages = prompt_template.format_messages(
      context=text,
      format_instructions=format_instructions
  )
  response = question_generation_chain.invoke({"content" : messages})
  try:
    output_dict = question_output_parser.parse(response.content)
  except Exception as e:
    continue
  output_dict["context"] = text
  qac_triples.append(output_dict)

100%|██████████| 10/10 [00:25<00:00,  2.55s/it]


In [73]:
qac_triples[5]

{'question': 'Who owns all of the outstanding shares of Common Stock and Preferred Stock in the Company?',
 'context': Document(page_content='WHEREAS, the Shareholders own all of the outstanding shares of Common Stock, par value $0.01 per share, of the Company (such common stock being referred to herein as the “Common Stock” and such outstanding common shares being referred to herein as the “Common Shares”) and all of the outstanding shares of Preferred Stock, par value $0.01 per share, of the Company (such preferred stock being referred to herein as the “Preferred Stock” and such outstanding preferred shares being referred to herein as the “Series A-1 Preferred Shares” or “Series A-2 Preferred Shares” and, collectively, the “Preferred Shares” and, collectively with the Common Shares, the “Shares”);\n\nWHEREAS, the holders of outstanding options and warrants to purchase capital stock of the Company (the “Optionholders” and “Warrantholders”, respectively) own all of the issued and outst

In [74]:
answer_generation_llm = ChatOpenAI(model="gpt-4-1106-preview", temperature=0)

answer_schema = ResponseSchema(
    name="answer",
    description="an answer to the question"
)

answer_response_schemas = [
    answer_schema,
]

answer_output_parser = StructuredOutputParser.from_response_schemas(answer_response_schemas)
format_instructions = answer_output_parser.get_format_instructions()

qa_template = """\
You are a University Professor creating a test for advanced students. For each question and context, create an answer.

answer: a answer about the context.

Format the output as JSON with the following keys:
answer

question: {question}
context: {context}
"""

prompt_template = ChatPromptTemplate.from_template(template=qa_template)

messages = prompt_template.format_messages(
    context=qac_triples[0]["context"],
    question=qac_triples[0]["question"],
    format_instructions=format_instructions
)

answer_generation_chain = bare_template | answer_generation_llm

response = answer_generation_chain.invoke({"content" : messages})
output_dict = answer_output_parser.parse(response.content)

In [75]:
for k, v in output_dict.items():
  print(k)
  print(v)

answer
The purpose of the document is to outline the terms and conditions of a stock purchase agreement between the buyer, the target company, the sellers listed, and the sellers' representative. It serves as a preliminary document to facilitate discussions and is not intended to create a legally binding agreement until a definitive written agreement is executed by all parties involved.
question
What is the purpose of the document?


In [76]:
for triple in tqdm(qac_triples):
  messages = prompt_template.format_messages(
      context=triple["context"],
      question=triple["question"],
      format_instructions=format_instructions
  )
  response = answer_generation_chain.invoke({"content" : messages})
  try:
    output_dict = answer_output_parser.parse(response.content)
  except Exception as e:
    continue
  triple["answer"] = output_dict["answer"]

100%|██████████| 10/10 [00:50<00:00,  5.04s/it]


In [77]:
!pip install -q -U datasets

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [78]:
import pandas as pd
from datasets import Dataset

ground_truth_qac_set = pd.DataFrame(qac_triples)
ground_truth_qac_set["context"] = ground_truth_qac_set["context"].map(lambda x: str(x.page_content))
ground_truth_qac_set = ground_truth_qac_set.rename(columns={"answer" : "ground_truth"})


eval_dataset = Dataset.from_pandas(ground_truth_qac_set)

In [79]:
eval_dataset

Dataset({
    features: ['question', 'context', 'ground_truth'],
    num_rows: 10
})

In [80]:
eval_dataset[0]

{'question': 'What is the purpose of the document?',
 'context': '[R&G Draft 12.__.2021]\n\nSTOCK PURCHASE AGREEMENT\n\nBY AND AMONG\n\n[BUYER],\n\n[TARGET COMPANY],\n\nTHE SELLERS LISTED ON SCHEDULE I HERETO\n\nAND\n\nTHE SELLERS’ REPRESENTATIVE NAMED HEREIN\n\nDated as of [●]\n\n[This document is intended solely to facilitate discussions among the parties identified herein.  Neither this document nor such discussions are intended to create, nor will either or both be deemed to create, a legally binding or enforceable offer or agreement of any type or nature, unless and until a definitive written agreement is executed and delivered by each of the parties hereto.\n\nThis document shall be kept confidential pursuant to the terms of the Confidentiality Agreement entered into by the parties and, if applicable, its affiliates with respect to the subject matter hereof.]\n\n112923184_5\n\n112923184_5\n\n\n\nTABLE OF CONTENTS\n\nARTICLE I DEFINITIONS; CERTAIN RULES OF CONSTRUCTION\t2\n\nSecti

In [81]:
eval_dataset.to_csv("../data/ground_truth/groundtruth_eval_dataset.csv")

Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 95.16ba/s]


12688

In [83]:
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
    context_relevancy,
    answer_correctness,
    answer_similarity
)

from ragas.metrics.critique import harmfulness
from ragas import evaluate

def create_ragas_dataset(rag_pipeline, eval_dataset):
  rag_dataset = []
  for row in tqdm(eval_dataset):
    answer = rag_pipeline.invoke({"question" : row["question"]})
    rag_dataset.append(
        {"question" : row["question"],
         "answer" : answer["response"].content,
         "contexts" : [context.page_content for context in answer["context"]],
         "ground_truths" : [row["ground_truth"]]
         }
    )
  rag_df = pd.DataFrame(rag_dataset)
  rag_eval_dataset = Dataset.from_pandas(rag_df)
  return rag_eval_dataset

def evaluate_ragas_dataset(ragas_dataset):
  result = evaluate(
    ragas_dataset,
    metrics=[
        context_precision,
        faithfulness,
        answer_relevancy,
        context_recall,
        context_relevancy,
        answer_correctness,
        answer_similarity
    ],
  )
  return result

In [84]:
from tqdm import tqdm
import pandas as pd

basic_qa_ragas_dataset = create_ragas_dataset(retrieval_augmented_qa_chain, eval_dataset)

100%|██████████| 10/10 [00:16<00:00,  1.65s/it]


In [85]:
basic_qa_ragas_dataset

Dataset({
    features: ['question', 'answer', 'contexts', 'ground_truths'],
    num_rows: 10
})

In [86]:
basic_qa_ragas_dataset[0]

{'question': 'What is the purpose of the document?',
 'answer': 'Answer: The purpose of the document is solely to facilitate discussions among the parties identified herein. It is not intended to create a legally binding or enforceable offer or agreement unless a definitive written agreement is executed.',
 'contexts': ['[This document is intended solely to facilitate discussions among the parties identified herein.  Neither this document nor such discussions are intended to create, nor will either or both be deemed to create, a legally binding or enforceable offer',
  'this document nor such discussions are intended to create, nor will either or both be deemed to create, a legally binding or enforceable offer or agreement of any type or nature, unless and until a definitive written agreement is executed and'],
 'ground_truths': ["The purpose of the document is to outline the terms and conditions of a stock purchase agreement between the buyer, the target company, the sellers listed, a

In [87]:
basic_qa_result = evaluate_ragas_dataset(basic_qa_ragas_dataset)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 70/70 [00:16<00:00,  4.30it/s]


In [90]:
basic_qa_result

{'context_precision': 0.8500, 'faithfulness': 0.9583, 'answer_relevancy': 0.6987, 'context_recall': 0.8000, 'context_relevancy': 0.3243, 'answer_correctness': 0.6172, 'answer_similarity': 0.9088}

### Testing Other Retrievers

Now we can test our how changing our Retriever impacts our RAGAS evaluation!

We'll build this simple qa_chain factory to create standardized qa_chains where the only different component will be the retriever.

In [91]:
def create_qa_chain(retriever):
  primary_qa_llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
  created_qa_chain = (
    {"context": itemgetter("question") | retriever,
     "question": itemgetter("question")
    }
    | RunnablePassthrough.assign(
        context=itemgetter("context")
      )
    | {
         "response": prompt | primary_qa_llm,
         "context": itemgetter("context"),
      }
  )

  return created_qa_chain

#### Parent Document Retriever

One of the easier ways we can imagine improving a retriever is to embed our documents into small chunks, and then retrieve a significant amount of additional context that "surrounds" the found context.

You can read more about this method [here](https://python.langchain.com/docs/modules/data_connection/retrievers/parent_document_retriever)!

The basic outline of this retrieval method is as follows:

1. Obtain User Question
2. Retrieve child documents using Dense Vector Retrieval
3. Merge the child documents based on their parents. If they have the same parents - they become merged.
4. Replace the child documents with their respective parent documents from an in-memory-store.
5. Use the parent documents to augment generation.

In [92]:
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore

parent_splitter = RecursiveCharacterTextSplitter(chunk_size=1500)
child_splitter = RecursiveCharacterTextSplitter(chunk_size=200)

vectorstore = Chroma(collection_name="split_parents", embedding_function=OpenAIEmbeddings())

store = InMemoryStore()

In [93]:
parent_document_retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter,
)

In [95]:
parent_document_retriever.add_documents(docs)

Let's create, test, and then evaluate our new chain!

In [96]:
parent_document_retriever_qa_chain = create_qa_chain(parent_document_retriever)

In [104]:
parent_document_retriever_qa_chain.invoke({"question" : "How much is the escrow amount?"})["response"].content

'Answer: The escrow amount is $1,000,000.'

In [105]:
pdr_qa_ragas_dataset = create_ragas_dataset(parent_document_retriever_qa_chain, eval_dataset)

100%|██████████| 10/10 [00:16<00:00,  1.69s/it]


In [106]:
pdr_qa_ragas_dataset[0]

{'question': 'What is the purpose of the document?',
 'answer': 'Answer: The purpose of the document is to facilitate discussions among the parties involved in a potential stock purchase agreement. It is stated that the document is not intended to create a legally binding or enforceable offer or agreement unless a definitive written agreement is executed and delivered by each of the parties.',
 'contexts': ['Schedule\xa03.14 lists all Employee Plans which an Acquired Company sponsors or maintains, or to which an Acquired Company contributes or is obligated to contribute for the benefit of any current or former employee, director, consultant, or other individual service provider of an Acquired Company or the beneficiaries or dependents of any such Person (each a “Company Plan”).  With respect to each Company Plan, the Company has delivered to the Buyer accurate and complete copies of each of the following:  (i)\xa0if the plan has been reduced to writing, the plan document together with 

In [107]:
pdr_qa_ragas_dataset.to_csv("../data/pdr_qa_ragas_dataset.csv")

Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 284.38ba/s]


24076

In [108]:
pdr_qa_result = evaluate_ragas_dataset(pdr_qa_ragas_dataset)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 70/70 [00:16<00:00,  4.22it/s]


In [109]:
pdr_qa_result

{'context_precision': 0.8917, 'faithfulness': 0.9630, 'answer_relevancy': 0.9756, 'context_recall': 0.9333, 'context_relevancy': 0.1646, 'answer_correctness': 0.7775, 'answer_similarity': 0.9641}

#### Ensemble Retrieval

Next let's look at ensemble retrieval!

You can read more about this [here](https://python.langchain.com/docs/modules/data_connection/retrievers/ensemble)!

The basic idea is as follows:

1. Obtain User Question
2. Hit the Retriever Pair
    - Retrieve Documents with BM25 Sparse Vector Retrieval
    - Retrieve Documents with Dense Vector Retrieval Method
3. Collect and "fuse" the retrieved docs based on their weighting using the [Reciprocal Rank Fusion](https://plg.uwaterloo.ca/~gvcormac/cormacksigir09-rrf.pdf) algorithm into a single ranked list.
4. Use those documents to augment our generation.

Ensure your `weights` list - the relative weighting of each retriever - sums to 1!

In [110]:
!pip install -q -U rank_bm25

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [119]:
from langchain.retrievers import BM25Retriever, EnsembleRetriever

text_splitter = RecursiveCharacterTextSplitter(chunk_size=450, chunk_overlap=75)
b_docs = text_splitter.split_documents(docs)

bm25_retriever = BM25Retriever.from_documents(b_docs)
bm25_retriever.k = 2

embedding = OpenAIEmbeddings()
vectorstore = Chroma.from_documents(b_docs, embedding)
chroma_retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever, chroma_retriever], weights=[0.75, 0.25])

In [120]:
ensemble_retriever_qa_chain = create_qa_chain(ensemble_retriever)

In [121]:
ensemble_retriever_qa_chain.invoke({"question" : "How much is the escrow amount?"})["response"].content

"I don't know."

In [129]:
ensemble_qa_ragas_dataset = create_ragas_dataset(ensemble_retriever_qa_chain, eval_dataset)

100%|██████████| 10/10 [00:17<00:00,  1.78s/it]


In [130]:
ensemble_qa_ragas_dataset.to_csv("../data/ensemble_qa_ragas_dataset.csv")

Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 469.53ba/s]


21551

In [131]:
ensemble_qa_result = evaluate_ragas_dataset(ensemble_qa_ragas_dataset)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 70/70 [00:18<00:00,  3.80it/s]


In [132]:
ensemble_qa_result

{'context_precision': 0.6268, 'faithfulness': 0.9630, 'answer_relevancy': 0.8949, 'context_recall': 0.7333, 'context_relevancy': 0.1759, 'answer_correctness': 0.7314, 'answer_similarity': 0.9442}

In [133]:
basic_qa_result

{'context_precision': 0.8500, 'faithfulness': 0.9583, 'answer_relevancy': 0.6987, 'context_recall': 0.8000, 'context_relevancy': 0.3243, 'answer_correctness': 0.6172, 'answer_similarity': 0.9088}

In [134]:
pdr_qa_result

{'context_precision': 0.8917, 'faithfulness': 0.9630, 'answer_relevancy': 0.9756, 'context_recall': 0.9333, 'context_relevancy': 0.1646, 'answer_correctness': 0.7775, 'answer_similarity': 0.9641}

In [135]:
ensemble_qa_result

{'context_precision': 0.6268, 'faithfulness': 0.9630, 'answer_relevancy': 0.8949, 'context_recall': 0.7333, 'context_relevancy': 0.1759, 'answer_correctness': 0.7314, 'answer_similarity': 0.9442}

In [136]:
ensemble_qa_result_df = ensemble_qa_result.to_pandas()

In [137]:
ensemble_qa_result_df

Unnamed: 0,question,answer,contexts,ground_truths,ground_truth,context_precision,faithfulness,answer_relevancy,context_recall,context_relevancy,answer_correctness,answer_similarity
0,What is the purpose of the document?,Answer: The purpose of the document is to faci...,[above-named courts. Notwithstanding the prev...,[The purpose of the document is to outline the...,The purpose of the document is to outline the ...,0.477778,1.0,0.0,0.0,0.1,0.648741,0.880679
1,What does Section 2.06 of the contract cover?,Answer: Section 2.06 of the contract covers th...,[ARTICLE II PURCHASE AND SALE OF SHARES AND WA...,[Section 2.06 of the contract covers the 'Trea...,Section 2.06 of the contract covers the 'Treat...,0.916667,1.0,1.0,1.0,0.045455,0.608355,0.9335
2,What is discussed in Section 3.14 of the contr...,Answer: Section 3.14 of the contract discusses...,[subject to tax pursuant to Section 3(i) of th...,[Section 3.14 of the contract discusses 'Emplo...,Section 3.14 of the contract discusses 'Employ...,0.583333,1.0,0.9858,1.0,0.0,0.996476,0.985905
3,What is the topic of Section 6.03?,Answer: The topic of Section 6.03 is related t...,[subject to tax pursuant to Section 3(i) of th...,[The topic of Section 6.03 is Publicity.],The topic of Section 6.03 is Publicity.,0.0,1.0,1.0,0.0,0.333333,0.213214,0.852857
4,Who are the parties involved in the Stock Purc...,The parties involved in the Stock Purchase Agr...,[Specific Performance. Each of the parties ac...,[The parties involved in the Stock Purchase Ag...,The parties involved in the Stock Purchase Agr...,0.805556,1.0,1.0,1.0,0.0,0.991154,0.964597
5,Who owns all of the outstanding shares of Comm...,Answer: The Shareholders own all of the outsta...,[“Fully-Diluted Common Share Number” means the...,[The Shareholders own all of the outstanding s...,The Shareholders own all of the outstanding sh...,0.679167,1.0,1.0,1.0,0.222222,0.995506,0.982025
6,What is the purpose of cancelling the Options ...,Answer: The purpose of cancelling the Options ...,"[WHEREAS, the parties further desire that all ...",[The purpose of cancelling the Options and War...,The purpose of cancelling the Options and Warr...,1.0,1.0,0.999492,0.333333,0.25,0.536165,0.944659
7,What is the meaning of 'Accounting Principles'...,Answer: The meaning of 'Accounting Principles'...,[“Vested Options Consideration” means the aggr...,"[In the context of this Agreement, 'Accounting...","In the context of this Agreement, 'Accounting ...",0.416667,1.0,0.981574,1.0,0.083333,0.74398,0.97592
8,What does the term 'Action' encompass in this ...,Answer: The term 'Action' in this context enco...,[Severability. Any term or provision of this ...,"[In the provided context, the term 'Action' re...","In the provided context, the term 'Action' ref...",0.583333,,1.0,1.0,0.6,0.742155,0.968621
9,What is the definition of 'Ancillary Agreement...,Answer: The definition of 'Ancillary Agreement...,"[Affiliates after Closing, and (b) without du...","[In the provided context, 'Ancillary Agreement...","In the provided context, 'Ancillary Agreements...",0.805556,0.666667,0.982622,1.0,0.125,0.838308,0.953231


In [138]:
def create_df_dict(pipeline_name, pipeline_items):
  df_dict = {"name" : pipeline_name}
  for name, score in pipeline_items:
    df_dict[name] = score
  return df_dict

In [139]:
basic_rag_df_dict = create_df_dict("basic_rag", basic_qa_result.items())

In [140]:
pdr_rag_df_dict = create_df_dict("pdr_rag", pdr_qa_result.items())

In [141]:
ensemble_rag_df_dict = create_df_dict("ensemble_rag", ensemble_qa_result.items())

In [142]:
results_df = pd.DataFrame([basic_rag_df_dict, pdr_rag_df_dict, ensemble_rag_df_dict])

In [143]:
results_df.sort_values("answer_correctness", ascending=False)

Unnamed: 0,name,context_precision,faithfulness,answer_relevancy,context_recall,context_relevancy,answer_correctness,answer_similarity
1,pdr_rag,0.891667,0.962963,0.975602,0.933333,0.164649,0.777451,0.96409
2,ensemble_rag,0.626806,0.962963,0.894949,0.733333,0.175934,0.731405,0.944199
0,basic_rag,0.85,0.958333,0.698677,0.8,0.324286,0.617196,0.908789


### ❓QUESTION❓

What conclusions can you draw about the above results?

Describe in your own words what the metrics are expressing.