In [1]:
from langchain_community.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores.chroma import Chroma
from langchain_community.document_loaders import Docx2txtLoader
from langchain.prompts import ChatPromptTemplate,SystemMessagePromptTemplate,HumanMessagePromptTemplate
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from operator import itemgetter

from langchain.chat_models import ChatOpenAI
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnableLambda, RunnablePassthrough
from langchain.output_parsers import ResponseSchema
from langchain.output_parsers import StructuredOutputParser
from tqdm import tqdm
import pandas as pd
from datasets import Dataset
import os
import chromadb
import shutil
import re
from dotenv import load_dotenv
load_dotenv()


OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

import sys
sys.path.append('../')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def remove_special_characters(input_string):
    # Define a regex pattern to match the special characters
    pattern = r'[\t●\n\[\]]'
    # Use re.sub() to replace matches of the pattern with an empty string
    cleaned_string = re.sub(pattern, ' ', input_string)
    return cleaned_string

In [3]:
vectordb_path = '../data/chromadb/'
data_path = '../data/contract_data/'

In [4]:
loader = DirectoryLoader(data_path, show_progress=True)
documents = loader.load()

100%|██████████| 1/1 [00:12<00:00, 12.84s/it]


In [5]:
documents[0].page_content = remove_special_characters(documents[0].page_content)
documents[0].page_content=re.sub(r'\s+', ' ',documents[0].page_content )

In [6]:
text_spliter = RecursiveCharacterTextSplitter(
    separators=["\n\n","\n",".", " ",""],
    chunk_size=500,
    chunk_overlap=10
    )
docs = text_spliter.split_documents(documents)

In [7]:
db = Chroma.from_documents(docs, OpenAIEmbeddings(),persist_directory=vectordb_path)
db.persist()

In [8]:
core_embeddings_model = OpenAIEmbeddings()

In [None]:
db.get()

In [9]:
vectorstore = Chroma(persist_directory=vectordb_path,embedding_function=core_embeddings_model)
    
retriever = vectorstore.as_retriever()

In [15]:
def generate_answer(context):

    system_template = """
        You are a legal expert tasked with acting as the best lawyer and contract analyzer. Your task is to thoroughly understand the provided context and answer questions related to legal matters, contracts, and relevant laws. You must provide accurate responses based solely on the information provided in the context. If the necessary information is not present in the context, respond with "I don't know.":
        ----
        ### CONTEXT:
        {context}
        \n
        ### # QUESTION:
        {question}
        <bot>:
        """
        
    user_template = "Question:```{question}```"
    messages = [
                SystemMessagePromptTemplate.from_template(system_template),
                HumanMessagePromptTemplate.from_template(user_template)
    ]
    qa_prompt = ChatPromptTemplate.from_messages( messages )
    
    llm = ChatOpenAI(model_name="gpt-3.5-turbo")
    memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
    conversation_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=context,
        chain_type='stuff',
        memory=memory,
        combine_docs_chain_kwargs={"prompt": qa_prompt}
    )
    return conversation_chain

In [16]:
conversation_chain = generate_answer(retriever)

  warn_deprecated(


In [17]:
user_question = "Are there any conditions to the closing?"

In [18]:
result = conversation_chain({"question": user_question})

  warn_deprecated(


In [19]:
result["answer"]

"Yes, there is a condition to the closing stated in the provided context. The Proposed Final Closing Statement and the determinations of the Closing Debt Amount, Closing Cash Amount, and Seller Transaction Expenses will be final, conclusive, and binding on the parties unless the Sellers' Representative provides a written Dispute Notice to the Buyer no later than the thirtieth (30th) Business Day after the delivery of the Proposed Final Closing Statement."

In [24]:
base_retriever = vectorstore.as_retriever(search_kwargs={"k" : 10})

In [26]:


template = """You are a legal expert tasked with acting as the best lawyer and contract analyzer. Your task is to thoroughly understand the provided context and answer questions related to legal matters, contracts, and relevant laws. You must provide accurate responses based solely on the information provided in the context. If the necessary information is not present in the context, respond with "I don't know.".
If the question can be answered as either yes or no, respond with either "Yes." or "No." first and include the explanation in your response.:

### CONTEXT
{context}

### QUESTION
Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

In [26]:


primary_qa_llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

retrieval_augmented_qa_chain = (
    # INVOKE CHAIN WITH: {"question" : "<<SOME USER QUESTION>>"}
    # "question" : populated by getting the value of the "question" key
    # "context"  : populated by getting the value of the "question" key and chaining it into the base_retriever
    {"context": itemgetter("question") | base_retriever, "question": itemgetter("question")}
    # "context"  : is assigned to a RunnablePassthrough object (will not be called or considered in the next step)
    #              by getting the value of the "context" key from the previous step
    | RunnablePassthrough.assign(context=itemgetter("context"))
    # "response" : the "context" and "question" values are used to format our prompt object and then piped
    #              into the LLM and stored in a key called "response"
    # "context"  : populated by getting the value of the "context" key from the previous step
    | {"response": prompt | primary_qa_llm, "context": itemgetter("context")}
)

In [27]:
question_schema = ResponseSchema(
    name="question",
    description="a question about the context."
)

question_response_schemas = [
    question_schema,
]

In [28]:
question_output_parser = StructuredOutputParser.from_response_schemas(question_response_schemas)
format_instructions = question_output_parser.get_format_instructions()

In [29]:
question_generation_llm = ChatOpenAI(model="gpt-3.5-turbo-16k")

bare_prompt_template = "{content}"
bare_template = ChatPromptTemplate.from_template(template=bare_prompt_template)

In [30]:
from langchain.prompts import ChatPromptTemplate

qa_template = """\
You are a University Professor creating a test for advanced students in law. For each context, create a question that is specific to the context. Avoid creating generic or general questions.

question: a question about the context.

Format the output as JSON with the following keys:
question

context: {context}
"""

prompt_template = ChatPromptTemplate.from_template(template=qa_template)

messages = prompt_template.format_messages(
    context=docs[0].page_content,
    format_instructions=format_instructions
)

question_generation_chain = bare_template | question_generation_llm

response = question_generation_chain.invoke({"content" : messages})
output_dict = question_output_parser.parse(response.content)

In [31]:
output_dict

{'question': 'What is the date of the Stock Purchase Agreement?'}

In [32]:
for k, v in output_dict.items():
  print(k)
  print(v)

question
What is the date of the Stock Purchase Agreement?


In [33]:
qac_triples = []

for text in tqdm(docs[4:10]):
  messages = prompt_template.format_messages(
      context=text,
      format_instructions=format_instructions
  )
  response = question_generation_chain.invoke({"content" : messages})
  try:
    output_dict = question_output_parser.parse(response.content)
  except Exception as e:
    continue
  output_dict["context"] = text
  qac_triples.append(output_dict)

100%|██████████| 6/6 [00:11<00:00,  1.94s/it]


In [34]:
qac_triples[3]

{'question': 'What is the purpose of Section 6.07 in the contract?',
 'context': Document(page_content='Section 6.07 Further Assurances 43  ARTICLE VII TAX MATTERS 43  Section 7.01 Tax Sharing Agreements 43  Section 7.02 Certain Taxes and Fees 43  Section 7.03 Cooperation on Tax Matters 44  ARTICLE VIII SURVIVAL; RECOURSE LIMITATIONS 44  Section 8.01 Survival 44  Section 8.02 Recourse Limitations. 44    ARTICLE IX  MISCELLANEOUS 45  Section 9.01 Notices 45  Section 9.02 Succession and Assignment; No Third-Party Beneficiaries 46  Section 9.03 Amendments and Waivers 46', metadata={'source': '../data/contract_data/Raptor_Contract.docx'})}

In [35]:
answer_generation_llm = ChatOpenAI(model="gpt-4-1106-preview", temperature=0)

answer_schema = ResponseSchema(
    name="answer",
    description="an answer to the question"
)

answer_response_schemas = [
    answer_schema,
]

answer_output_parser = StructuredOutputParser.from_response_schemas(answer_response_schemas)
format_instructions = answer_output_parser.get_format_instructions()

qa_template = """\
You are a University Professor creating a test for advanced students in law. For each question and context, create an answer.

answer: a answer about the context.

Format the output as JSON with the following keys:
answer

question: {question}
context: {context}
"""

prompt_template = ChatPromptTemplate.from_template(template=qa_template)

messages = prompt_template.format_messages(
    context=qac_triples[0]["context"],
    question=qac_triples[0]["question"],
    format_instructions=format_instructions
)

answer_generation_chain = bare_template | answer_generation_llm

response = answer_generation_chain.invoke({"content" : messages})
output_dict = answer_output_parser.parse(response.content)

In [36]:
for k, v in output_dict.items():
  print(k)
  print(v)

answer
Section 3.08 of the contract likely pertains to the obligations and conditions related to any debts and guarantees of the acquired companies. This section would address the current indebtedness, the terms and conditions of that indebtedness, any guarantees provided by the acquired companies, and the implications of those guarantees. It may also outline the representations and warranties related to the financial liabilities that the acquired companies hold, as well as any covenants regarding the incurrence of new debt or the provision of future guarantees post-acquisition.
question
What is the purpose of Section 3.08 of the contract?


In [37]:
for triple in tqdm(qac_triples):
  messages = prompt_template.format_messages(
      context=triple["context"],
      question=triple["question"],
      format_instructions=format_instructions
  )
  response = answer_generation_chain.invoke({"content" : messages})
  try:
    output_dict = answer_output_parser.parse(response.content)
  except Exception as e:
    continue
  triple["answer"] = output_dict["answer"]

100%|██████████| 5/5 [00:37<00:00,  7.43s/it]


In [38]:
import pandas as pd
from datasets import Dataset

ground_truth_qac_set = pd.DataFrame(qac_triples)
ground_truth_qac_set["context"] = ground_truth_qac_set["context"].map(lambda x: str(x))
ground_truth_qac_set = ground_truth_qac_set.rename(columns={"answer" : "ground_truth"})


eval_dataset = Dataset.from_pandas(ground_truth_qac_set)

In [39]:
eval_dataset

Dataset({
    features: ['question', 'context', 'ground_truth'],
    num_rows: 5
})

In [40]:
eval_dataset[0]

{'question': 'What is the purpose of Section 3.08 of the contract?',
 'context': "page_content='Section 3.04 Noncontravention 20  Section 3.05 Capitalization of the Acquired Companies 21  Section 3.06 Financial Matters 22  Section 3.07 Absence of Certain Developments 22  Section 3.08 Debt; Guarantees 24  Section 3.09 Assets 25  Section 3.10 Real Property 25  Section 3.11 Intellectual Property 26  Section 3.12 Legal Compliance; Illegal Payments; Permits 29  Section 3.13 Tax Matters 30  Section 3.14 Employee Benefit Plans 32  Section 3.15 Environmental Matters 33  Section 3.16 Contracts 34' metadata={'source': '../data/contract_data/Raptor_Contract.docx'}",
 'ground_truth': "The purpose of Section 3.08 of the contract, titled 'Debt; Guarantees', is likely to outline the obligations and conditions related to any debts of the acquired companies and the guarantees provided by or to them. This section would detail the existing debts, the terms of repayment, any covenants or restrictions asso

In [41]:
eval_dataset.to_csv("../data/ground_truth/groundtruth_eval_dataset.csv")

Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 73.37ba/s]


5045

In [42]:
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
    context_relevancy,
    answer_correctness,
    answer_similarity
)

from ragas.metrics.critique import harmfulness
from ragas import evaluate

def create_ragas_dataset(rag_pipeline, eval_dataset):
  rag_dataset = []
  for row in tqdm(eval_dataset):
    answer = rag_pipeline.invoke({"question" : row["question"]})
    rag_dataset.append(
        {"question" : row["question"],
         "answer" : answer["response"].content,
         "contexts" : [context.page_content for context in answer["context"]],
         "ground_truths" : [row["ground_truth"]]
         }
    )
  rag_df = pd.DataFrame(rag_dataset)
  rag_eval_dataset = Dataset.from_pandas(rag_df)
  return rag_eval_dataset

def evaluate_ragas_dataset(ragas_dataset):
  result = evaluate(
    ragas_dataset,
    metrics=[
        context_precision,
        faithfulness,
        answer_relevancy,
        context_recall,
        context_relevancy,
        answer_correctness,
        answer_similarity
    ],
  )
  return result

In [43]:
from tqdm import tqdm
import pandas as pd

basic_qa_ragas_dataset = create_ragas_dataset(retrieval_augmented_qa_chain, eval_dataset)

100%|██████████| 5/5 [00:12<00:00,  2.55s/it]


In [44]:
basic_qa_ragas_dataset

Dataset({
    features: ['question', 'answer', 'contexts', 'ground_truths'],
    num_rows: 5
})

In [45]:
basic_qa_ragas_dataset[0]

{'question': 'What is the purpose of Section 3.08 of the contract?',
 'answer': "I don't know.",
 'contexts': ['If the final judgment of a court of competent jurisdiction declares that any term or provision of this Section 6.08 is invalid or unenforceable, the parties hereto agree that the court making the determination of invalidity or unenforceability will have the power to reduce the scope, duration, or area of the term or provision, to delete or modify specific words or phrases, or to replace any invalid or unenforceable term or provision with a term or provision that is valid and',
  'If the final judgment of a court of competent jurisdiction declares that any term or provision of this Section 6.08 is invalid or unenforceable, the parties hereto agree that the court making the determination of invalidity or unenforceability will have the power to reduce the scope, duration, or area of the term or provision, to delete or modify specific words or phrases, or to replace any invalid o

In [46]:
basic_qa_result = evaluate_ragas_dataset(basic_qa_ragas_dataset)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 35/35 [00:33<00:00,  1.06it/s]


In [47]:
basic_qa_result

{'context_precision': 0.7540, 'faithfulness': 1.0000, 'answer_relevancy': 0.2000, 'context_recall': 0.8000, 'context_relevancy': 0.0981, 'answer_correctness': 0.2912, 'answer_similarity': 0.7649}

### Testing Other Retrievers

Now we can test our how changing our Retriever impacts our RAGAS evaluation!

We'll build this simple qa_chain factory to create standardized qa_chains where the only different component will be the retriever.

In [27]:
def create_qa_chain(retriever):
  primary_qa_llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
  created_qa_chain = (
    {"context": itemgetter("question") | retriever,
     "question": itemgetter("question")
    }
    | RunnablePassthrough.assign(
        context=itemgetter("context")
      )
    | {
         "response": prompt | primary_qa_llm,
         "context": itemgetter("context"),
      }
  )

  return created_qa_chain

#### Parent Document Retriever

One of the easier ways we can imagine improving a retriever is to embed our documents into small chunks, and then retrieve a significant amount of additional context that "surrounds" the found context.

You can read more about this method [here](https://python.langchain.com/docs/modules/data_connection/retrievers/parent_document_retriever)!

The basic outline of this retrieval method is as follows:

1. Obtain User Question
2. Retrieve child documents using Dense Vector Retrieval
3. Merge the child documents based on their parents. If they have the same parents - they become merged.
4. Replace the child documents with their respective parent documents from an in-memory-store.
5. Use the parent documents to augment generation.

In [28]:
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore

parent_splitter = RecursiveCharacterTextSplitter(chunk_size=500)
child_splitter = RecursiveCharacterTextSplitter(chunk_size=1000)

vectorstore = Chroma(collection_name="split_parents", embedding_function=OpenAIEmbeddings())

store = InMemoryStore()

In [29]:
parent_document_retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter,
)

In [30]:
parent_document_retriever.add_documents(docs)

Let's create, test, and then evaluate our new chain!

In [31]:
parent_document_retriever_qa_chain = create_qa_chain(parent_document_retriever)

In [34]:
parent_document_retriever_qa_chain.invoke({"question" : "How much is the escrow amount?"})["response"].content

'Answer: The escrow amount is $1,000,000 as stated in the provided context.'

In [35]:
pdr_qa_ragas_dataset = create_ragas_dataset(parent_document_retriever_qa_chain, eval_dataset)

NameError: name 'create_ragas_dataset' is not defined

In [55]:
pdr_qa_ragas_dataset.to_pandas().to_csv("../data/pdr_qa_ragas_dataset.csv")

In [56]:
pdr_qa_result = evaluate_ragas_dataset(pdr_qa_ragas_dataset)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 35/35 [00:22<00:00,  1.58it/s]


In [57]:
pdr_qa_result

{'context_precision': 0.7833, 'faithfulness': 1.0000, 'answer_relevancy': 0.2000, 'context_recall': 0.8000, 'context_relevancy': 0.3822, 'answer_correctness': 0.2765, 'answer_similarity': 0.7630}

#### Ensemble Retrieval

Next let's look at ensemble retrieval!

You can read more about this [here](https://python.langchain.com/docs/modules/data_connection/retrievers/ensemble)!

The basic idea is as follows:

1. Obtain User Question
2. Hit the Retriever Pair
    - Retrieve Documents with BM25 Sparse Vector Retrieval
    - Retrieve Documents with Dense Vector Retrieval Method
3. Collect and "fuse" the retrieved docs based on their weighting using the [Reciprocal Rank Fusion](https://plg.uwaterloo.ca/~gvcormac/cormacksigir09-rrf.pdf) algorithm into a single ranked list.
4. Use those documents to augment our generation.

Ensure your `weights` list - the relative weighting of each retriever - sums to 1!

In [9]:
from langchain.retrievers import BM25Retriever, EnsembleRetriever

text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=10)
b_docs = text_splitter.split_documents(docs)

bm25_retriever = BM25Retriever.from_documents(b_docs)
bm25_retriever.k = 3

embedding = OpenAIEmbeddings()
vectorstore = Chroma.from_documents(b_docs, embedding)
chroma_retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever, chroma_retriever], weights=[0.75, 0.25])

In [14]:
ensemble_retriever_qa_chain = create_qa_chain(ensemble_retriever)

In [15]:
ensemble_retriever_qa_chain.invoke({"question" : "Does the Buyer needs the Sellers’ consent in the event of an assignment of the Agreement to a third party who is not a Buyer’s Affiliates?"})["response"].content

'Yes.\n\nExplanation: The document states that no party may assign the Agreement or any of its rights, interests, or obligations without the prior written approval of the other parties, with the Sellers’ Representative acting for all of the Sellers. Therefore, in the event of an assignment to a third party who is not a Buyer’s Affiliate, the Buyer would need the Sellers’ consent.'

In [16]:
ensemble_qa_ragas_dataset = create_ragas_dataset(ensemble_retriever_qa_chain, eval_dataset)

NameError: name 'create_ragas_dataset' is not defined

In [62]:
ensemble_qa_ragas_dataset.to_csv("../data/ensemble_qa_ragas_dataset.csv")

Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 152.33ba/s]


20772

In [63]:
ensemble_qa_result = evaluate_ragas_dataset(ensemble_qa_ragas_dataset)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 35/35 [00:57<00:00,  1.63s/it]


In [64]:
ensemble_qa_result

{'context_precision': 0.5073, 'faithfulness': 1.0000, 'answer_relevancy': 0.4000, 'context_recall': 0.8000, 'context_relevancy': 0.0298, 'answer_correctness': 0.3637, 'answer_similarity': 0.8146}

In [65]:
basic_qa_result

{'context_precision': 0.7540, 'faithfulness': 1.0000, 'answer_relevancy': 0.2000, 'context_recall': 0.8000, 'context_relevancy': 0.0981, 'answer_correctness': 0.2912, 'answer_similarity': 0.7649}

In [66]:
pdr_qa_result

{'context_precision': 0.7833, 'faithfulness': 1.0000, 'answer_relevancy': 0.2000, 'context_recall': 0.8000, 'context_relevancy': 0.3822, 'answer_correctness': 0.2765, 'answer_similarity': 0.7630}

In [67]:
ensemble_qa_result_df = ensemble_qa_result.to_pandas()

In [68]:
ensemble_qa_result_df

Unnamed: 0,question,answer,contexts,ground_truths,ground_truth,context_precision,faithfulness,answer_relevancy,context_recall,context_relevancy,answer_correctness,answer_similarity
0,What is the purpose of Section 3.08 of the con...,Answer: The purpose of Section 3.08 of the con...,[Debt; Guarantees. The Acquired Companies hav...,"[The purpose of Section 3.08 of the contract, ...","The purpose of Section 3.08 of the contract, t...",0.665608,1.0,1.0,1.0,0.05,0.741031,0.964122
1,What is the purpose of Section 3.17 in the con...,I don't know.,"[At the Closing, Buyer shall deposit the Escro...","[The purpose of Section 3.17 in the contract, ...","The purpose of Section 3.17 in the contract, t...",0.639683,,0.0,1.0,0.043478,0.176014,0.704056
2,What is the title of Section 4.05 in the contr...,I don't know.,[Title. Such Seller is the record and benefic...,[The title of Section 4.05 is not provided in ...,The title of Section 4.05 is not provided in t...,0.0,,0.0,0.0,0.0,0.188962,0.755653
3,What is the purpose of Section 6.07 in the con...,I don't know.,"[At the Closing, Buyer shall deposit the Escro...","[The purpose of Section 6.07, titled 'Further ...","The purpose of Section 6.07, titled 'Further A...",0.397727,,0.0,1.0,0.0,0.176091,0.704362
4,"Who are the shareholders, optionholders, and w...","Answer: The shareholders, optionholders, and w...","[I List of Shareholders, Optionholders and War...","[The shareholders, optionholders, and warranth...","The shareholders, optionholders, and warrantho...",0.833333,1.0,1.0,1.0,0.055556,0.536179,0.944701


In [69]:
def create_df_dict(pipeline_name, pipeline_items):
  df_dict = {"name" : pipeline_name}
  for name, score in pipeline_items:
    df_dict[name] = score
  return df_dict

In [70]:
basic_rag_df_dict = create_df_dict("basic_rag", basic_qa_result.items())

In [71]:
pdr_rag_df_dict = create_df_dict("pdr_rag", pdr_qa_result.items())

In [72]:
ensemble_rag_df_dict = create_df_dict("ensemble_rag", ensemble_qa_result.items())

In [73]:
results_df = pd.DataFrame([basic_rag_df_dict, pdr_rag_df_dict, ensemble_rag_df_dict])

In [74]:
results_df.sort_values("answer_correctness", ascending=False)

Unnamed: 0,name,context_precision,faithfulness,answer_relevancy,context_recall,context_relevancy,answer_correctness,answer_similarity
2,ensemble_rag,0.50727,1.0,0.4,0.8,0.029807,0.363655,0.814579
0,basic_rag,0.753995,1.0,0.2,0.8,0.098095,0.291226,0.764901
1,pdr_rag,0.783333,1.0,0.2,0.8,0.382222,0.276457,0.76295


In [9]:
from langchain_community.retrievers import BM25Retriever

In [10]:
bm25_retriever = BM25Retriever.from_documents(docs)
bm25_retriever.k = 2  # Retrieve top 2 results

print("type of bm25", type(bm25_retriever))

type of bm25 <class 'langchain_community.retrievers.bm25.BM25Retriever'>
