In [1]:
# (Re)load python-dotenv's IPython extension, then run its %dotenv magic, which
# reads a .env file into the kernel's environment.
%reload_ext dotenv
%dotenv

In [3]:
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain_community.document_loaders import DirectoryLoader,Docx2txtLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores.chroma import Chroma

from langchain.prompts import ChatPromptTemplate,SystemMessagePromptTemplate,HumanMessagePromptTemplate
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from operator import itemgetter

from langchain.chat_models import ChatOpenAI
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnableLambda, RunnablePassthrough
from langchain.output_parsers import ResponseSchema
from langchain.output_parsers import StructuredOutputParser
from tqdm import tqdm
import pandas as pd
from datasets import Dataset
import os
import chromadb
import shutil
import random
import re
from dotenv import load_dotenv
# Populate the process environment from a .env file before any key lookup.
load_dotenv()

# Evaluates to None when the variable is absent from the environment.
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

# Make the parent directory importable for the rest of this session.
import sys
sys.path.append('../')

In [4]:
def remove_special_characters(input_string):
    """Return ``input_string`` with layout characters blanked out.

    Every tab, newline, bullet (``●``) and square bracket is replaced by a
    single space. Characters are substituted one-for-one, never deleted, so
    the length of the string is unchanged.
    """
    return re.sub(r'[\t●\n\[\]]', ' ', input_string)

# Q&A Dataset preparation

In [4]:
def create_QA(file_path):
    """Build a question / ground-truth table from a Q&A ``.docx`` file.

    The document text is extracted with ``docx2txt`` and scanned for entries
    of the form ``Q<number>: <question>`` and ``A<number>: <answer>``. Each
    entry runs to the end of its line. Questions and answers are paired by
    position (first question with first answer, and so on), not by number.

    Args:
        file_path: Path to the ``.docx`` file holding the numbered Q&A pairs.

    Returns:
        pandas.DataFrame with the columns ``question`` and ``ground_truth``,
        one row per pair, in the order the entries appear in the document.

    Raises:
        ValueError: If the number of questions found differs from the number
            of answers found.
    """
    # Kept local: docx2txt is not part of the notebook's import cell.
    import docx2txt

    text = docx2txt.process(file_path)

    # `.` never matches a newline, so a greedy `.+` captures the rest of the
    # line. Unlike a pattern that insists on a trailing "\n", this also picks
    # up an entry that sits on the very last line of the document.
    questions = re.findall(r"Q\d+: (.+)", text)
    answers = re.findall(r"A\d+: (.+)", text)

    # Report a mismatch explicitly rather than letting DataFrame construction
    # fail with a generic "arrays must be of the same length" error.
    if len(questions) != len(answers):
        raise ValueError(
            f"Found {len(questions)} questions but {len(answers)} answers "
            f"in {file_path!r}"
        )

    return pd.DataFrame({'question': questions, 'ground_truth': answers})


In [5]:
# Filesystem locations used by the notebook (relative paths).
vector_db_path = "../data/chromadb/"  # persist_directory for the Chroma store
data_path = "../data/contract_data/"  # directory handed to DirectoryLoader
qa_data_path = "../data/Evaluation Sets/Raptor Q&A2.docx"  # file parsed by create_QA

In [6]:
# Parse the evaluation document into a DataFrame of question / ground_truth rows.
df_qa = create_QA(qa_data_path)

In [7]:
# Convert the Q&A frame into a `datasets.Dataset`.
eval_dataset = Dataset.from_pandas(df_qa)

# Loading data from a directory

In [8]:

# Load the files found under data_path, showing a progress bar while they are read.
loader = DirectoryLoader(data_path, show_progress=True)
documents = loader.load()

100%|██████████| 1/1 [00:17<00:00, 17.87s/it]


# Data Cleaning

In [9]:

# Clean every loaded document rather than only the first one, so additional
# files placed in the data directory are normalised as well. Two passes each:
#   1. remove_special_characters blanks out tabs, newlines, bullets and brackets;
#   2. every run of whitespace is then collapsed to a single space.
for document in documents:
    document.page_content = re.sub(
        r'\s+', ' ', remove_special_characters(document.page_content)
    )

In [10]:
# Chunking the data

In [11]:
# Chunking configuration: a chunk_size of 500 with a chunk_overlap of 10, trying
# the separators in the order listed.
# NOTE(review): the cleaning cell collapses whitespace runs to single spaces, so
# the newline separators will not occur in cleaned text — confirm whether
# paragraph-aware splitting was intended.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=10,
    separators=["\n\n", "\n", ".", " ", ""],
)
docs = text_splitter.split_documents(documents)

## Create vector store using Chroma and save it locally

In [12]:

# Embed the chunks with OpenAIEmbeddings and build a Chroma store configured with
# vector_db_path as its persist directory, then call persist() on it.
# NOTE(review): re-running this cell against an existing directory presumably
# adds the same chunks a second time — confirm, and clear the directory first if so.
db = Chroma.from_documents(docs, OpenAIEmbeddings(),persist_directory=vector_db_path)
db.persist()

## Embedding Model

In [13]:
# Embedding model passed as embedding_function when the persisted Chroma store is reopened.
core_embeddings_model = OpenAIEmbeddings()

## Loading data from local vector store

In [14]:

# Reopen the collection stored under vector_db_path, embedding queries with
# core_embeddings_model.
vector_store = Chroma(
    persist_directory=vector_db_path,
    embedding_function=core_embeddings_model,
)

# Retriever over the reopened store, using its default search settings.
retriever = vector_store.as_retriever()

## Prompt template

In [15]:


# Prompt text for the QA chain. It has two placeholders: {context} and {question}.
# NOTE(review): the instructions conflict — one sentence tells the model to "get
# related contexts" when the context is insufficient, while the closing sentence
# restricts it to the provided context. Decide which is intended; the wording is
# left untouched here because changing it would alter model outputs.
template = """You are a legal expert tasked with acting as the best lawyer and contract analyzer. Your task is to thoroughly understand the provided context and answer questions related to legal matters, contracts, and relevant laws. If the necessary information is not present in the context use the given context, then get related contexts and answer the question. If the question cannot be answered, respond with "I don't know.".
If the question can be answered as either yes or no, respond with either "Yes." or "No." first and include the explanation in your response.
You must provide accurate responses based solely on the information provided in the context:

### CONTEXT
{context}

### QUESTION
Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

In [16]:
def create_qa_chain(retriever, model_name="gpt-3.5-turbo", temperature=0):
    """Assemble a retrieval-augmented QA chain around ``retriever``.

    The chain takes ``{"question": <text>}``, passes the question to
    ``retriever``, fills the module-level ``prompt`` with the retrieved
    context and the question, and sends the result to an OpenAI chat model.

    Args:
        retriever: Runnable that receives the question and returns the
            retrieved documents.
        model_name: OpenAI chat model to use. Defaults to the value that was
            previously hard-coded.
        temperature: Sampling temperature for the chat model.

    Returns:
        A runnable whose output is a dict with two keys: ``"response"`` (the
        chat model's reply) and ``"context"`` (the retriever's output).
    """
    # Import explicitly from langchain_openai. The notebook's import cell also
    # imports ChatOpenAI from langchain.chat_models afterwards, so the
    # module-level name refers to the deprecated class.
    from langchain_openai import ChatOpenAI

    primary_qa_llm = ChatOpenAI(model=model_name, temperature=temperature)

    retrieval_step = {
        "context": itemgetter("question") | retriever,
        "question": itemgetter("question"),
    }
    answer_step = {
        "response": prompt | primary_qa_llm,
        "context": itemgetter("context"),
    }

    # The middle step re-assigns "context" to itself. It must stay a Runnable:
    # with nothing between them, `dict | dict` would be evaluated by Python as
    # a plain dict merge instead of building a chain.
    created_qa_chain = (
        retrieval_step
        | RunnablePassthrough.assign(context=itemgetter("context"))
        | answer_step
    )
    return created_qa_chain

In [17]:
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
    context_relevancy,
    answer_correctness,
    answer_similarity
)

from ragas.metrics.critique import harmfulness
from ragas import evaluate

def create_ragas_dataset(rag_pipeline, eval_dataset):
    """Run every evaluation question through ``rag_pipeline`` and collect the results.

    Args:
        rag_pipeline: Object whose ``invoke({"question": ...})`` returns a
            mapping with a ``"response"`` (exposing ``.content``) and a
            ``"context"`` (iterable of items exposing ``.page_content``).
        eval_dataset: Iterable of rows providing ``"question"`` and
            ``"ground_truth"``.

    Returns:
        A ``Dataset`` with the columns ``question``, ``answer``, ``contexts``
        and ``ground_truths`` (a one-element list per row).
    """
    def _to_record(row):
        # One pipeline call per row; the result is flattened into plain values.
        output = rag_pipeline.invoke({"question": row["question"]})
        return {
            "question": row["question"],
            "answer": output["response"].content,
            "contexts": [doc.page_content for doc in output["context"]],
            "ground_truths": [row["ground_truth"]],
        }

    records = [_to_record(row) for row in tqdm(eval_dataset)]
    return Dataset.from_pandas(pd.DataFrame(records))

def evaluate_ragas_dataset(ragas_dataset):
    """Score ``ragas_dataset`` with the notebook's fixed list of ragas metrics.

    Returns:
        Whatever ``evaluate`` returns for the seven metrics listed below.
    """
    selected_metrics = [
        context_precision,
        faithfulness,
        answer_relevancy,
        context_recall,
        context_relevancy,
        answer_correctness,
        answer_similarity,
    ]
    return evaluate(ragas_dataset, metrics=selected_metrics)

In [18]:
from langchain.retrievers import BM25Retriever, EnsembleRetriever

# NOTE(review): `docs` already comes from a splitter with chunk_size=500, so this
# second split with chunk_size=600 presumably leaves the chunks unchanged — confirm.
# This line also rebinds the notebook-level name `text_splitter`.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=10)
b_docs = text_splitter.split_documents(docs)

# BM25 retriever over the chunks, with k set to 10.
bm25_retriever = BM25Retriever.from_documents(b_docs)
bm25_retriever.k = 10

# Embedding-based retriever over the same chunks, also with k=10. No
# persist_directory is passed here, unlike the store built earlier in the notebook.
embedding = OpenAIEmbeddings()
vectorstore = Chroma.from_documents(b_docs, embedding)
chroma_retriever = vectorstore.as_retriever(search_kwargs={"k": 10})

# The two retrievers are combined into an EnsembleRetriever in the next cell.

In [19]:
# Combine both retrievers with weights 0.75 (BM25) and 0.25 (Chroma), then build
# the QA chain on top of the ensemble.
ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever, chroma_retriever], weights=[0.75, 0.25])
ensemble_retriever_qa_chain = create_qa_chain(ensemble_retriever)

  warn_deprecated(


In [20]:
# Smoke test: ask one question and display only the text content of the reply.
ensemble_retriever_qa_chain.invoke({"question" : "How much is the escrow amount?"})["response"].content

'Answer: The escrow amount is $1,000,000.'