In [6]:
import bs4
from langchain import hub
from langchain.document_loaders import UnstructuredHTMLLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import Docx2txtLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

In [7]:
import os
os.chdir('../../')
os.getcwd()

'/teamspace/studios/this_studio/Legal_Expert_Contract_Advisor_Using_Precision_RAG'

In [8]:
import docx2txt

# extract text
text = docx2txt.process("data/raw/docx/Raptor Contract.docx", 'data/raw/docx/img')

# Save the text to a file
with open('data/raw/docx/Raptor Contract.txt', 'w') as f:
    f.write(text)

## INDEXING ####

#### Step 1 - Load Documents

In [9]:
def get_document_from_html(documents: list, file_path):
    # Load and parse HTML file found in the specified folder and subfolders
    html_files = [os.path.join(file_path, f) for f in os.listdir(file_path) if f.endswith('.html')]

    # Load and parse HTML files
    for file in html_files:
        loader = UnstructuredHTMLLoader(file)
        documents.extend(loader.load())

    return documents

In [10]:
def get_document_from_docx(documents: list, file_path):
    # Load and parse HTML file found in the specified folder and subfolders
    docx_files = [os.path.join(file_path, f) for f in os.listdir(file_path) if f.endswith('.docx')]

    # Load and parse HTML files
    for file in docx_files:
        loader = Docx2txtLoader(file)
        documents.extend(loader.load())

    return documents

In [6]:
docx_docs = get_document_from_docx([], "data/raw/docx")

In [7]:
docx_docs

[Document(page_content="[R&G Draft 12.__.2021]\n\n\t\t \n\n\t\t \n\n\n\n\n\n\n\nSTOCK PURCHASE AGREEMENT\n\nBY AND AMONG\n\n[BUYER],\n\n[TARGET COMPANY],\n\nTHE SELLERS LISTED ON SCHEDULE I HERETO\n\nAND\n\nTHE SELLERS’ REPRESENTATIVE NAMED HEREIN\n\nDated as of [●]\n\n\n\n[This document is intended solely to facilitate discussions among the parties identified herein.  Neither this document nor such discussions are intended to create, nor will either or both be deemed to create, a legally binding or enforceable offer or agreement of any type or nature, unless and until a definitive written agreement is executed and delivered by each of the parties hereto.\n\n\n\nThis document shall be kept confidential pursuant to the terms of the Confidentiality Agreement entered into by the parties and, if applicable, its affiliates with respect to the subject matter hereof.]\n\n\n\n\n\nTABLE OF CONTENTS\n\n\tARTICLE I DEFINITIONS; CERTAIN RULES OF CONSTRUCTION\t2\n\n\t\tSection 1.01\tDefinitions\t2\

In [8]:
file_path = "data/raw/html"

documents = get_document_from_html(docx_docs, file_path)

In [11]:
documents[0]

Document(page_content="[R&G Draft 12.__.2021]\n\n\t\t \n\n\t\t \n\n\n\n\n\n\n\nSTOCK PURCHASE AGREEMENT\n\nBY AND AMONG\n\n[BUYER],\n\n[TARGET COMPANY],\n\nTHE SELLERS LISTED ON SCHEDULE I HERETO\n\nAND\n\nTHE SELLERS’ REPRESENTATIVE NAMED HEREIN\n\nDated as of [●]\n\n\n\n[This document is intended solely to facilitate discussions among the parties identified herein.  Neither this document nor such discussions are intended to create, nor will either or both be deemed to create, a legally binding or enforceable offer or agreement of any type or nature, unless and until a definitive written agreement is executed and delivered by each of the parties hereto.\n\n\n\nThis document shall be kept confidential pursuant to the terms of the Confidentiality Agreement entered into by the parties and, if applicable, its affiliates with respect to the subject matter hereof.]\n\n\n\n\n\nTABLE OF CONTENTS\n\n\tARTICLE I DEFINITIONS; CERTAIN RULES OF CONSTRUCTION\t2\n\n\t\tSection 1.01\tDefinitions\t2\n

In [12]:
len(documents)

117

### Step 2 - Split Documents

In [13]:

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(documents)

In [14]:
splits[0]

Document(page_content='[R&G Draft 12.__.2021]\n\n\t\t \n\n\t\t \n\n\n\n\n\n\n\nSTOCK PURCHASE AGREEMENT\n\nBY AND AMONG\n\n[BUYER],\n\n[TARGET COMPANY],\n\nTHE SELLERS LISTED ON SCHEDULE I HERETO\n\nAND\n\nTHE SELLERS’ REPRESENTATIVE NAMED HEREIN\n\nDated as of [●]\n\n\n\n[This document is intended solely to facilitate discussions among the parties identified herein.  Neither this document nor such discussions are intended to create, nor will either or both be deemed to create, a legally binding or enforceable offer or agreement of any type or nature, unless and until a definitive written agreement is executed and delivered by each of the parties hereto.\n\n\n\nThis document shall be kept confidential pursuant to the terms of the Confidentiality Agreement entered into by the parties and, if applicable, its affiliates with respect to the subject matter hereof.]\n\n\n\n\n\nTABLE OF CONTENTS\n\n\tARTICLE I DEFINITIONS; CERTAIN RULES OF CONSTRUCTION\t2\n\n\t\tSection 1.01\tDefinitions\t2\n

In [15]:
len(splits) # length of splits

2617

### Step 3 - Embedding

In [16]:
vectorstore = Chroma.from_documents(documents=splits, 
                                    embedding=OpenAIEmbeddings())

retriever = vectorstore.as_retriever()

#### RETRIEVAL and GENERATION ####

In [17]:
# Prompt
prompt = hub.pull("rlm/rag-prompt")

# LLM
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

# Post-processing
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [18]:
prompt

ChatPromptTemplate(input_variables=['context', 'question'], metadata={'lc_hub_owner': 'rlm', 'lc_hub_repo': 'rag-prompt', 'lc_hub_commit_hash': '50442af133e61576e74536c6556cefe1fac147cad032f4377b60c436e6cdcb6e'}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"))])

In [19]:
# Chain
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [20]:
# Question
answer = rag_chain.invoke("Under what circumstances and to what extent the Sellers are responsible for a breach of representations and warranties?")

In [22]:
# Question
answer = rag_chain.invoke("What is the purpose of the escrow?")

In [23]:
print(answer) # answer

The purpose of the escrow is to hold the Escrow Amount deposited by the Buyer on behalf of the Sellers and release it to the Company Securityholders according to the Escrow Agreement. The Escrow Agent will disburse payments from the Escrow Account based on the purchase price adjustment provisions of the Agreement and the terms of the Escrow Agreement. Disputes regarding disbursements will be resolved jointly by the Buyer and the Sellers' Representative in accordance with the provisions of the Escrow Agreement.


In [2]:
%pip install --upgrade --quiet  langchain langchain-community langchainhub langchain-openai langchain-chroma bs4

Note: you may need to restart the kernel to use updated packages.


In [1]:
import bs4
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [4]:
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)


### Construct retriever ###
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)
docs = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)
vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())
retriever = vectorstore.as_retriever()


### Contextualize question ###
contextualize_q_system_prompt = """Given a chat history and the latest user question \
which might reference context in the chat history, formulate a standalone question \
which can be understood without the chat history. Do NOT answer the question, \
just reformulate it if needed and otherwise return it as is."""
contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
history_aware_retriever = create_history_aware_retriever(
    llm, retriever, contextualize_q_prompt
)


### Answer question ###
qa_system_prompt = """You are an assistant for question-answering tasks. \
Use the following pieces of retrieved context to answer the question. \
If you don't know the answer, just say that you don't know. \
Use three sentences maximum and keep the answer concise.\

{context}"""
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", qa_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)

rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)


### Statefully manage chat history ###
store = {}


def get_session_history(session_id: str) -> BaseChatMessageHistory:
    if session_id not in store:
        store[session_id] = ChatMessageHistory()
    return store[session_id]


conversational_rag_chain = RunnableWithMessageHistory(
    rag_chain,
    get_session_history,
    input_messages_key="input",
    history_messages_key="chat_history",
    output_messages_key="answer",
)

In [8]:
conversational_rag_chain.invoke(
    {"input": "What is Task Decomposition?"},
    config={
        "configurable": {"session_id": "abc123"}
    },  # constructs a key "abc123" in `store`.
)["answer"]

'Task decomposition involves breaking down a complex task into smaller and simpler steps to make it more manageable for an agent or model. This process helps in guiding the agent through the various subgoals required to achieve the overall task efficiently. Task decomposition can be facilitated through techniques like Chain of Thought and Tree of Thoughts, which help in structuring the thinking process of the model by dividing the task into multiple manageable steps.'

In [9]:
conversational_rag_chain.invoke(
    {"input": "What the various sub goals are?"},
    config={
        "configurable": {"session_id": "abc123"}
    },  # constructs a key "abc123" in `store`.
)["answer"]

'The various subgoals are the smaller, intermediate objectives that need to be achieved in order to accomplish the overall task successfully. These subgoals are part of the task decomposition process, where a complex task is broken down into more manageable steps. By identifying and addressing these subgoals, the agent or model can progress towards completing the larger task effectively.'

In [10]:
conversational_rag_chain.invoke(
    {"input": "What are common ways of doing it?"},
    config={"configurable": {"session_id": "abc123"}},
)["answer"]

'Common ways of task decomposition include using techniques like Chain of Thought (CoT) or Tree of Thoughts to break down complex tasks into smaller steps. This can be achieved through prompting the model with specific instructions or questions to guide its thinking process. Additionally, human inputs can also be used to assist in task decomposition, providing valuable insights and guidance for the model to follow.'

In [13]:
import os
os.environ["ASTRA_DB_API_ENDPOINT"] = "https://12b80b03-8ed8-4c9b-8fd7-c2bbf4504252-us-east-2.apps.astra.datastax.com" # Your database API endpoint
os.environ["ASTRA_DB_APPLICATION_TOKEN"] = "AstraCS:ypkrvouAzzNADisKnLIaKkUu:de583999860166fa5fcd5027a513e352f12c3a2dad517f0a3d17f9b54b6cbce4" # Your database application token

In [17]:
import cassio
a = {
  "clientId": "ypkrvouAzzNADisKnLIaKkUu",
  "secret": "PP4.+m5bt+CWoPwsx-6gwXxwsO,4f78fmKDx-v7Rc3uYj0gl0+.6kZ.W+i0+79W6JmZzNX.3m04ek9.Zp3AJLBvyooeGQlZGMkTqXrTbErwwaateAydfybtq0Ki+pznA",
  "token": "AstraCS:ypkrvouAzzNADisKnLIaKkUu:de583999860166fa5fcd5027a513e352f12c3a2dad517f0a3d17f9b54b6cbce4"
}

cassio.init(
    database_id="12b80b03-8ed8-4c9b-8fd7-c2bbf4504252",
    token=a["token"],
)

In [18]:
from cassio.table.cql import STANDARD_ANALYZER
from langchain_community.vectorstores import Cassandra
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()
vectorstore = Cassandra(
    embedding=embeddings,
    table_name="test_hybrid",
    body_index_options=[STANDARD_ANALYZER],
    session=None,
    keyspace=None,
)

vectorstore.add_texts(
    [
        "In 2023, I visited Paris",
        "In 2022, I visited New York",
        "In 2021, I visited New Orleans",
    ]
)

['8040779a008b43b398c1f0e988803705',
 '4f8affdd61d8429f925eeebfcb212039',
 '63a7f58a1092404593311bb2ad62844d']

In [19]:
vectorstore.as_retriever().invoke("What city did I visit last?")

[Document(page_content='In 2022, I visited New York'),
 Document(page_content='In 2023, I visited Paris'),
 Document(page_content='In 2021, I visited New Orleans')]

In [20]:
vectorstore.as_retriever(search_kwargs={"body_search": "new"}).invoke(
    "What city did I visit last?"
)

[Document(page_content='In 2022, I visited New York'),
 Document(page_content='In 2021, I visited New Orleans')]

In [21]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import (
    ConfigurableField,
    RunnablePassthrough,
)
from langchain_openai import ChatOpenAI

In [22]:
template = """Answer the question based only on the following context:
{context}
Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

model = ChatOpenAI()

retriever = vectorstore.as_retriever()

In [24]:
configurable_retriever = retriever.configurable_fields(
    search_kwargs=ConfigurableField(
        id="search_kwargs",
        name="Search Kwargs",
        description="The search kwargs to use",
    )
)

In [25]:
chain = (
    {"context": configurable_retriever, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

In [26]:
chain.invoke("What city did I visit last?")

'Paris'

In [27]:
chain.invoke("What city did I visit first?")

'New Orleans'

In [28]:
import chromadb

persistent_client = chromadb.PersistentClient()
collection = persistent_client.get_or_create_collection("collection_name")
collection.add(ids=["1", "2", "3"], documents=["a", "b", "c"])

langchain_chroma = Chroma(
    client=persistent_client,
    collection_name="collection_name",
    embedding_function=OpenAIEmbeddings(),
)

print("There are", langchain_chroma._collection.count(), "in the collection")

/teamspace/studios/this_studio/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:02<00:00, 33.7MiB/s]


There are 3 in the collection


In [30]:
langchain_chroma.asimilarity_search("a")

<coroutine object VectorStore.asimilarity_search at 0x7fc8f835fae0>

In [2]:
class Retriever:
    def __init__(self, documents: list):
        self.documents = documents
        self.persist_directory = 'db'
        self.db = Chroma.from_documents(documents, embedding=OpenAIEmbeddings(), persist_directory=self.persist_directory)
        self.vectordb = Chroma(persist_directory=self.persist_directory, embedding_function=OpenAIEmbeddings())
        # Embed and store documents
        self.embeddings = OpenAIEmbeddings()
        self.store_documents(documents)

    def store_documents(self, documents: list):
        for doc in documents:
            embedding = self.embeddings.embed(doc['text'])
            self.collection.add(doc['id'], embedding, doc)

    def retrieve(self, query: str):
        query_embedding = self.embeddings.embed(query)
        results = self.collection.query(query_embedding)
        return results

In [11]:
documents = get_document_from_docx([], "data/raw/docx")

In [12]:
documents

[Document(page_content="[R&G Draft 12.__.2021]\n\n\t\t \n\n\t\t \n\n\n\n\n\n\n\nSTOCK PURCHASE AGREEMENT\n\nBY AND AMONG\n\n[BUYER],\n\n[TARGET COMPANY],\n\nTHE SELLERS LISTED ON SCHEDULE I HERETO\n\nAND\n\nTHE SELLERS’ REPRESENTATIVE NAMED HEREIN\n\nDated as of [●]\n\n\n\n[This document is intended solely to facilitate discussions among the parties identified herein.  Neither this document nor such discussions are intended to create, nor will either or both be deemed to create, a legally binding or enforceable offer or agreement of any type or nature, unless and until a definitive written agreement is executed and delivered by each of the parties hereto.\n\n\n\nThis document shall be kept confidential pursuant to the terms of the Confidentiality Agreement entered into by the parties and, if applicable, its affiliates with respect to the subject matter hereof.]\n\n\n\n\n\nTABLE OF CONTENTS\n\n\tARTICLE I DEFINITIONS; CERTAIN RULES OF CONSTRUCTION\t2\n\n\t\tSection 1.01\tDefinitions\t2\

In [2]:
pip install langchain-ai21

Collecting langchain-ai21
  Downloading langchain_ai21-0.1.7-py3-none-any.whl.metadata (5.1 kB)
Collecting ai21<3.0.0,>=2.7.0 (from langchain-ai21)
  Downloading ai21-2.8.0-py3-none-any.whl.metadata (16 kB)
Collecting ai21-tokenizer<1.0.0,>=0.11.0 (from ai21<3.0.0,>=2.7.0->langchain-ai21)
  Downloading ai21_tokenizer-0.11.2-py3-none-any.whl.metadata (5.2 kB)
Collecting sentencepiece<1.0.0,>=0.2.0 (from ai21-tokenizer<1.0.0,>=0.11.0->ai21<3.0.0,>=2.7.0->langchain-ai21)
  Downloading sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Downloading langchain_ai21-0.1.7-py3-none-any.whl (15 kB)
Downloading ai21-2.8.0-py3-none-any.whl (76 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.3/76.3 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ai21_tokenizer-0.11.2-py3-none-any.whl (2.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.7/2.7 MB[0m [31m47.3 MB/s[0m eta [36m0:00:00[0ma [36m0:0

In [1]:
from langchain_ai21 import AI21SemanticTextSplitter

TEXT = (
    '''We’ve all experienced reading long, tedious, and boring pieces of text - financial reports, "
    "legal documents, or terms and conditions (though, who actually reads those terms and conditions to be honest?).\n"
    "Imagine a company that employs hundreds of thousands of employees. In today's information "
    "overload age, nearly 30% of the workday is spent dealing with documents. There's no surprise "
    "here, given that some of these documents are long and convoluted on purpose (did you know that "
    "reading through all your privacy policies would take almost a quarter of a year?). Aside from "
    "inefficiency, workers may simply refrain from reading some documents (for example, Only 16% of "
    "Employees Read Their Employment Contracts Entirely Before Signing!).\nThis is where AI-driven summarization "
    "tools can be helpful: instead of reading entire documents, which is tedious and time-consuming, "
    "users can (ideally) quickly extract relevant information from a text. With large language models, "
    "the development of those tools is easier than ever, and you can offer your users a summary that is "
    "specifically tailored to their preferences.\nLarge language models naturally follow patterns in input "
    "(prompt), and provide coherent completion that follows the same patterns. For that, we want to feed "
    'them with several examples in the input ("few-shot prompt"), so they can follow through. '
    "The process of creating the correct prompt for your problem is called prompt engineering, "
    "and you can read more about it here.'''
)

semantic_text_splitter = AI21SemanticTextSplitter()
texts = [TEXT]
documents = semantic_text_splitter.create_documents(
    texts=texts, metadatas=[{"pikachu": "pika pika"}]
)

print(f"The text has been split into {len(documents)} Documents.")
for doc in documents:
    print(f"metadata: {doc.metadata}")
    print(f"text: {doc.page_content}")
    print("====")

The text has been split into 2 Documents.
metadata: {'pikachu': 'pika pika', 'source_type': 'normal_text'}
text: We’ve all experienced reading long, tedious, and boring pieces of text - financial reports, "
    "legal documents, or terms and conditions (though, who actually reads those terms and conditions to be honest?).

"
    "Imagine a company that employs hundreds of thousands of employees.

In today's information "
    "overload age, nearly 30% of the workday is spent dealing with documents.

There's no surprise "
    "here, given that some of these documents are long and convoluted on purpose (did you know that "
    "reading through all your privacy policies would take almost a quarter of a year?).

Aside from "
    "inefficiency, workers may simply refrain from reading some documents (for example, Only 16% of "
    "Employees Read Their Employment Contracts Entirely Before Signing!).
====
metadata: {'pikachu': 'pika pika', 'source_type': 'normal_text'}
text: This is where AI-d