# This notebook explores using LangChain to extract important information and answer questions from complex documents.
# Here we try to answer questions about a legal document (a sample services agreement).

In [12]:
# Folder that is walked later for the .txt / .pdf documents to index.
data_dir = "userdata/"

# The OpenAI key is kept in a text file outside the notebook folder;
# surrounding whitespace (e.g. a trailing newline) is dropped.
with open('../keys/OpenAI.txt', mode='r') as file:
    openai_api_key = file.read().strip()



In [13]:
from langchain import OpenAI

# The vectorstore we'll be using
from langchain.vectorstores import FAISS

# The LangChain component we'll use to get the documents
from langchain.chains import RetrievalQA

# The easy document loader for text
from langchain.document_loaders import TextLoader

# The embedding engine that will convert our text to vectors
from langchain.embeddings.openai import OpenAIEmbeddings

# Completion model, created with temperature=0; this is the instance handed to the RetrievalQA chain below.
llm = OpenAI(temperature=0, openai_api_key=openai_api_key)

In [14]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


In [15]:
import os

root_dir = data_dir
docs = []

# The splitter uses the same settings for every file, so build it once up front.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=400)

# Go through each folder
for dirpath, dirnames, filenames in os.walk(root_dir):

    # Go through each file
    for file in filenames:
        path = os.path.join(dirpath, file)
        # Decide on the real extension (case-insensitive) rather than a substring
        # match, so names such as "notes.txt.bak" are not mistaken for text files.
        extension = os.path.splitext(file)[1].lower()
        try:
            if extension == ".txt":
                loader = TextLoader(path, encoding='utf-8')
            elif extension == ".pdf":
                loader = PyPDFLoader(path)
            else:
                # Unsupported file type: skip it. Falling through here would reuse
                # the loader left over from the previous file and add its chunks twice.
                continue

            # Load up the file as a doc and split it into chunks
            doc = loader.load()
            docs.extend(text_splitter.split_documents(doc))
        except Exception as e:
            # Best effort: report which file failed and carry on with the rest.
            print(f"Skipping {path}: {e}")

In [16]:
# Get the total number of characters so we can see the average later
num_total_characters = sum([len(x.page_content) for x in docs])

print (f"Now you have {len(docs)} documents that have an average of {num_total_characters / len(docs):,.0f} characters (smaller pieces)")

Now you have 8 documents that have an average of 1,726 characters (smaller pieces)


In [17]:
# Get your embeddings engine ready
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)

# Embed your documents and combine with the raw text in a pseudo db. Note: This will make an API call to OpenAI
docsearch = FAISS.from_documents(docs, embeddings)

In [18]:
# Question-answering chain: the FAISS index is exposed as a retriever and the chain is created with chain_type="stuff".
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=docsearch.as_retriever())

In [19]:
# Ask the RetrievalQA chain to draft a new agreement modelled on the indexed document.
# The bare qa.run(...) expression is left last so the notebook displays the answer.
query = "write a similar services agreement between A Pvt. Ltd. and B Pvt. Ltd where A is service provider for cloud services to B"
qa.run(query)

" I don't know."

In [20]:
# Toy example: hand the model a few facts inline and ask a question about them.
llm = OpenAI(temperature=0, openai_api_key=openai_api_key)

context = """
Rachel is 30 years old
Bob is 45 years old
Kevin is 65 years old
"""
question = "Who is under 40 years old?"

# The prompt is simply the facts followed directly by the question.
output = llm(context + question)

# Trim leading/trailing whitespace from the completion before printing it
print(output.strip())

Rachel is under 40 years old.


In [21]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import ConversationalRetrievalChain

In [22]:
# The chunks in `docs` were already split when the files were loaded, so they are
# indexed as-is here (no additional CharacterTextSplitter pass).

embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
vectorstore = Chroma.from_documents(docs, embeddings)

Using embedded DuckDB without persistence: data will be transient


In [48]:
from langchain.memory import ConversationBufferMemory
# Conversation memory stored under the "chat_history" key, returning message objects.
# NOTE(review): the chain actually constructed in the next cell is not given this object;
# memory=memory is only mentioned in a comment there.
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

In [52]:
# Conversational chain over the Chroma index; the retrieved source documents are
# returned together with each answer.
# Alternative left disabled: additionally pass memory=memory so the chain keeps the chat history itself.
qa = ConversationalRetrievalChain.from_llm(
    OpenAI(temperature=1, openai_api_key=openai_api_key),
    vectorstore.as_retriever(),
    return_source_documents=True,
)

In [53]:
query = "Using this sample document, draft an agreement for me between A and B"
# The conversational chain is built without a memory object, so "chat_history" is a
# required input; an empty history is supplied explicitly for this first question.
result = qa({"question": query, "chat_history": []})

In [None]:
# NOTE(review): this question is about a presidential speech, not the agreement loaded
# into `docs` — it looks like a leftover from an example; confirm before running.
chat_history = []
query = "What did the president say about Ketanji Brown Jackson"
result = qa({"question": query, "chat_history": chat_history})

In [54]:
# Inspect the first retrieved chunk; the key is present because the chain was built with return_source_documents=True.
result['source_documents'][0]

Document(page_content='7  \nName: [__]  \nDesignation: [__]  \n \n \n \nFor and on behalf of [●] \n \n \n_______________________  \n \nName: [ __] \nDesignation: [__]', metadata={'source': 'userdata/sample_agreement.pdf', 'page': 6})

In [44]:
query = "Start writing the document, and ask questions wherever required for drafting"
# The conversational chain is built without a memory object, so "chat_history" must be
# passed with every call; an empty history is supplied here.
result = qa({"question": query, "chat_history": []})

In [45]:
# Display the answer text from the most recent chain call.
result["answer"]

'  The best way to begin writing the document is by identifying each party involved, their roles and responsibilities, and what the main points of the agreement are. Ask questions throughout the drafting process as they come up, so that any issues can be addressed quickly.'

In [56]:
from langchain.chains import LLMBashChain
from langchain.llms import OpenAI

# Demo of LLMBashChain: the model writes a bash command and the chain executes it.
# NOTE(review): this runs model-generated shell commands on the local machine —
# only run it in an environment where that is acceptable.
text = "Please write a bash script that prints 'Hello World' to the console."

llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
bash_chain = LLMBashChain.from_llm(llm, verbose=True)

# Left as the last expression so the notebook displays the command's output.
bash_chain.run(text)



[1m> Entering new LLMBashChain chain...[0m
Please write a bash script that prints 'Hello World' to the console.[32;1m[1;3m

```bash
echo "Hello World"
```[0m
Code: [33;1m[1;3m['echo "Hello World"'][0m
Answer: [33;1m[1;3m"Hello World"
[0m
[1m> Finished chain.[0m


'"Hello World"\r\n'

In [57]:
from langchain.llms import OpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.chains import LLMChain, HypotheticalDocumentEmbedder
from langchain.prompts import PromptTemplate

In [60]:
# Embedding client and completion model for the cells that follow.
# No temperature is passed here (unlike the temperature=0 instances elsewhere in this
# notebook), so the wrapper's default applies; this also rebinds the notebook-level `llm`.
base_embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
llm = OpenAI(openai_api_key=openai_api_key)

In [None]:
# Single-variable prompt: the user's question is substituted for {question}.
# NOTE(review): the wording refers to a State of the Union address, which does not match
# the legal agreement used in the rest of this notebook — confirm whether this cell is still needed.
prompt_template = """Please answer the user's question about the most recent state of the union address
Question: {question}
Answer:"""

prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["question"],
)

# Chain that fills the prompt and sends it to the current `llm`.
llm_chain = LLMChain(prompt=prompt, llm=llm)

In [61]:
from langchain.llms import OpenAI
from langchain.docstore.document import Document
import requests
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.prompts import PromptTemplate
import pathlib
import subprocess
import tempfile

In [62]:
# Second Chroma index over the same `docs` chunks; generate_agreement() queries it with similarity_search.
# NOTE(review): presumably this embeds the documents again through the OpenAI API — confirm cost is acceptable.
search_index = Chroma.from_documents(docs, OpenAIEmbeddings(openai_api_key=openai_api_key))

Using embedded DuckDB without persistence: data will be transient


In [64]:
from langchain.chains import LLMChain
# Prompt with two slots: {context} receives a retrieved chunk of the sample agreement and
# {prompt} receives the user's description of the changes wanted.
# The continuation lines keep their four-space indent so the template text is unchanged.
prompt_template = (
    "Use the context below to write a similar agreement with changes as described in Prompt:\n"
    "    Context: {context}\n"
    "    Prompt: {prompt}\n"
    "    Document:"
)

PROMPT = PromptTemplate(
    input_variables=["context", "prompt"],
    template=prompt_template,
)

# Completion model created with temperature=0, wired to the prompt above.
llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
chain = LLMChain(prompt=PROMPT, llm=llm)

In [70]:
topic = "A is cloud service provider to B, scope of services to be defined"
def generate_agreement(prompt, k=4):
    """Print one LLM-generated agreement section per retrieved chunk.

    The `k` chunks of `search_index` most similar to `prompt` are looked up, and
    each chunk is paired with `prompt` and sent through `chain`. The list returned
    by `chain.apply` is printed; the function itself returns None.

    Args:
        prompt: Description of the agreement to draft; used both as the similarity
            query and as the "prompt" value of the chain inputs.
        k: Number of similar chunks to retrieve (defaults to 4).
    """
    # Named `similar_chunks` so the notebook-level `docs` list is not shadowed.
    similar_chunks = search_index.similarity_search(prompt, k=k)
    inputs = [{"context": chunk.page_content, "prompt": prompt} for chunk in similar_chunks]
    print(chain.apply(inputs))

In [71]:
# Draft agreement sections for the scenario described in `topic`; the function prints the LLM outputs.
generate_agreement(prompt=topic)

[{'text': '\nThis service agreement (“Agreement”) has been executed on the [●] day of [●], 20 [●] \n \nBY AND BETWEEN  \n \n(A) [●], a cloud service provider, duly incorporated and validly existing under the (Indian) Companies \nAct, 2013, and having its registered office at [●] (hereinafter referred to as  the “Service Provider ”, which \nexpression will, unless repugnant to the context, include it s successors and assigns) of the FIRST \nPART  \n \nAND  \n \n(B) [●], a private limited company , duly incorporated and validly existing under the (Indian) Companies \nAct, 2013 , and having  its registered office at [●] (hereinafter referred to as the “Company ”, \nwhich expression will, unless repugnant to the context , include  its successors and assigns) of the \nSECOND  PART . \n \nThe Service Provider and the Company are hereinafter individually referred to as a “Party” and collectively \nas the “Parties”.  \n \n'}, {'text': '\n\n1.1 Scope of Services\n\n(a) During the Term (as defin