# LangChain: Q&A over Documents


In [44]:
import os
from langchain_community.document_loaders import Docx2txtLoader

In [45]:

from dotenv import load_dotenv, find_dotenv
# Locate a .env file with find_dotenv() and load it; the return value is
# discarded via `_`. Presumably this supplies the OpenAI credentials used by
# later cells — TODO confirm.
_ = load_dotenv(find_dotenv()) # read local .env file

In [46]:
# Loader for the Word (.docx) contract; the path is relative to the
# notebook's working directory.
loader = Docx2txtLoader("data/Evaluation Sets/Raptor Contract.docx")

In [70]:
# Load the contract through the loader and echo the result as the cell
# output (a list of Document objects, per the captured output).
data=loader.load()
data

[Document(page_content="[R&G Draft 12.__.2021]\n\n\t\t \n\n\t\t \n\n\n\n\n\n\n\nSTOCK PURCHASE AGREEMENT\n\nBY AND AMONG\n\n[BUYER],\n\n[TARGET COMPANY],\n\nTHE SELLERS LISTED ON SCHEDULE I HERETO\n\nAND\n\nTHE SELLERS’ REPRESENTATIVE NAMED HEREIN\n\nDated as of [●]\n\n\n\n[This document is intended solely to facilitate discussions among the parties identified herein.  Neither this document nor such discussions are intended to create, nor will either or both be deemed to create, a legally binding or enforceable offer or agreement of any type or nature, unless and until a definitive written agreement is executed and delivered by each of the parties hereto.\n\n\n\nThis document shall be kept confidential pursuant to the terms of the Confidentiality Agreement entered into by the parties and, if applicable, its affiliates with respect to the subject matter hereof.]\n\n\n\n\n\nTABLE OF CONTENTS\n\n\tARTICLE I DEFINITIONS; CERTAIN RULES OF CONSTRUCTION\t2\n\n\t\tSection 1.01\tDefinitions\t2\

In [76]:
# Show the concrete class of the loader object.
print(type(loader))

<class 'langchain_community.document_loaders.word_document.Docx2txtLoader'>


In [63]:
from docx import Document

In [79]:
# The splitter class is not imported anywhere else in the notebook, so import
# it here to keep this cell runnable on a fresh kernel.
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split on progressively finer separators (paragraph, line, word, character)
# until each piece fits within chunk_size characters; consecutive chunks
# share up to chunk_overlap characters.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_size=1600,
    chunk_overlap=200,
)

# Text splitting: break the loaded contract (`data`) into chunks.
chunks = text_splitter.split_documents(documents=data)

In [81]:
# Print one chunk (index 100) to inspect the splitter's output; this index
# is hard-coded, so it assumes at least 101 chunks were produced.
print(chunks[100])

page_content='[Reserved].\n\nExcept as set forth in Schedule 3.13(f), no Acquired Company has made any payments, or has been or is a party to any Contractual Obligation that obligate it to make payments, that have resulted or would result, separately or in the aggregate, in the payment of any “excess parachute payment” within the meaning of Code Section 280G or in the imposition of an excise Tax under Code Section 4999 (or any corresponding provisions of state, local or foreign Tax law) or that were or would not be deductible under Code Sections 162 or 404.  No Acquired Company is obligated or otherwise intends to pay any gross-up, make whole or reimbursement for any Taxes imposed under Code Section 4999 or 409A (or any corresponding provisions of state, local or foreign Tax law).\n\nNo Acquired Company has ever been a member of an “affiliated group” within the meaning of Code Section\xa01504(a) filing a consolidated federal income Tax Return (other than the “affiliated group” the comm

In [48]:
# Choose the LLM model name by date, to account for model deprecation:
# the older snapshot is used up to and including the cutoff date, and the
# newer model strictly after it.
import datetime

# Today's date.
current_date = datetime.date.today()

# Cutoff date; strictly after it the model is set to "gpt-4".
target_date = datetime.date(2024, 6, 12)

# Pick the model name from which side of the cutoff today falls on.
llm_model = "gpt-4" if current_date > target_date else "gpt-3.5-turbo-0301"

In [49]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import CSVLoader
from langchain.vectorstores import DocArrayInMemorySearch
from IPython.display import display, Markdown
from langchain.llms import OpenAI

In [50]:
from langchain.indexes import VectorstoreIndexCreator

In [83]:
# Configure an index creator backed by the in-memory DocArray vector store,
# then build the index directly from the contract loader.
index_creator = VectorstoreIndexCreator(vectorstore_cls=DocArrayInMemorySearch)
index = index_creator.from_loaders([loader])

In [52]:
# Load the contract again through the same loader, this time bound to `docs`.
docs = loader.load()

In [53]:
# Display the first loaded Document as the cell output.
docs[0]

Document(page_content="[R&G Draft 12.__.2021]\n\n\t\t \n\n\t\t \n\n\n\n\n\n\n\nSTOCK PURCHASE AGREEMENT\n\nBY AND AMONG\n\n[BUYER],\n\n[TARGET COMPANY],\n\nTHE SELLERS LISTED ON SCHEDULE I HERETO\n\nAND\n\nTHE SELLERS’ REPRESENTATIVE NAMED HEREIN\n\nDated as of [●]\n\n\n\n[This document is intended solely to facilitate discussions among the parties identified herein.  Neither this document nor such discussions are intended to create, nor will either or both be deemed to create, a legally binding or enforceable offer or agreement of any type or nature, unless and until a definitive written agreement is executed and delivered by each of the parties hereto.\n\n\n\nThis document shall be kept confidential pursuant to the terms of the Confidentiality Agreement entered into by the parties and, if applicable, its affiliates with respect to the subject matter hereof.]\n\n\n\n\n\nTABLE OF CONTENTS\n\n\tARTICLE I DEFINITIONS; CERTAIN RULES OF CONSTRUCTION\t2\n\n\t\tSection 1.01\tDefinitions\t2\n

In [87]:
from langchain.embeddings import OpenAIEmbeddings
# Embedding client created with default settings; presumably it picks up
# its API credentials from the environment — TODO confirm.
embeddings_model = OpenAIEmbeddings()

In [88]:
# Sanity check: embed a handful of short sentences in a single batch.
sample_sentences = [
    "Hi there!",
    "Oh, hello!",
    "What's your name?",
    "My friends call me World",
    "Hello World!",
]
embeddings = embeddings_model.embed_documents(sample_sentences)

# Cell output: (number of vectors, length of the first vector).
(len(embeddings), len(embeddings[0]))



(5, 1536)

In [89]:
# Embed a single query string; the result is bound to `embed`.
embed = embeddings_model.embed_query("Hi my name is Elias")

In [90]:
# Print the length of the query embedding.
print(len(embed))

1536


In [92]:
# Build the in-memory vector store from the split chunks rather than from
# the single whole-contract Document: with only one stored entry a
# similarity search has nothing to rank, and the chunks produced by the
# text splitter would otherwise go unused.
db = DocArrayInMemorySearch.from_documents(
    chunks,
    embeddings_model
)

In [94]:
# Natural-language question to ask about the contract.
query = "Please suggest what tax return is"

In [96]:
# Embed the question; this rebinds `embed`, replacing the earlier demo value.
embed = embeddings_model.embed_query(query)

In [None]:
# Retrieve the stored documents closest to the question.
# `embed` is an embedding vector (from embed_query), so the by-vector search
# is required; `similarity_search` expects the raw query string instead.
# NOTE: this rebinds `docs`, which previously held the loader output.
docs = db.similarity_search_by_vector(embed)

In [None]:
# Completion-style model used to answer the question; temperature=0 is set
# to keep the output as repeatable as possible.
llm_replacement_model = OpenAI(
    temperature=0,
    model='gpt-3.5-turbo-instruct',
)

# index.query takes the question text (not its embedding vector), and the
# model built above is only used when passed explicitly via `llm`.
response = index.query(query, llm=llm_replacement_model)

In [None]:
# Render the answer text as Markdown in the cell output.
display(Markdown(response))