In [None]:
!pip install langchain langchain-community rank_bm25 pypdf unstructured chromadb
!pip install unstructured['pdf'] unstructured
!apt-get install poppler-utils
!apt-get install -y tesseract-ocr
!apt-get install -y libtesseract-dev
!pip install pytesseract
!pip install bitsandbytes accelerate peft safetensors sentencepiece

In [1]:
import os
# Show the kernel's starting directory; the next cell changes directory relative to it.
os.getcwd()

'/teamspace/studios/this_studio/Legal_Expert_Contract_Advisor_Using_Precision_RAG/notebooks/exploration'

In [2]:
# Move from notebooks/exploration up to the repository root so that relative
# paths such as "data/raw/docx" resolve. The guard keeps the cell idempotent:
# without it, every re-run climbs two more directories.
if not os.path.isdir("data/raw/docx"):
    os.chdir('../..')
os.getcwd()

'/teamspace/studios/this_studio/Legal_Expert_Contract_Advisor_Using_Precision_RAG'

In [3]:
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain.llms import HuggingFaceHub
import torch
from transformers import ( AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline, )
from langchain import HuggingFacePipeline

from langchain.retrievers import BM25Retriever, EnsembleRetriever

import os

In [4]:
from langchain_community.document_loaders import Docx2txtLoader
def get_document_from_docx(documents: list, file_path: str) -> list:
    """Load every ``.docx`` file located directly in ``file_path``.

    Args:
        documents: List that receives the loaded documents. It is extended in
            place and also returned.
        file_path: Directory to scan. Only its immediate entries are
            considered; subfolders are not searched.

    Returns:
        The same ``documents`` list, with the output of each file's loader
        appended in file-name order.

    Raises:
        OSError: If ``file_path`` cannot be listed (e.g. it does not exist).
    """
    # Sort the names: os.listdir() returns entries in arbitrary order, which
    # would make document (and downstream chunk) order vary between runs.
    docx_files = [
        os.path.join(file_path, name)
        for name in sorted(os.listdir(file_path))
        if name.endswith('.docx')
    ]

    # Parse each Word file and accumulate its documents.
    for path in docx_files:
        documents.extend(Docx2txtLoader(path).load())

    return documents

# Load the .docx files found directly in data/raw/docx (relative to the current working directory).
documents = get_document_from_docx([], "data/raw/docx")

In [5]:
# Sanity check: how many documents were loaded.
len(documents)

2

In [6]:
# Cut the loaded documents into overlapping pieces for indexing:
# at most 500 units per chunk, with 50 shared between neighbouring chunks
# (measured by the splitter's length function, characters by default).
splitter = RecursiveCharacterTextSplitter(chunk_overlap=50, chunk_size=500)
chunks = splitter.split_documents(documents)

In [7]:
from langchain_openai import OpenAIEmbeddings

In [8]:
# Embed every chunk with OpenAI embeddings and index the vectors in a Chroma store.
embeddings = OpenAIEmbeddings()
vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings)

In [9]:
# Sparse (keyword) retriever: BM25 over the same chunks, returning the top 3 matches.
keyword_retriever = BM25Retriever.from_documents(chunks)
keyword_retriever.k = 3

# Dense retriever: search over the Chroma store, returning the top 4 chunks.
vectorstore_retreiver = vectorstore.as_retriever(search_kwargs=dict(k=4))

In [10]:
# Fuse both retrievers. Weights follow the order of `retrievers`,
# so the dense results are weighted 0.3 and the BM25 results 0.7.
ensemble_retriever = EnsembleRetriever(
    retrievers=[vectorstore_retreiver, keyword_retriever],
    weights=[0.3, 0.7],
)

In [11]:
from langchain_openai import ChatOpenAI
# Chat model shared by every QA chain in this notebook; temperature=0 asks for the least random sampling.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

In [12]:
from langchain.chains import RetrievalQA

# Both chains share the same LLM and "stuff" strategy; only the retriever differs.
_qa_common = {"llm": llm, "chain_type": "stuff"}

# Baseline: dense (Chroma) retrieval only.
normal_chain = RetrievalQA.from_chain_type(retriever=vectorstore_retreiver, **_qa_common)

# Hybrid: ensemble of dense and BM25 retrieval.
hybrid_chain = RetrievalQA.from_chain_type(retriever=ensemble_retriever, **_qa_common)

In [13]:
# Ask both chains the same question so their answers can be compared.
question = "Who owns the IP?"
normal_response = normal_chain.invoke(question)
hybrid_response = hybrid_chain.invoke(question)

In [14]:
# Answer from the chain that uses only the Chroma vector-store retriever.
normal_response

{'query': 'Who owns the IP?',
 'result': 'The Company owns all worldwide rights, titles, and interests in and to each item of Company Intellectual Property, except for Intellectual Property exclusively licensed to the Company pursuant to an Inbound IP Contract.'}

In [15]:
# Answer from the chain that uses the ensemble (vector store + BM25) retriever.
hybrid_response

{'query': 'Who owns the IP?',
 'result': 'The Company owns all worldwide rights, titles, and interests in and to each item of Company Intellectual Property, except for Intellectual Property exclusively licensed to the Company pursuant to an Inbound IP Contract.'}

In [None]:
%pip install --upgrade --quiet  weaviate-client

In [28]:
import weaviate

# Credentials come from the environment so that no secret is stored in the notebook.
openai_headers = {"X-Openai-Api-Key": os.getenv("OPENAI_API_KEY")}
auth_config = weaviate.auth.AuthApiKey(api_key=os.getenv("WEAVIATE_API_KEY"))

# Connect to the hosted Weaviate sandbox; the OpenAI key is forwarded as a request header.
client = weaviate.Client(
    url="https://sandbox-rag-hrim3oyf.weaviate.network",
    auth_client_secret=auth_config,
    additional_headers=openai_headers,
)

            your code to use Python client v4 `weaviate.WeaviateClient` connections and methods.

            For Python Client v4 usage, see: https://weaviate.io/developers/weaviate/client-libraries/python
            For code migration, see: https://weaviate.io/developers/weaviate/client-libraries/python/v3_v4_migration
            


In [29]:
from langchain.retrievers.weaviate_hybrid_search import WeaviateHybridSearchRetriever

# Hybrid-search retriever backed by the Weaviate `client` created in the previous cell.
retriever = WeaviateHybridSearchRetriever(
    client=client,
    index_name="LangChain",  # name of the Weaviate index the retriever uses
    text_key="text",  # property that holds the chunk text
    attributes=[],  # no additional properties requested in results
    create_schema_if_missing=True,
)

In [None]:
# Mirror OPENAI_API_KEY under the alternate name OPENAI_APIKEY.
# NOTE(review): presumably a downstream consumer expects the name without the
# second underscore -- confirm it is still needed.
# os.environ only accepts str values, so assigning the None returned by
# os.getenv() for an unset variable would raise an opaque TypeError.
_openai_api_key = os.getenv("OPENAI_API_KEY")
if _openai_api_key is not None:
    os.environ["OPENAI_APIKEY"] = _openai_api_key
else:
    print("OPENAI_API_KEY is not set; OPENAI_APIKEY was left unchanged.")

In [31]:
# Displays the full list of loaded documents, including their complete text (very long output).
documents

[Document(page_content="[R&G Draft 12.__.2021]\n\n\t\t \n\n\t\t \n\n\n\n\n\n\n\nSTOCK PURCHASE AGREEMENT\n\nBY AND AMONG\n\n[BUYER],\n\n[TARGET COMPANY],\n\nTHE SELLERS LISTED ON SCHEDULE I HERETO\n\nAND\n\nTHE SELLERS’ REPRESENTATIVE NAMED HEREIN\n\nDated as of [●]\n\n\n\n[This document is intended solely to facilitate discussions among the parties identified herein.  Neither this document nor such discussions are intended to create, nor will either or both be deemed to create, a legally binding or enforceable offer or agreement of any type or nature, unless and until a definitive written agreement is executed and delivered by each of the parties hereto.\n\n\n\nThis document shall be kept confidential pursuant to the terms of the Confidentiality Agreement entered into by the parties and, if applicable, its affiliates with respect to the subject matter hereof.]\n\n\n\n\n\nTABLE OF CONTENTS\n\n\tARTICLE I DEFINITIONS; CERTAIN RULES OF CONSTRUCTION\t2\n\n\t\tSection 1.01\tDefinitions\t2\

In [32]:
# Upload the chunks to the Weaviate index; the call returns one id per stored chunk.
# NOTE(review): re-running this cell presumably stores the same chunks again -- confirm before re-executing.
retriever.add_documents(chunks)

['e40152a1-a08a-4619-962a-c4f2f7b5ee4d',
 'e1e9a321-ac7b-431e-b13c-5a91be13f160',
 'bb6a09e8-50fd-4846-a987-ab3e8bafe559',
 '1858070f-967b-4158-9c0a-d21a55556867',
 '15729957-4189-4e84-a70e-856fd193c480',
 'e428ef8b-850f-493e-acf6-045424dcf96b',
 'a68b72a8-c9c3-47e5-bdf5-ef042082ba5d',
 '33635e72-71ed-44e6-bc09-6e9452733e36',
 '971d5ef8-1fcc-4eb7-b53b-b9620094e223',
 '9800962c-2b6f-49ef-82c3-b26cc0436b00',
 'aa184072-01c3-4849-8b24-b966192f965b',
 'd223a5c5-56e2-416e-8597-69d3c65d8d6b',
 '02771355-6336-46bf-8001-66caa9399b9d',
 '038679e7-9244-4145-8785-df7f1a66746a',
 '2f67d97e-edbd-4e3d-97b3-accbc3b7867c',
 'db98c728-4ac0-4443-992f-232869129f12',
 'ce795d85-4012-44dd-9b2a-4246edc0c208',
 '46c0ee60-6cb4-4113-827e-d959b7f01f4e',
 '44b6970b-aec1-4ca1-97d9-2a7fcef70c2d',
 '0cc108b9-6549-4016-9983-e2b0113eefbe',
 'b3164207-a7c4-4b9a-bb0b-3f66d4638960',
 'a7c328ce-0580-47db-a91c-a75912066fce',
 'baf45589-1043-4252-bf11-57be8053ea61',
 'f477d93a-c02c-4756-a767-7ccd229dc862',
 'd2a1e7aa-0ef8-

In [33]:
# Rebinds `hybrid_chain` (previously built on the ensemble retriever) to a chain
# that retrieves through the Weaviate hybrid-search retriever instead.
hybrid_chain = RetrievalQA.from_chain_type(
    llm=llm, chain_type="stuff", retriever=retriever
)

In [34]:
# Same question as before, now answered through the Weaviate-backed chain.
hybrid_chain.invoke("Who owns the IP?")

{'query': 'Who owns the IP?',
 'result': 'The Company owns all worldwide rights, titles, and interests in and to each item of Company Intellectual Property, except for Intellectual Property exclusively licensed to the Company pursuant to an Inbound IP Contract.'}