In [None]:
%reload_ext autoreload
%autoreload 2

In [None]:
from langchain_community.document_loaders import DirectoryLoader, PyMuPDFLoader
from langchain_community.document_loaders import UnstructuredWordDocumentLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain.vectorstores.chroma import Chroma
from langchain_community.embeddings import CohereEmbeddings
from langchain.retrievers import ParentDocumentRetriever
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
from langchain import hub
from langchain.storage import InMemoryStore
import tempfile
import re
import os
from dotenv import load_dotenv
load_dotenv()

In [None]:
# Credentials are read from the process environment (populated by load_dotenv()
# in the imports cell); each value is None when the variable is not set.
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
LANGCHAIN_API_KEY = os.getenv('LANGCHAIN_API_KEY')
COHERE_API_KEY = os.getenv('COHERE_API_KEY')

In [None]:
class LoadDocuments:
    """Loads the PDF files that sit directly inside a directory."""

    def __init__(self, data_path):
        # Directory that will be scanned for ``*.pdf`` files.
        self.data_path = data_path

    def load(self):
        """Load every file matching ``./*.pdf`` under ``data_path``.

        Returns:
            Whatever ``DirectoryLoader.load()`` returns on success. No
            splitting is performed here.

            On any exception the error is NOT raised: an
            ``(error message, 400)`` tuple is returned instead, so callers
            must check the result type before treating it as documents.
        """
        try:
            return DirectoryLoader(self.data_path, glob="./*.pdf").load()
        except Exception as err:
            return f'An error occurred {err}', 400

In [None]:
# Directory-based loader pointed at the ../data/ folder.
data_loader = LoadDocuments('../data/')

In [None]:
# NOTE: on failure load() returns an (error message, 400) tuple rather than
# raising, so `docs` is not guaranteed to be a list of documents.
docs = data_loader.load()

In [None]:
# Display the result of the load.
docs

In [None]:
class LoadDocuments:
    """Turns one uploaded file (PDF, Word or plain text) into documents.

    The upload object is expected to expose ``name`` (the original file name)
    and ``content`` (an object with ``tobytes()``).
    """

    def __init__(self, uploaded_files):
        # A single upload object, or None when nothing was uploaded.
        self.uploaded_files = uploaded_files

    def load(self):
        """Write the upload to a temporary file and parse it by extension.

        Returns:
            ``None`` when no file was supplied; otherwise the documents
            produced by the loader that matches the file extension
            (``.pdf``, ``.docx``/``.doc`` or ``.txt``, matched
            case-insensitively). An unsupported extension yields ``[]``.

        Raises:
            Whatever the selected loader raises; the temporary file is
            removed in every case.
        """
        if self.uploaded_files is None:
            return None

        file = self.uploaded_files
        payload = file.content.tobytes()
        filename = file.name
        lowered = filename.lower()

        # Keep the original extension on the temp file so loaders that look at
        # the file name still recognise the type. delete=False lets us close
        # the handle (flushing every byte to disk) BEFORE a loader re-opens the
        # file by path; reading while the writer is still open can return
        # empty/partial data and is not possible at all on Windows.
        suffix = os.path.splitext(filename)[1]
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
            tmp_file.write(payload)
            tmp_path = tmp_file.name

        try:
            if lowered.endswith('.pdf'):
                return PyMuPDFLoader(tmp_path).load_and_split()
            if lowered.endswith('.docx') or lowered.endswith('.doc'):
                return UnstructuredWordDocumentLoader(tmp_path).load()
            if lowered.endswith('.txt'):
                return list(TextLoader(tmp_path).load())
            return []
        finally:
            # delete=False means nobody else cleans this up for us.
            os.remove(tmp_path)

In [None]:
import ipywidgets as widgets
from IPython.display import display

In [None]:
# Build ONE upload widget and keep a reference to it. Previously a configured
# widget was created and immediately discarded, and a second, default one was
# the widget actually displayed.
uploader = widgets.FileUpload(
    # Only offer the file types the upload-based LoadDocuments can parse.
    accept='.pdf,.docx,.doc,.txt',
    multiple=False,  # a single file; downstream code reads uploader.value[0]
)
display(uploader)

In [None]:
# Take the first entry of the upload widget's value.
# NOTE(review): presumably fails if no file has been selected in the widget
# yet — run this cell only after uploading; confirm.
uploaded_file = uploader.value[0]

In [None]:
# Wrap the uploaded file in the upload-based LoadDocuments class.
loader = LoadDocuments(uploaded_file)

In [None]:
# Rebinds `docs`, replacing the result of the earlier directory-based load.
docs = loader.load()

In [None]:
# Inspect the first loaded document ...
docs[0]

In [None]:
# ... and its text content.
docs[0].page_content

In [None]:
class ChunkDocuments:
    """Cleans document text and splits it into overlapping chunks."""

    # Artefacts stripped from the text: "- ", " \t", bullet glyphs, newlines
    # and square brackets (same alternation as before, compiled once).
    _SPECIAL_CHARS = re.compile(r'- | \t|●|\n|\[|\]')
    # Any run of whitespace; collapsed to a single space after cleaning.
    _WHITESPACE = re.compile(r'\s+')

    def __init__(self, documents):
        # Objects exposing a mutable ``page_content`` string attribute.
        self.documents = documents

    def chunk_documents(self, chunk_size=1000, chunk_overlap=50):
        """Clean every document, then split the documents into chunks.

        Note: ``page_content`` of each input document is rewritten IN PLACE
        with the cleaned text, so the documents passed to the constructor are
        modified by this call.

        Args:
            chunk_size: maximum chunk length, measured with ``len``.
            chunk_overlap: number of characters shared by adjacent chunks.

        Returns:
            The result of ``RecursiveCharacterTextSplitter.split_documents``
            applied to the cleaned documents.
        """
        for page in self.documents:
            cleaned = self._remove_special_characters(page.page_content)
            page.page_content = self._WHITESPACE.sub(' ', cleaned)

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            is_separator_regex=False,
        )
        return text_splitter.split_documents(self.documents)

    def _remove_special_characters(self, text):
        """Strip layout artefacts from ``text``.

        Newlines become a single space instead of being deleted: deleting them
        glued the last word of a line to the first word of the next one
        ("first line\\nsecond" -> "first linesecond"). Every other matched
        artefact is removed outright, exactly as before.
        """
        return self._SPECIAL_CHARS.sub(
            lambda match: ' ' if match.group() == '\n' else '',
            text,
        )


In [None]:
# Wrap the loaded documents for cleaning and chunking.
chunk_documents = ChunkDocuments(docs)

In [None]:
# Uses the defaults chunk_size=1000 / chunk_overlap=50. This call also
# rewrites page_content of every document in `docs` in place.
chunks = chunk_documents.chunk_documents()

In [None]:
# Character length of the first chunk.
len(chunks[0].page_content)

In [None]:
# Text of the first chunk.
chunks[0].page_content

In [None]:
# Text of the second chunk.
chunks[1].page_content

In [None]:
from langchain.embeddings.base import Embeddings
from sentence_transformers import SentenceTransformer
from typing import List

class SentenceTransformerEmbeddings(Embeddings):
    """Adapter exposing a ``SentenceTransformer`` model through the
    ``Embeddings`` interface (``embed_documents`` / ``embed_query``)."""

    def __init__(self, model_name: str):
        # Model is instantiated once and reused for every call.
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, documents: List[str]) -> List[List[float]]:
        """Embed a batch of texts.

        ``encode`` returns a NumPy array by default; it is converted with
        ``tolist()`` so the result really is the ``List[List[float]]`` the
        signature promises.
        """
        return self.model.encode(documents).tolist()

    def embed_query(self, query: str) -> List[float]:
        """Embed a single query string and return it as a plain list of floats."""
        return self.model.encode([query])[0].tolist()

In [None]:


# Local sentence-transformers model wrapped by SentenceTransformerEmbeddings.
# `embedding` is not referenced again in the visible cells.
model_name = "all-MiniLM-L6-v2"
embedding = SentenceTransformerEmbeddings(model_name)

In [None]:
# Cohere embeddings using the key read from the environment.
# `embeddings_model` is not referenced again in the visible cells.
embeddings_model = CohereEmbeddings(cohere_api_key=COHERE_API_KEY)

In [None]:
# OpenAI embeddings — this is the instance handed to Chroma below.
# NOTE(review): presumably picks up OPENAI_API_KEY from the environment — confirm.
openai_embedding = OpenAIEmbeddings()

In [None]:
# Directory in which the Chroma collection is persisted.
vector_store_db = '../data/chroma_db'

In [None]:


# Embed the cleaned chunks with the OpenAI embeddings and store them in the
# "contract" collection under vector_store_db.
vectorStore = Chroma.from_documents(
    documents=chunks,
    collection_name="contract",
    embedding=openai_embedding,
    persist_directory=vector_store_db)

In [None]:
# This text splitter is used to create the child documents
child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)

# The storage layer for the parent documents
store = InMemoryStore()
# No parent_splitter is passed here.
# NOTE(review): presumably each entry of `docs` is then kept whole as the
# parent document — confirm against the ParentDocumentRetriever docs.
retriever = ParentDocumentRetriever(
    vectorstore=vectorStore,
    docstore=store,
    child_splitter=child_splitter,
)

In [None]:
# Index `docs` through the retriever. This writes into the same `vectorStore`
# that already holds the chunks added by Chroma.from_documents.
retriever.add_documents(docs, ids=None)

In [None]:
# Query the vector store directly (bypassing the retriever).
sub_docs = vectorStore.similarity_search("how much is the retention amount")

In [None]:
# Display the matches returned by the vector store.
sub_docs

In [None]:
# Text of the best match.
print(sub_docs[0].page_content)

In [None]:
# Same question, this time through the parent-document retriever.
retrieved_docs = retriever.get_relevant_documents("How much is the retention amount?")

In [None]:
# Character length of the first retrieved document.
len(retrieved_docs[0].page_content)

In [None]:
# This text splitter is used to create the parent documents
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
# This text splitter is used to create the child documents.
# It must create documents smaller than the parent: the two chunk sizes were
# previously the wrong way round (parent 600 / child 1000), which left every
# parent chunk with a single child identical to itself.
child_splitter = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=20)
# The vectorstore to use to index the child chunks
# NOTE(review): this reopens the same "contract" collection in the same
# persist directory that earlier cells already wrote to, so entries indexed
# there are presumably searched as well — confirm that this is intended.
vectorstore = Chroma(
    collection_name="contract",
    embedding_function=openai_embedding,
    persist_directory=vector_store_db,
)
# The storage layer for the parent documents
store = InMemoryStore()

In [None]:

# Retriever configured with both a parent and a child splitter; rebinds the
# `retriever` name used by the chain defined further down.
retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter,
)
# Index `docs` through this retriever.
retriever.add_documents(docs)

In [None]:
# System prompt. It has two placeholders, {context} and {question}; the
# question is therefore rendered twice (here and in the user message below).
system_template = """You are a legal expert tasked with acting as the best lawyer and contract analyzer. Your task is to thoroughly understand the provided context and answer questions related to legal matters, contracts, and relevant laws. If the necessary information is not present in the context use the given context, then get related contexts and answer the question. If the question cannot be answered, respond with "I don't know.".
If the question can be answered as either yes or no, respond with either "Yes," or "No," and include the explanation in your response. In addition, please include the referenced sections in your response.

You must provide accurate responses based solely on the information provided in the context only. Please use the following context only:

### CONTEXT
{context}

### QUESTION
Question: {question}
"""

# User message: the question wrapped in triple backticks.
user_template = "Question:```{question}```"
# System + human message templates combined into one chat prompt.
messages = [
    SystemMessagePromptTemplate.from_template(system_template),
    HumanMessagePromptTemplate.from_template(user_template)
]
qa_prompt = ChatPromptTemplate.from_messages(messages)

# Chat model used to answer; temperature is set to 0.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)



def format_docs(docs):
    """Return the ``page_content`` of every document, separated by blank lines."""
    contents = [item.page_content for item in docs]
    separator = "\n\n"
    return separator.join(contents)


# Chain: the input string is sent to the retriever (whose documents are joined
# by format_docs) to fill {context}, and passed through unchanged to fill
# {question}; the prompt is then sent to the LLM and parsed to a string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | qa_prompt
    | llm
    | StrOutputParser()
)

In [None]:
# Ask a question through the chain.
rag_chain.invoke("who owns the IPs")

In [None]:

# Make modules in the sibling ../rag directory importable.
import sys
sys.path.append('../rag')

In [None]:
# RAGPipeline is defined outside this notebook (module rag_pipeline).
from rag_pipeline import RAGPipeline

In [None]:
# Built from the uploaded file and the Chroma persist directory.
# NOTE(review): what RAGPipeline does with these arguments is not visible here.
rag = RAGPipeline(uploaded_file,vector_store_db)

In [None]:
# NOTE(review): presumably returns a runnable chain, since .invoke() is
# called on the result below — confirm against rag_pipeline.
chain = rag.pipeline()

In [None]:
# Same question that was asked of rag_chain.
chain.invoke("who owns the IPs")