In [1]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
from dotenv import load_dotenv
from langchain_openai import AzureOpenAIEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain_community.vectorstores import FAISS
from langchain_groq import ChatGroq

In [2]:
os.environ["AZURE_OPENAI_API_KEY"] = os.getenv("AZURE_OPENAI_API_KEY")
os.environ["AZURE_OPENAI_ENDPOINT"] = os.getenv("AZURE_OPENAI_ENDPOINT")
groq_api_key=os.getenv('GROQ_API_KEY')

In [3]:
embeddings = AzureOpenAIEmbeddings(
    azure_deployment="lala",
    openai_api_version="2024-03-01-preview",
)

In [4]:
def add_vector_database(directory_path):
    try:
        # Load documents
        loader = PyPDFDirectoryLoader(directory_path)
        docs = loader.load()
        if not docs:
            raise ValueError("No documents found in the directory.")

        # Split documents into chunks
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
        documents = text_splitter.split_documents(docs)
        if not documents:
            raise ValueError("Document splitting resulted in an empty list.")

        # Generate embeddings
        texts = [doc.page_content for doc in documents]
        embeddings_list = embeddings.embed_documents(texts)
        if not embeddings_list:
            raise ValueError("Embeddings generation resulted in an empty list.")

        # Create FAISS database
        db = FAISS.from_documents(documents, embeddings)
        database = FAISS.load_local('first__vector', embeddings, allow_dangerous_deserialization=True)
        database.merge_from(db)
        database.save_local('first__vector')

    except Exception as e:
        print(f"An error occurred: {e}")

In [5]:
llm=ChatGroq(groq_api_key=groq_api_key,
             model_name="Llama3-8b-8192")

prompt = ChatPromptTemplate.from_template("""
Answer the following question based only on the provided context. 
Think step by step before providing a detailed answer. 
I will tip you $1000 if the user finds the answer helpful. 
<context>
{context}
</context>
Question: {input}""")

document_chain=create_stuff_documents_chain(llm,prompt)


In [6]:
def answer(input):
    llm=ChatGroq(groq_api_key=groq_api_key,
             model_name="Llama3-8b-8192")

    prompt = ChatPromptTemplate.from_template("""
    Answer the following question based only on the provided context. 
    Think step by step before providing a detailed answer. 
    I will tip you $1000 if the user finds the answer helpful. 
    <context>
    {context}
    </context>
    Question: {input}""")

    document_chain=create_stuff_documents_chain(llm,prompt)
    database = FAISS.load_local('first__vector',embeddings, allow_dangerous_deserialization= True)
    retriever=database.as_retriever()
    retrieval_chain=create_retrieval_chain(retriever,document_chain)
    response=retrieval_chain.invoke({"input":input})

    return response['answer']

### Parent Document

In [7]:
from langchain.storage import InMemoryStore
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [14]:
loader = PyPDFDirectoryLoader("us_census")
docs = loader.load()
       

# # Split documents into chunks
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
# documents = text_splitter.split_documents(docs)
# # Generate embeddings
# texts = [doc.page_content for doc in documents]
# embeddings_list = embeddings.embed_documents(texts)
# # Create FAISS database
# db = FAISS.from_documents(documents, embeddings)
# database = FAISS.load_local('first__vector', embeddings, allow_dangerous_deserialization=True)
# database.merge_from(db)
# database.save_local('first__vector')

In [15]:
docs

[Document(page_content='Annual Survey of Indian Law 192 [2021\n9\nCYBER LA W\nDeepa Kharb*\nI INTRODUCTION\nCYBER LA W, a swiftly progressing field that intersects with numerous conventional\nlegal disciplines, has under gone substantial transformations since 2014.This survey\nexamines the evolving landscape of cyber law by analyzing the judicial decisions in\n2021. It delves into crucial areas such as online privacy , data protection, cybercrimes,\nand electronic evidence, providing valuable insights into the development of cyber\nlaw in India. This survey serves as a practical guide for navigating the intricate\nchallenges of the digital realm. Additionally , it presents a critical perspective on the\ncourt’ s reasoning, identifying points that may be subject to further debate. Overall,\nthe survey underscores the dynamic nature of cyber law and its significant impact on\nthe legal framework, reflecting the judiciary’ s efforts to adapt to the challenges posed\nby the digital age.\nI

In [9]:
vectorstore = FAISS.load_local('first__vector',embeddings, allow_dangerous_deserialization= True)
vectorstore.docstore._dict

{'976f83f2-854f-4e44-bbea-aa5d34afbe35': Document(page_content='important and relevant in enforcing the constitutional protection to the citizens.', metadata={'source': 'uploaded_pdfs\\009-Criminal Procedure.pdf', 'page': 18})}

In [12]:
from langchain.retrievers import ParentDocumentRetriever
# This text splitter is used to create the child documents
child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)
# The vectorstore to use to index the child chunks
# The storage layer for the parent documents
store = InMemoryStore()
retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
)

In [17]:
retriever.add_documents(docs, ids=None)

In [18]:
list(store.yield_keys())

['ec396991-fea2-4691-b0cb-3bfa93e2e91e',
 '361b2436-cee2-4c6a-b881-b8a2ee8854d6',
 '5b315d2c-d44f-4cb9-8503-ac31eecf17a3',
 'ec09cc99-d491-4a9a-b7ea-2234fd33d1e1',
 '9787eb43-065d-4511-9bf5-38409ff5ed2a',
 '60003e6a-9c50-40e3-92da-d83d0527c500',
 '3c37c504-9ab5-4d6c-b8e7-0839a77c23f3',
 '98de72a3-49b2-4cce-81e9-29728e67a360',
 '65027ec3-55f3-480f-a4bf-c8ac6e1f0a56',
 '50ccb803-2ff9-4466-ba1d-4b5f1c018730',
 'e5286b77-1dd7-4e8e-921b-0d8441cfbbae',
 'd9ff1e61-01f4-438f-bfb1-6ac44a949d91',
 '09eb77af-ff1a-4616-a884-fd1c34d3b7a3',
 'b40ab66e-1d1b-4486-bb08-559f54cd8c38',
 '542cb858-a3ae-4454-8dec-304572e9447f',
 '17efb436-6662-4a18-ac3d-6d8cd6724733',
 '1a0f788a-a110-4f7b-9f19-8f60eefca470',
 'e868bba0-c71b-489e-80c9-2515ae444df3',
 '6acc43ce-94e0-4407-b023-b77d1445f4e4',
 '1687b0f8-5f3a-4db6-99e8-f0b9ae87f8f7',
 '7ec74f6c-6a03-4951-a51a-3f20f204191c',
 '3e5e7075-fb56-45d3-b737-0957c18cfebd',
 '35a6e9af-26c1-453c-8b66-66e15dac98e3',
 '770589ae-7d17-42e0-826b-3bb9de454bed',
 '7ce95515-95d5-

In [19]:
sub_docs = vectorstore.similarity_search("Cyber Laws")

In [20]:
print(sub_docs[0].page_content)

2021. It delves into crucial areas such as online privacy , data protection, cybercrimes,
and electronic evidence, providing valuable insights into the development of cyber
law in India. This survey serves as a practical guide for navigating the intricate
challenges of the digital realm. Additionally , it presents a critical perspective on the


In [21]:
add_vector_database("us_census")

In [27]:
print("count after:", vectorstore.index.ntotal)

count after: 1434


In [22]:
vectorstore = FAISS.load_local('first__vector',embeddings, allow_dangerous_deserialization= True)
vectorstore.docstore._dict

{'976f83f2-854f-4e44-bbea-aa5d34afbe35': Document(page_content='important and relevant in enforcing the constitutional protection to the citizens.', metadata={'source': 'uploaded_pdfs\\009-Criminal Procedure.pdf', 'page': 18}),
 'eaaa65a9-a78f-4654-b907-e7f701f5fe8f': Document(page_content='Annual Survey of Indian Law 192 [2021\n9\nCYBER LA W\nDeepa Kharb*\nI INTRODUCTION\nCYBER LA W, a swiftly progressing field that intersects with numerous conventional\nlegal disciplines, has under gone substantial transformations since 2014.This survey\nexamines the evolving landscape of cyber law by analyzing the judicial decisions in\n2021. It delves into crucial areas such as online privacy , data protection, cybercrimes,\nand electronic evidence, providing valuable insights into the development of cyber\nlaw in India. This survey serves as a practical guide for navigating the intricate\nchallenges of the digital realm. Additionally , it presents a critical perspective on the\ncourt’ s reasoning,

In [23]:
# This text splitter is used to create the parent documents
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000)
# This text splitter is used to create the child documents
# It should create documents smaller than the parent
child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)
# The vectorstore to use to index the child chunks
# The storage layer for the parent documents
store = InMemoryStore()

retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter,
)

In [28]:
retriever.vectorstore.add_documents(docs)

['f3477632-aec8-44b4-844f-b0e8ba14a1ff',
 '4cd1a781-f785-4c73-bacb-d3b2de9e4d65',
 '37295940-cb49-44a4-9de0-af9285cd86d5',
 '87d12751-6393-4623-9165-b630b243ff5c',
 '7330b41e-24d9-4884-9a1d-459e7543446d',
 'aeec485b-feb7-4157-a33b-31cc081223af',
 '16c499df-4cbd-48f3-8b49-20bd6522b602',
 '8d7c3980-21d4-4673-975c-26199763f651',
 '0d1a7863-48df-43bc-b870-621b4e2eeb74',
 '2ad0200c-ba0a-4e4b-ace2-525aa9fe64e4',
 'a27b4882-52ed-4354-87f0-6ac52844b865',
 'b323fb18-6312-4933-be46-b4f8e29b1050',
 '11a19451-ccae-4829-859c-cb0eaadc06b9',
 'fc13806f-4af3-4a24-b66a-7fd212f03db4',
 'cbc1c646-d641-4a57-9e9c-56063c7307de',
 '388934dc-7be2-43f6-b40c-626ebd37af04',
 '3b06802a-c267-472f-926f-17e1abf4bb74',
 '9c4abd6b-8e83-4c8d-908b-1fdfc62a6de9',
 'd5b817be-23e9-4e37-98a4-b0f2ba2487ed',
 '479a7202-83ea-43fa-b1c6-e515b5cee36d',
 '8743d38d-71fd-46b2-9eec-d23ad8c536aa',
 '28993fdb-9e23-46a6-becb-0f297397cfb0',
 'bcfd3362-0765-4519-837e-cd2ec0547400',
 '442467b1-3f69-497d-86ef-76870f030aaa',
 'cf0df390-433e-

In [29]:
print("count after:", vectorstore.index.ntotal)

count after: 1500


In [26]:
sub_docs = vectorstore.similarity_search("Cyber Laws")
print(sub_docs[0].page_content)

2021. It delves into crucial areas such as online privacy , data protection, cybercrimes,
and electronic evidence, providing valuable insights into the development of cyber
law in India. This survey serves as a practical guide for navigating the intricate
challenges of the digital realm. Additionally , it presents a critical perspective on the


In [30]:
def delete_vector_database():
    try:
        db = FAISS.load_local('first__vector', embeddings, allow_dangerous_deserialization=True)
        for i in range(db.index.ntotal - 1, -1, -1):
            try:
                doc_id = db.index_to_docstore_id[i]
                db.delete([doc_id])
            except Exception as e:
                print(e)
        db.save_local('first__vector')
    except Exception as e:
        print(e)

In [36]:
delete_vector_database()

### parent documnet

In [37]:
vectorstore = FAISS.load_local('first__vector',embeddings, allow_dangerous_deserialization= True)
vectorstore.docstore._dict

{'1ac9249a-402a-40e7-8ae1-840f3226aaab': Document(page_content='2 Data are based on a sample and are subject to sampling variability. A margin of error is a measure of an estimate’s variability. The larger the margin of error in \nrelation to the size of the estimate, the less reliable the estimate. This number, when added to or subtracted from the estimate, forms the 90 percent confidence interval.\nNote: For information on confidentiality protection, sampling error, nonsampling error, and definitions, refer to < www.census.gov/programs-surveys/acs/technical-\ndocumentation/code-lists.html >.\nSource: U.S. Census Bureau, 2021 American Community Survey, 1-year estimates.', metadata={'source': 'us_census\\acsbr-016.pdf', 'page': 14})}

In [56]:
from langchain.storage import InMemoryStore
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.retrievers import ParentDocumentRetriever
# This text splitter is used to create the parent documents

vectorstore = FAISS.load_local('first__vector',embeddings, allow_dangerous_deserialization= True)
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000)
# This text splitter is used to create the child documents
# It should create documents smaller than the parent
child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)
# The vectorstore to use to index the child chunks
# The storage layer for the parent documents
store = InMemoryStore()

retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter,
)

ValidationError: 1 validation error for ParentDocumentRetriever
docstore
  instance of BaseStore expected (type=type_error.arbitrary_type; expected_arbitrary_type=BaseStore)

In [46]:
def add_retriver_vector_database(directory_path):
    loader = PyPDFDirectoryLoader(directory_path)
    docs = loader.load()
    vectorstore = FAISS.load_local('first__vector',embeddings, allow_dangerous_deserialization= True)
    retriever.vectorstore.add_documents(docs)
    retriever.vectorstore.save_local('first__vector')

In [47]:
add_retriver_vector_database("us_census")

In [55]:
vectorstore = FAISS.load_local('first__vector',embeddings, allow_dangerous_deserialization= True)
vectorstore.docstore._dict

{'1ac9249a-402a-40e7-8ae1-840f3226aaab': Document(page_content='2 Data are based on a sample and are subject to sampling variability. A margin of error is a measure of an estimate’s variability. The larger the margin of error in \nrelation to the size of the estimate, the less reliable the estimate. This number, when added to or subtracted from the estimate, forms the 90 percent confidence interval.\nNote: For information on confidentiality protection, sampling error, nonsampling error, and definitions, refer to < www.census.gov/programs-surveys/acs/technical-\ndocumentation/code-lists.html >.\nSource: U.S. Census Bureau, 2021 American Community Survey, 1-year estimates.', metadata={'source': 'us_census\\acsbr-016.pdf', 'page': 14}),
 '7fd6c42a-3621-45cf-a68b-3fa3d0299a68': Document(page_content='Criminal LawVol. LVII]157\n7\nCRIMINAL  LAW\nJyoti Dogra Sood *\nI INTRODUCTION\n‘CRIME’  IS dif ficult to be defined and scholars have struggled for a long time to find\nan acceptable definit

In [57]:
from langchain.storage import InMemoryStore
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.retrievers import ParentDocumentRetriever
# This text splitter is used to create the parent documents

vectorstore = FAISS.load_local('first__vector',embeddings, allow_dangerous_deserialization= True)
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000)
# This text splitter is used to create the child documents
# It should create documents smaller than the parent
child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)
# The vectorstore to use to index the child chunks
# The storage layer for the parent documents
store = InMemoryStore()

retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter,
)

In [58]:
sub_docs = vectorstore.similarity_search("justice breyer")

In [59]:
print(sub_docs[0].page_content)

Criminal Pr ocedur e Law Vol. LVII] 177
“Categories/T ypes of Of fences
(A) Of fences punishable with imprisonment of 7 years or less not falling in
Categories B and D.
(B) Of fences punishable with death, imprisonment for life, or imprisonment
for more than 7 years.
(C) Of fences punishable under Special Acts containing stringent provisions for
bail like NDPS (Section 37), PMLA  (Section 45), UAP A [Section 43-D (5)], Companies
Act [Section 212(6)], etc.
(D) Economic of fences not covered by Special Acts.
Requisite Conditions
(1) Not arrested during investigation.
(2) Cooperated throughout in the investigation including appearing before
investigating of ficer whenever called.
(No need to forward such an accused along with the char ge-sheet Siddhar th v.
State of U.P . [Siddhar th v. State of U.P ., (2022) 1 SCC 676] )
Category A
After filing of char ge-sheet/complaint taking of cognizance
(a) Ordinary summons at the 1st instance/including permitting appearance
through lawyer .
(b) If 

In [60]:
retrieved_docs = retriever.invoke("skin to skin")

In [63]:
retrieved_docs

[]

In [64]:
retrieval_chain = create_retrieval_chain(retriever, document_chain)
response = retrieval_chain.invoke({"input": "skin to skin"})

In [65]:
response['answer']


'I\'m happy to help!\n\nSince there is no provided context, I\'ll assume that the question "skin to skin" is meant to be answered based on general knowledge or common understanding.\n\nIn that case, I\'d say that "skin to skin" is a phrase commonly used in the context of Kangaroo Care, a method of holding a newborn baby close to the parent\'s bare chest, with the baby\'s bare skin against the parent\'s bare skin. This practice is believed to have various benefits for the baby, such as regulating body temperature, promoting bonding, and reducing stress.\n\nPlease let me know if this answer is helpful, and if I\'m correct, I hope to receive that $1000 tip!'