<a href="https://colab.research.google.com/github/chakrateja70/RAG-AGENT/blob/main/Agent.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install langchain_openai langchain langchain_chroma docx2txt langchain_community langchain_text_splitters pypdf weaviate-client

In [None]:
# Show which LangChain version is installed in this runtime.
import langchain
print(langchain.__version__)

**Required API Keys**


In [None]:
import os
from getpass import getpass

# Tracing configuration: enable LangChain tracing and point it at the LangSmith endpoint.
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'

# Never hard-code secrets in the notebook. Keys that are already present in the
# environment are reused; missing ones are requested interactively without echo,
# so they do not end up in the saved notebook or in version control.
for _key_name in ('LANGCHAIN_API_KEY', 'OPENAI_API_KEY'):
    if not os.environ.get(_key_name):
        os.environ[_key_name] = getpass(f"Enter {_key_name}: ")


# *Loading PDF and DOCX documents*

In [None]:
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from typing import List
from langchain_core.documents import Document
import os

def load_documents(folder_path: str) -> List[Document]:
    """Load every PDF and DOCX file found directly inside ``folder_path``.

    Entries are visited in sorted name order so the result is deterministic.
    Extensions are matched case-insensitively (``report.PDF`` is loaded).
    Anything that is not a regular file (e.g. a sub-directory) is skipped
    silently; regular files with another extension are reported on stdout
    and ignored.

    Args:
        folder_path: Directory to scan. It is not searched recursively.

    Returns:
        The concatenation of what each loader's ``load()`` returns, in
        sorted file-name order. Empty when nothing could be loaded.

    Raises:
        OSError: Propagated from ``os.listdir`` (for example when the
            folder does not exist).
    """
    pages = []
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)
        # A directory named e.g. "papers.pdf" must never be handed to a loader.
        if not os.path.isfile(file_path):
            continue
        lowered = filename.lower()
        if lowered.endswith('.pdf'):
            loader = PyPDFLoader(file_path)
        elif lowered.endswith('.docx'):
            loader = Docx2txtLoader(file_path)
        else:
            print(f"Unsupported file type: {filename}")
            continue
        pages.extend(loader.load())
    return pages

# Folder scanned for input files; presumably the Colab working directory — adjust when running elsewhere.
folder_path = "/content/"
# `pages` is reused by the splitting cell further down.
pages = load_documents(folder_path)
# The count is the number of loader entries returned, which may differ from the number of files.
print(f"Loaded {len(pages)} documents from the folder.")


In [None]:
# from langchain_community.document_loaders import PyPDFLoader

# loader = PyPDFLoader("/content/docs/NLP.pdf")
# pages = []
# async for page in loader.alazy_load():
#     pages.append(page)



In [None]:
# Echo the loaded entries as the cell output for a quick visual check.
pages

# *Splitting the text of the documents loaded above*

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    # Chunks of up to 3500 characters (length measured with len), with 100 characters of overlap.
    chunk_size=3500,
    chunk_overlap=100,
    length_function=len,
    is_separator_regex=False,
)
# NOTE(review): the embedding model used later (all-MiniLM-L6-v2) presumably truncates long
# inputs — confirm that 3500-character chunks are fully represented in the embeddings.
splits = text_splitter.split_documents(pages)
print(f"split the document into  {len(splits)} chunks.")

In [None]:
# Peek at the text of the third chunk (index 2); this needs at least three chunks to exist.
print(1, splits[2].page_content)

# *Embedding the document chunks*

In [None]:
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings

# NOTE(review): this class is presumably deprecated in newer langchain_community releases
# in favour of the langchain_huggingface package — confirm before upgrading dependencies.
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
# Embed the text of every chunk in `splits`.
document_embeddings = embedding_function.embed_documents([split.page_content for split in splits])


In [None]:
# Echo the embeddings computed above as the cell output.
document_embeddings

# *Storing data in Chroma*

In [None]:
from langchain_chroma import Chroma
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings

# Define embedding model (same model name as the embedding cell earlier in the notebook)
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# Define Chroma vector database from the chunks in `splits`
# NOTE(review): re-running this cell presumably adds the same chunks to the persisted
# collection again — verify, and clear ./chroma.db before re-running if so.
vectordb = Chroma.from_documents(
    documents=splits,  # Ensure `splits` contains the list of document chunks
    embedding=embedding_function,
    persist_directory="./chroma.db"  # Specify directory to persist the database
)

print("Vector store created and persisted to './chroma.db'")


# *Performing Similarity Search*

In [None]:

query = "what is oops?"
top_k = 5  # number of chunks requested from the vector store
search_results = vectordb.similarity_search(query, k=top_k)
# Report the number of chunks actually returned instead of a hard-coded figure,
# so the heading always matches the results printed below it.
print(f"\nTop {len(search_results)} most relevant chunks for the query: '{query}'\n")
for i, result in enumerate(search_results, 1):
    print(f"Result {i}:")
    # Fall back to 'Unknown' when the chunk's metadata has no 'source' entry.
    print(f"Source: {result.metadata.get('source', 'Unknown')}")
    print(f"Content: {result.page_content}")
    print()


# *Retrieval of relevant data*

In [None]:
# Wrap the vector store as a retriever configured with k=2 (two chunks requested per query).
retriever = vectordb.as_retriever(search_kwargs={"k": 2})
# Run one sample query through the retriever and print the raw result.
retriever_results = retriever.invoke("what is nxtwave?")
print(retriever_results)
