# Simple RAG Pipeline

In [None]:
## Import getpass to create a prompt for adding env variables

import getpass
import os
from dotenv import load_dotenv
load_dotenv()

## Loaders

### Text Loader

In [None]:
## Load the Maharana Pratap text file with LangChain's TextLoader.
from langchain_community.document_loaders import TextLoader

# Forward slashes resolve on Windows, Linux, and macOS alike. The original
# 'docs\M...' only worked because "\M" is not a recognised escape sequence
# (newer Python versions warn about that) and only on Windows.
loader = TextLoader('docs/Maharana_PratapJi.txt')

text = loader.load()

# Bare expression so the notebook displays what was loaded.
text

In [None]:
# Show the raw text content of the first loaded document.
text[0].page_content

### PDF  Loader 

In [None]:
from langchain_community.document_loaders import PyPDFLoader

# Use a forward slash: in the original 'docs\attention.pdf' the "\a" was
# parsed as the ASCII bell escape (0x07), so the loader received a path that
# does not name the intended file.
pdfloader = PyPDFLoader('docs/attention.pdf')

pdf_text = pdfloader.load()

# Number of documents the loader returned (presumably one per PDF page —
# confirm against the loader's configuration).
len(pdf_text)

In [None]:
# Serialize the third loaded PDF document (index 2) via its .json() method.
# NOTE(review): assumes the loader returned at least three documents — confirm.
pdf_text[2].json()

## Vector Store and Retrieval

In [None]:
## Break the loaded documents into small, overlapping chunks, because an LLM
## can only take a limited amount of context at once.

from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=50,
)

# Optional: chunk the TXT documents as well.
# chunk_text = text_splitter.split_documents(text)
# chunk_text[:5]

# Chunk the PDF documents and preview the first five chunks.
chunk_pdf = text_splitter.split_documents(pdf_text)
chunk_pdf[:5]

In [None]:
# Display every PDF chunk produced by the splitter.
chunk_pdf

### Adding Cohere Embedder

In [8]:
## The chunks have to be turned into vector embeddings before they can be
## inserted into ChromaDB, so make sure the Cohere API key is available first.

# Prompt only when the variable is missing or empty. An explicit prompt
# replaces getpass's generic default ("Password: ") so the user knows which
# secret is being requested.
if not os.getenv("COHERE_API_KEY"):
    os.environ["COHERE_API_KEY"] = getpass.getpass("Enter your Cohere API key: ")

In [None]:
from langchain_cohere import CohereEmbeddings

# Embedding client configured for Cohere's "embed-english-v3.0" model; it is
# passed to the vector store as its embedding function.
embeddings = CohereEmbeddings(model="embed-english-v3.0")

In [None]:
# NOTE(review): `xyz` is not referenced by any other visible cell; this looks
# like a leftover scratch import that would fail if no such module is
# installed — confirm and remove.
import xyz

### ChromaDB

In [None]:
from langchain_chroma import Chroma

# Create the (initially empty) vector store. Chroma's first positional
# parameter is the collection name, so the document chunks must not be passed
# here; they are inserted separately with add_documents().
vector_store = Chroma(
    collection_name="simple_rag_pipeline",
    embedding_function=embeddings,
)

In [None]:
from uuid import uuid4

# One random UUID string per PDF chunk. This assignment used to be commented
# out, which left `uuids` undefined and made the add_documents() call below
# raise NameError.
uuids = [str(uuid4()) for _ in chunk_pdf]

# Optional: index the TXT chunks too (requires chunk_text to be created in
# the splitter cell first).
# text_uuids = [str(uuid4()) for _ in chunk_text]
# vector_store.add_documents(documents=chunk_text, ids=text_uuids)
vector_store.add_documents(documents=chunk_pdf, ids=uuids)

### Retrieval

In [None]:
# Retrieve the two chunks most similar to the question, then print each
# chunk's text followed by its metadata.
# NOTE(review): in the visible cells only chunk_pdf is added to the store, so
# a question about the TXT file may not find relevant chunks — confirm.
question = "What was the last war of Maharana Pratap?"
results = vector_store.similarity_search(question, k=2)
for res in results:
    print(f"* {res.page_content} [{res.metadata}]")

In [None]:
# Run a second similarity search (top two matches) and print each hit's text
# together with its metadata.
question = "What Fixed Income Instruments?"
results = vector_store.similarity_search(question, k=2)
for res in results:
    print(f"* {res.page_content} [{res.metadata}]")