# Installing Dependencies

In [None]:
!pip install transformers==4.46.0
!pip install --quiet langchain==0.3.13 langchain-community==0.3.13 langchain-chroma
!pip install pypdf
!pip install sentence-transformers
!pip install -qU langchain-groq
!pip install langchain_community
!pip install streamlit
!pip install chromadb
!pip install tokenizers==0.20.3
!pip install sentence-transformers

# Retrieving API Keys

In [9]:
import getpass
import os

# Set the LANGCHAIN_TRACING_V2 flag that LangChain's tracing integration reads.
os.environ["LANGCHAIN_TRACING_V2"] = "true"

# Ask for the key only when it is not already present in the environment
# (so re-running the cell does not prompt again), and label the prompt so
# it is clear which secret is being requested.
if not os.environ.get("LANGCHAIN_API_KEY"):
    os.environ["LANGCHAIN_API_KEY"] = getpass.getpass("LangChain API key: ")


··········


In [10]:
from langchain_groq import ChatGroq

# Prompt for the Groq key only if it is missing from the environment, with a
# labelled prompt so the user knows which secret to paste.
if not os.environ.get("GROQ_API_KEY"):
    os.environ["GROQ_API_KEY"] = getpass.getpass("Groq API key: ")

# Chat model used by the RAG chain further down; no key is passed explicitly,
# so the GROQ_API_KEY environment variable set above is what authenticates it.
# NOTE(review): hosted model ids get retired over time; confirm this one is
# still served before relying on it.
llm = ChatGroq(model="llama3-8b-8192")


··········


In [4]:
from dotenv import load_dotenv

# Pull variables from a .env file (if one is found) into the process
# environment, then mirror the two API keys into module-level names.
load_dotenv()
GROQ_API_KEY = os.environ.get('GROQ_API_KEY')
LANGCHAIN_API_KEY = os.environ.get('LANGCHAIN_API_KEY')

# Handling Imports

In [5]:
from langchain import hub
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.output_parsers import PydanticOutputParser
from langchain.schema.runnable import RunnablePassthrough

# Embedding Document & Storing In Vector DB

In [7]:
# Ingestion settings, gathered here so they can be changed in one place.
PDF_PATH = "/content/final.pdf"
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50
EMBEDDING_MODEL = 'sentence-transformers/paraphrase-MiniLM-L6-v2'

# Load the PDF; `pages` holds whatever documents the loader returns.
document = PyPDFLoader(PDF_PATH)
pages = document.load()

# Split the loaded pages into overlapping chunks for embedding.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
chunks = text_splitter.split_documents(pages)

# Fail early with a readable message instead of an opaque vector-store error
# when the PDF produced nothing to index (e.g. no extractable text).
if not chunks:
    raise ValueError(f"No text chunks were produced from {PDF_PATH!r}; nothing to index.")

hf_embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)

# Stable per-chunk ids. Without explicit ids the store assigns fresh ones on
# every call, so re-running this cell in the same kernel would index every
# chunk a second time and retrieval would return duplicates.
# NOTE(review): relies on Chroma overwriting entries whose ids already exist;
# confirm against the installed langchain/chromadb versions.
chunk_ids = [f"{PDF_PATH}:{position}" for position in range(len(chunks))]

vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=hf_embeddings,
    ids=chunk_ids,
)

  hf_embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/paraphrase-MiniLM-L6-v2')
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

# Retrieval

In [13]:
# Question that will be put to the RAG chain.
query = 'what is the document about?'

# Retriever view over the vector store, using its default search settings.
retriever = vectorstore.as_retriever()

# Prompt template fetched by name from the LangChain Hub.
prompt = hub.pull("rlm/rag-prompt")

def format_docs(docs):
    """Merge documents into a single context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The ``page_content`` values in iteration order, separated by a blank
        line; an empty string when ``docs`` is empty.
    """
    separator = "\n\n"
    texts = [item.page_content for item in docs]
    return separator.join(texts)

# RAG pipeline: the incoming question is sent to the retriever, whose hits are
# flattened into one string by format_docs ("context"), and is also passed
# through unchanged ("question"); the filled prompt then goes to the chat model.
# Piping the retriever replaces the deprecated get_relevant_documents() call,
# and the trailing no-op RunnablePassthrough() stage has been dropped.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
)

# The chain's result exposes the generated text on `.content`.
output = rag_chain.invoke(query).content
print(output)

The document appears to be a project proposal or design document for a Prison Management System (PMS). It describes the roles of different technologies, including Python as the backend, MySQL as the database, and hashlib, in building this system.
