# Install the required libraries

In [1]:
!pip install -qU langchain_community pypdf
!pip install langchain-groq
!pip install sentence_transformers
!pip install faiss-gpu

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf 24.8.2 requires cubinlinker, which is not installed.
cudf 24.8.2 requires cupy-cuda11x>=12.0.0, which is not installed.
cudf 24.8.2 requires ptxcompiler, which is not installed.
cuml 24.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
dask-cudf 24.8.2 requires cupy-cuda11x>=12.0.0, which is not installed.
cudf 24.8.2 requires cuda-python<12.0a0,>=11.7.1, but you have cuda-python 12.6.0 which is incompatible.
distributed 2024.7.1 requires dask==2024.7.1, but you have dask 2024.8.1 which is incompatible.
google-cloud-bigquery 2.34.4 requires packaging<22.0dev,>=14.3, but you have packaging 24.1 which is incompatible.
jupyterlab 4.2.4 requires jupyter-lsp>=2.0.0, but you have jupyter-lsp 1.5.1 which is incompatible.
jupyterlab-lsp 5.1.0 requires jupyter-lsp>=2.0.0, but you have jupyter-lsp 1.5.1 w

# Set your API keys

In [2]:
import os
from getpass import getpass

# SECURITY: credentials must never be written into the notebook itself; a
# saved or shared notebook exposes them to every reader. Any key that was
# previously committed here should be treated as compromised and revoked.
#
# Each secret is taken from the process environment when already present
# (e.g. injected by the hosting platform's secret store); otherwise the user
# is prompted without echoing the value, so it never lands in the file.

# Optional LangSmith tracing; uncomment to enable.
#os.environ['LANGCHAIN_TRACING_V2'] = 'true'
#os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
os.environ['LANGCHAIN_PROJECT'] = 'advanced-rag'

for _secret_name in ('LANGCHAIN_API_KEY', 'GROQ_API_KEY'):
    # Only prompt when the variable is missing or empty, so re-running the
    # cell does not ask again.
    if not os.environ.get(_secret_name):
        os.environ[_secret_name] = getpass(f"Enter {_secret_name}: ")

# Import the required dependencies

In [3]:
import bs4
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_groq import ChatGroq
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain.load import dumps, loads

## Load the PDF document

In [4]:
# Point the loader at the thesis PDF and read it into memory as a list of
# Document objects (each carries the source path and a page index in its
# metadata).
pdf_path = "/kaggle/input/thesis/Thesis.pdf"
loader = PyPDFLoader(pdf_path)
docs = loader.load()

In [5]:
# Preview only the first two loaded pages; echoing the full list would dump
# the entire PDF text into the cell output and bloat the saved notebook.
docs[:2]

[Document(metadata={'source': '/kaggle/input/thesis/Thesis.pdf', 'page': 0}, page_content='1 \n             Loan -Approval -Prediction using machine learning  \n                                          Final Year Defense Project  \n                                                          Session 2020 -2024  \n \n \n \n \n \nReport submitted in the partial fulfillment  of the requirements for the degree of B.Sc. \nElectrical Engineering  \n \n \nSubmitted by:   \nMr. Syed Umar Raza                             Reg No #20MDELE133  \nMr. Junaid Badshah                             Reg No #20MDELE159  \nMr. Samad Latif                             Reg No #20MDELE177  \n \nSupervised by: Dr. Jawad Ali  \n \n                              Department of Electrical Engineering  \nUniversity of Engineering & Technology, Mardan  \n'),
 Document(metadata={'source': '/kaggle/input/thesis/Thesis.pdf', 'page': 1}, page_content='2                                                           Declaration  \

## Split documents into chunks

In [6]:
# Cut the loaded pages into overlapping chunks so that each piece is small
# enough to embed and retrieve individually; the 200-unit overlap keeps
# text that straddles a chunk boundary present in both neighbours.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
splits = text_splitter.split_documents(docs)

## Embed the documents

In [7]:
# --- Embedding model settings -------------------------------------------
model_name = "BAAI/bge-small-en"
model_kwargs = dict(device="cpu")                # run the encoder on CPU
encode_kwargs = dict(normalize_embeddings=True)  # ask for normalized vectors

hf_embeddings = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

# --- Vector index --------------------------------------------------------
# Embed every chunk and store the vectors in a FAISS index.
vectorstore = FAISS.from_documents(embedding=hf_embeddings, documents=splits)

# Expose the index through the retriever interface used by the chain.
retriever = vectorstore.as_retriever()

  from tqdm.autonotebook import tqdm, trange


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/90.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [8]:
# Prompt: fetch the "rlm/rag-prompt" template from the LangChain Hub.
# NOTE(review): the chain below feeds it a mapping with "context" and
# "question" keys — presumably the variables this template expects; confirm
# against the hub entry.
prompt = hub.pull("rlm/rag-prompt")

  prompt = loads(json.dumps(prompt_object.manifest))


In [9]:
# LLM: Llama 3 (8B parameters) accessed through Groq's chat API.
# temperature=0 is passed, presumably to make answers as deterministic as
# the backend allows.
# NOTE(review): confirm "llama3-8b-8192" is still a model id served by Groq.
llm = ChatGroq(model="llama3-8b-8192", temperature=0)

In [10]:
def format_docs(docs):
    """Merge the text of several documents into a single context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The ``page_content`` values in their original order, separated by
        one blank line. An empty iterable yields an empty string.
    """
    blank_line = "\n\n"
    page_texts = [document.page_content for document in docs]
    return blank_line.join(page_texts)

In [11]:
# RAG chain, assembled in two steps.
# Step 1 - input mapping: the incoming question is sent to the retriever and
# the retrieved documents are flattened by format_docs into "context", while
# the question itself is forwarded unchanged under "question".
chain_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}

# Step 2 - fill the prompt, call the model, and reduce the reply to a string.
rag_chain = chain_inputs | prompt | llm | StrOutputParser()

In [12]:
# Ask a question that the loaded PDF should be able to answer.
supervisor_answer = rag_chain.invoke("who is the supervisor of  junaid badshah?")
print(supervisor_answer)

Dr. Jawad Ali is the supervisor of Junaid Badshah.


In [13]:
# Second query: list the project team members named in the document.
team_answer = rag_chain.invoke("enlist me the project mate member")
print(team_answer)

Here are the project team members:

* Syed Umar Raza
* Junaid Badshah
* Samad Latif
