In [3]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load the PDF
loader = PyPDFLoader(file_path="DeepSeek_R1.pdf", extract_images=True)
docs = loader.load()  # Ensure the PDF is actually loaded

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)

documents = text_splitter.split_documents(docs)
print(len(documents))


71


In [5]:

from langchain_community.vectorstores import FAISS
from langchain_ollama import OllamaEmbeddings

embed = OllamaEmbeddings(
    model="llama3"
)



# Create FAISS vector store
db = FAISS.from_documents(documents, embedding=embed)


In [6]:
db

<langchain_community.vectorstores.faiss.FAISS at 0x14ed66850>

In [12]:
query = "What is Deepseek about ?"
result = db.similarity_search(query)
result



[Document(id='1c8aee45-2531-40b0-a4a0-348778a24b73', metadata={'source': 'DeepSeek_R1.pdf', 'page': 17, 'page_label': '18'}, page_content='H. Lightman, V . Kosaraju, Y. Burda, H. Edwards, B. Baker, T. Lee, J. Leike, J. Schulman,\nI. Sutskever, and K. Cobbe. Let’s verify step by step. arXiv preprint arXiv:2305.20050, 2023.\nB. Y. Lin. ZeroEval: A Unified Framework for Evaluating Language Models, July 2024. URL\nhttps://github.com/WildEval/ZeroEval.\nMAA. American invitational mathematics examination - aime. In American Invitational\nMathematics Examination - AIME 2024, February 2024. URL https://maa.org/math\n-competitions/american-invitational-mathematics-examination-aime .\nOpenAI. Hello GPT-4o, 2024a. URL https://openai.com/index/hello-gpt-4o/.\nOpenAI. Learning to reason with llms, 2024b. URL https://openai.com/index/learnin\ng-to-reason-with-llms/ .\nOpenAI. Introducing SimpleQA, 2024c. URL https://openai.com/index/introducing\n-simpleqa/.\nOpenAI. Introducing SWE-bench verified we

In [8]:
from langchain_ollama.llms import OllamaLLM
llm = OllamaLLM(
    model="llama3"
)

In [9]:
from langchain_core.prompts import ChatPromptTemplate  # type: ignore

prompt = ChatPromptTemplate.from_template("""
Answer the following question based on the provided context.
Think step by step before providing a detailed answer.
I will tip you $1000 if the user finds it useful.
<context>
{context}
</context>
Question: {input}
""")


In [10]:
from langchain.chains.combine_documents import create_stuff_documents_chain

document_chain = create_stuff_documents_chain(llm,prompt)

In [14]:
retriever = db.as_retriever()
retriever

VectorStoreRetriever(tags=['FAISS', 'OllamaEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x14ed66850>, search_kwargs={})

In [15]:
from langchain.chains import  create_retrieval_chain
reterival_chain = create_retrieval_chain(retriever , document_chain)

In [19]:
reterival_chain.invoke({"input":""" Why is deepseek better than gpt
 """})

{'input': ' Why is deepseek better than gpt\n ',
 'context': [Document(id='38f543bd-61d4-4fb3-8c64-3b0a2f9d176b', metadata={'source': 'DeepSeek_R1.pdf', 'page': 17, 'page_label': '18'}, page_content='S. Krishna, K. Krishna, A. Mohananey, S. Schwarcz, A. Stambler, S. Upadhyay, and M. Faruqui.\nFact, fetch, and reason: A unified evaluation of retrieval-augmented generation. CoRR,\nabs/2409.12941, 2024. doi: 10.48550/ARXIV.2409.12941. URL https://doi.org/10.485\n50/arXiv.2409.12941.\nA. Kumar, V . Zhuang, R. Agarwal, Y. Su, J. D. Co-Reyes, A. Singh, K. Baumli, S. Iqbal, C. Bishop,\nR. Roelofs, et al. Training language models to self-correct via reinforcement learning. arXiv\npreprint arXiv:2409.12917, 2024.\nH. Li, Y. Zhang, F. Koto, Y. Yang, H. Zhao, Y. Gong, N. Duan, and T. Baldwin. CMMLU: Measur-\ning massive multitask language understanding in Chinese. arXiv preprint arXiv:2306.09212,\n2023.\nT. Li, W.-L. Chiang, E. Frick, L. Dunlap, T. Wu, B. Zhu, J. E. Gonzalez, and I. Stoica. From\