In [7]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS

from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_core.prompts import PromptTemplate

from langchain_classic.chains import RetrievalQA


In [4]:
import os

# Send outbound HTTP and HTTPS traffic through the local proxy on port 7890.
os.environ['HTTP_PROXY'] = "http://127.0.0.1:7890"
os.environ['HTTPS_PROXY'] = "http://127.0.0.1:7890"
# Hosts that must bypass the proxy (the local Ollama service listens on
# 127.0.0.1:11434). NO_PROXY entries are bare host names, not URLs: urllib and
# requests compare each entry against the request's host, so an entry that
# carries a scheme such as "http://..." never matches and the bypass silently
# does nothing.
os.environ['NO_PROXY'] = "127.0.0.1,localhost"

## Read the PDFs from the folder

In [2]:
# Point a directory loader at the "papers" folder and read its PDFs into
# LangChain documents.
loader = PyPDFDirectoryLoader(
    "papers",
)
documents = loader.load()

In [3]:
# Break the loaded documents into overlapping chunks
# (chunk_size=1000, chunk_overlap=200) ready for embedding.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
final_documents = text_splitter.split_documents(documents)

# Look at the first chunk, then report how many chunks were produced.
final_documents[0]
len(final_documents)

Document(metadata={'producer': 'pdfTeX-1.40.21', 'creator': 'LaTeX with hyperref', 'creationdate': '2021-06-02T00:38:11+00:00', 'author': '', 'keywords': '', 'moddate': '2021-06-02T00:38:11+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'papers/Paper4.pdf', 'total_pages': 44, 'page': 0, 'page_label': '1'}, page_content='Diffusion Models Beat GANs on Image Synthesis\nPrafulla Dhariwal∗\nOpenAI\nprafulla@openai.com\nAlex Nichol∗\nOpenAI\nalex@openai.com\nAbstract\nWe show that diffusion models can achieve image sample quality superior to the\ncurrent state-of-the-art generative models. We achieve this on unconditional im-\nage synthesis by ﬁnding a better architecture through a series of ablations. For\nconditional image synthesis, we further improve sample quality with classiﬁer guid-\nance: a simple, compute-efﬁcient method for trading off diversity for ﬁdelit

345

## Embedding Using Hugging Face

In [8]:
import os

# Embedding model used for both indexing and querying: BGE-small (English),
# run on the CPU, with output vectors normalized.
# Alternative model id: sentence-transformers/all-MiniLM-L6-v2
huggingface_embeddings = HuggingFaceEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",
    model_kwargs=dict(device="cpu"),
    encode_kwargs=dict(normalize_embeddings=True),
)

# Optional replacement: an embedding model served by Ollama.
# from langchain_community.embeddings import OllamaEmbeddings
# huggingface_embeddings = OllamaEmbeddings(model="bge-m3:latest")


In [9]:
import numpy as np

# Sanity check: embed the text of the first chunk and show the resulting
# vector as a NumPy array (the array is this cell's displayed value).
np.array(
    huggingface_embeddings.embed_query(
        final_documents[0].page_content
    )
)

array([-5.28379381e-02, -2.95848940e-02, -1.65011603e-02, -6.97876606e-03,
        4.93186377e-02,  3.56690958e-02, -7.25341365e-02, -6.28811345e-02,
       -3.42325168e-03,  2.99070086e-02,  1.10447528e-02, -3.16040367e-02,
        3.81746106e-02,  5.44714481e-02,  1.93874687e-02,  7.04315081e-02,
       -8.44008289e-03, -1.88588444e-02,  4.77212183e-02, -1.40712420e-02,
        1.00122914e-02, -4.74376492e-02,  4.91017736e-02, -3.02354526e-02,
        6.28539827e-03,  8.31162371e-03,  7.16581494e-02, -6.14517853e-02,
       -1.80430245e-02, -2.34286398e-01,  4.32564057e-02,  1.20113594e-02,
        4.18742485e-02, -3.28857899e-02, -2.13135747e-04, -6.92436355e-04,
       -2.48747766e-02, -2.50455774e-02, -2.86248066e-02,  4.37693819e-02,
       -2.68596802e-02,  1.10380836e-02, -6.44884631e-02, -2.78332494e-02,
        4.22788821e-02, -1.22138187e-02, -5.35549149e-02, -1.87554117e-02,
       -5.26116565e-02, -1.19410744e-02,  5.90018113e-04, -5.63122295e-02,
       -1.46162538e-02,  

In [10]:
# Show the first chunk's attribute dictionary (id, metadata, page_content).
vars(final_documents[0])

{'id': None,
 'metadata': {'producer': 'pdfTeX-1.40.21',
  'creator': 'LaTeX with hyperref',
  'creationdate': '2021-06-02T00:38:11+00:00',
  'author': '',
  'keywords': '',
  'moddate': '2021-06-02T00:38:11+00:00',
  'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2',
  'subject': '',
  'title': '',
  'trapped': '/False',
  'source': 'papers/Paper4.pdf',
  'total_pages': 44,
  'page': 0,
  'page_label': '1'},
 'page_content': 'Diffusion Models Beat GANs on Image Synthesis\nPrafulla Dhariwal∗\nOpenAI\nprafulla@openai.com\nAlex Nichol∗\nOpenAI\nalex@openai.com\nAbstract\nWe show that diffusion models can achieve image sample quality superior to the\ncurrent state-of-the-art generative models. We achieve this on unconditional im-\nage synthesis by ﬁnding a better architecture through a series of ablations. For\nconditional image synthesis, we further improve sample quality with classiﬁer guid-\nance: a simple, compute-efﬁcient metho

## VectorStore Creation


In [11]:
# Embed every chunk with the embedding model and build a FAISS index from them.
vectorstore = FAISS.from_documents(
    final_documents,
    huggingface_embeddings,
)

## Query

In [12]:
# Fetch the two chunks closest to the query; each result is a
# (Document, score) pair.
vectorstore.similarity_search_with_score(
    "What is the difference between a function and a method?",
    k=2,
)

[(Document(id='0f602e60-ee99-4149-afa6-ccaf0dd07252', metadata={'producer': 'pdfTeX-1.40.12', 'creator': 'LaTeX with hyperref package', 'creationdate': '2015-11-20T01:16:03+00:00', 'author': 'Jascha Sohl-Dickstein, Eric A. Weiss, Niru Maheswaranathan, Surya Ganguli', 'keywords': 'generative models, nonequilibrium thermodynamics, unsupervised learning', 'moddate': '2024-04-11T10:04:00+05:30', 'ptex.fullbanner': 'This is pdfTeX, Version 3.1415926-2.3-1.40.12 (TeX Live 2011) kpathsea version 6.0.1', 'subject': 'Proceedings of the International Conference on Machine Learning 2015', 'title': 'Deep Unsupervised Learning using Nonequilibrium Thermodynamics', 'trapped': '/False', 'source': 'papers/Paper1.pdf', 'total_pages': 18, 'page': 9, 'page_label': '10'}, page_content='Computing, January 2001.\nOzair, S. and Bengio, Y . Deep Directed Generative Au-\ntoencoders. arXiv:1410.0630, October 2014.\nParry, M., Dawid, A. P., Lauritzen, S., and Others. Proper\nlocal scoring rules. The Annals of St