-
Notifications
You must be signed in to change notification settings - Fork 0
/
LoadFVectorize.py
31 lines (27 loc) · 1.22 KB
/
LoadFVectorize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
# LoadFVectorize.py
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders import OnlinePDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
# access some online pdf, load, vectorize and commit to disk
def load_doc() -> 'List[Document]':
loader = OnlinePDFLoader("https://support.riverbed.com/bin/support/download?did=7q6behe7hotvnpqd9a03h1dji&version=9.15.0")
documents = loader.load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
docs = text_splitter.split_documents(documents)
return docs
# vectorize, commit to disk and create a BM25 retriever
def vectorize(embeddings_model) -> 'FAISS':
docs = load_doc()
db = FAISS.from_documents(docs, embeddings_model)
db.save_local("./opdf915_index")
return db
# attempts to load vectorstore from disk
def load_db() -> 'FAISS':
embeddings_model = HuggingFaceEmbeddings()
try:
db = FAISS.load_local("./opdf915_index", embeddings_model)
except Exception as e:
print(f'Exception: {e}\nNo index on disk, creating new...')
db = vectorize(embeddings_model)
return db