In [1]:
from langchain_core.documents import Document

documents = [
	Document(
	page_content="Dogs are great companions, known for their loyalty and friendliness.",
	metadata={"source": "mammal-pets-doc"},
	),
	Document(
		page_content="Cats are independent pets that often enjoy their own space.",
		metadata={"source": "mammal-pets-doc"},
	),
]

In [2]:
from langchain_community.document_loaders import PyPDFLoader

file_path = "./nke-10k-2023.pdf"
loader = PyPDFLoader(file_path)

docs = loader.load()

print(len(docs))

107


In [3]:
print(f"{docs[0].page_content[:200]}\n")
print(docs[0].metadata)

Table of Contents
UNITED STATES
SECURITIES AND EXCHANGE COMMISSION
Washington, D.C. 20549
FORM 10-K
(Mark One)
☑  ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(D) OF THE SECURITIES EXCHANGE ACT OF 1934
F

{'producer': 'EDGRpdf Service w/ EO.Pdf 22.0.40.0', 'creator': 'EDGAR Filing HTML Converter', 'creationdate': '2023-07-20T16:22:00-04:00', 'title': '0000320187-23-000039', 'author': 'EDGAR Online, a division of Donnelley Financial Solutions', 'subject': 'Form 10-K filed on 2023-07-20 for the period ending 2023-05-31', 'keywords': '0000320187-23-000039; ; 10-K', 'moddate': '2023-07-20T16:22:08-04:00', 'source': './nke-10k-2023.pdf', 'total_pages': 107, 'page': 0, 'page_label': '1'}


In [4]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
  chunk_size=1000, chunk_overlap=200, add_start_index=True
)
all_splits = text_splitter.split_documents(docs)

len(all_splits)

516

In [8]:
print(len(all_splits[0].page_content))
print(all_splits[0].metadata)

972
{'producer': 'EDGRpdf Service w/ EO.Pdf 22.0.40.0', 'creator': 'EDGAR Filing HTML Converter', 'creationdate': '2023-07-20T16:22:00-04:00', 'title': '0000320187-23-000039', 'author': 'EDGAR Online, a division of Donnelley Financial Solutions', 'subject': 'Form 10-K filed on 2023-07-20 for the period ending 2023-05-31', 'keywords': '0000320187-23-000039; ; 10-K', 'moddate': '2023-07-20T16:22:08-04:00', 'source': './nke-10k-2023.pdf', 'total_pages': 107, 'page': 0, 'page_label': '1', 'start_index': 0}


In [9]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

embeddings = GoogleGenerativeAIEmbeddings(model="models/gemini-embedding-001")

In [10]:
vector_1 = embeddings.embed_query(all_splits[0].page_content)
vector_2 = embeddings.embed_query(all_splits[1].page_content) 

assert len(vector_1) == len(vector_2)
print(f"Generate vector of length {len(vector_1)}\n")
print(vector_1[:10])

Generate vector of length 3072

[-0.0014180047437548637, 0.0006364254513755441, 0.0023240740410983562, -0.028028815984725952, -0.006018347572535276, 0.007032659370452166, 0.008973612450063229, -0.009234011173248291, -0.0018726892303675413, 0.02026733197271824]


In [11]:
from langchain_core.vectorstores import InMemoryVectorStore

vector_store = InMemoryVectorStore(embeddings)

In [12]:
ids = vector_store.add_documents(documents=all_splits)

GoogleGenerativeAIError: Error embedding content: 429 Resource has been exhausted (e.g. check quota).

In [13]:
results = vector_store.similarity_search(
    "How many distribution centers does Nike have in the US?"
)

print(results[0])

IndexError: list index out of range

In [14]:
results = await vector_store.asimilarity_search("When was Nike incorporated?")

print(results[0])

IndexError: list index out of range

In [None]:
embedding = embeddings.embed_query("How were Nike's margins impacted in 2023?")

results = vector_store.similarity_search_by_vector(embedding)
print(results[0])