# Vector Databases

In [None]:
!pip install langchain_community faiss-cpu

In [None]:
!pip install langchain_chroma

In [None]:
!pip install pypdf

In [23]:
import faiss
from typing import List
from uuid import uuid4
from google.colab import userdata
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma

## Loading Documents

In [14]:
loader = PyPDFLoader("/content/data/amazon-2024-10k.pdf")
pages = loader.load()
len(pages)

94

In [15]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100
)

In [16]:
docs = text_splitter.split_documents(pages)
len(docs)

395

In [18]:
docs[:30]

[Document(metadata={'source': '/content/data/amazon-2024-10k.pdf', 'page': 0}, page_content='Table of Contents\nUNITED STATESSECURITIES AND EXCHANGE COMMISSION\nWashington, D.C. 20549\n ____________________________________\nFORM 10-K\n____________________________________ \n(Mark One)\n☒ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934\nFor the fiscal year ended December 31, 2023\nor\n☐ TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934\nFor the transition period from            to             .\nCommission File No. 000-22513____________________________________\nAMAZON.COM, INC.\n(Exact name of registrant as specified in its charter)\nDelaware  91-1646860\n(State or other jurisdiction ofincorporation or organization)  (I.R.S. EmployerIdentification No.)\n410 Terry Avenue North\nSeattle, Washington 98109-5210\n(206) 266-1000\n(Address and telephone number, including area code, of registrant’s principal executive of

## FAISS DB

In [None]:
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

In [8]:
faiss_index = faiss.IndexFlatL2(len(embeddings.embed_query("faiss-index-test")))
faiss_index

<faiss.swigfaiss_avx2.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x7f05f56364f0> >

In [9]:
vars(faiss_index)

{'this': <Swig Object of type 'faiss::IndexFlatL2 *' at 0x7f05f56364f0>}

In [10]:
vector_store_faiss = FAISS(
    embedding_function=embeddings,
    index=faiss_index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

In [17]:
from uuid import uuid4
uuids = [str(uuid4()) for _ in range(len(docs))]
vector_store_faiss.add_documents(documents=docs, ids=uuids)

['9a1ce23f-053c-425d-9ee0-30014055205c',
 'c58e8d41-ddd4-4522-a1a6-140293c0e37c',
 '794b2e23-073f-4465-b11b-149fb88f3d06',
 '86a9a7c0-ddce-4fa0-bc31-acb7179abd44',
 'fb0e1e07-e27b-457f-9070-f25cb608dffd',
 '718a339f-9e90-493f-b4b0-a6e2b0f26827',
 'af935b7f-d519-401d-ade6-9c6893c17a67',
 '713f58be-cb6d-4dc2-9059-4be4e39fdbdf',
 'c77f96d1-1f3f-4edb-8a8c-9843b46d1d72',
 '9290785c-b289-488c-9154-832745f35e73',
 '465b2dd5-8c21-4a95-9704-3076562bf530',
 '1a69b528-c2c6-4078-bf44-61cddd679ab7',
 'abc627c3-dc72-458c-9845-775d59f6b2ca',
 'fb25cabf-b272-4c41-98b5-23294094d250',
 'e639fb3b-f42a-4e4a-8429-baf6532f975d',
 'b0fa57b8-4833-41f3-876f-4b40704e13a2',
 '910a2e19-fc62-442f-ad0a-537f88844e7f',
 '953b4370-9161-4094-ba1d-416461300d78',
 'd42d1748-03f6-48d0-adb1-6a505e9115f4',
 '89aca7b7-4c18-4936-849a-1778f3f121a2',
 '2a6324e8-5c39-4cfb-8a4c-7bb7716415e2',
 'a76b8358-1b8f-4960-8d85-e4961aa79fcd',
 'f794f38e-2f73-40f4-bccf-8ae5d3fc8b42',
 '7d8e1000-5d9e-46b6-aa78-a8edc425d044',
 '0790351b-7211-

In [19]:
results = vector_store_faiss.similarity_search(
    "What are the risks involved in business?",
    k=2
)
for res in results:
    print(f"* {res.page_content} [{res.metadata}]")

* discussed below. Many of the risks discussed below also impact our customers, including third-party sellers, which could indirectly have a material adverse
effect on us.
Business and Industry Risks
We Face Intense Competition
Our businesses are rapidly evolving and intensely competitive, and we have many competitors across geographies, including cross-border competition,
and in different industries, including physical, e-commerce, and omnichannel retail, e-commerce services, web and infrastructure computing services,
electronic devices, digital content, advertising, grocery, and transportation and logistics services. Some of our current and potential competitors have greater
resources, longer histories, more customers, and/or greater brand recognition, particularly with our newly-launched products and services and in our newer
geographic regions. They may secure better terms from vendors, adopt more aggressive pricing, and devote more resources to technology, infrastructure, [{'sourc

## Chroma DB

In [24]:
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

In [25]:
vector_store_chroma = Chroma(
    collection_name="chroma-test",
    embedding_function=embeddings,
    persist_directory="./chroma",
)

In [26]:
vector_store_chroma.add_documents(documents=docs, ids=uuids)

['9a1ce23f-053c-425d-9ee0-30014055205c',
 'c58e8d41-ddd4-4522-a1a6-140293c0e37c',
 '794b2e23-073f-4465-b11b-149fb88f3d06',
 '86a9a7c0-ddce-4fa0-bc31-acb7179abd44',
 'fb0e1e07-e27b-457f-9070-f25cb608dffd',
 '718a339f-9e90-493f-b4b0-a6e2b0f26827',
 'af935b7f-d519-401d-ade6-9c6893c17a67',
 '713f58be-cb6d-4dc2-9059-4be4e39fdbdf',
 'c77f96d1-1f3f-4edb-8a8c-9843b46d1d72',
 '9290785c-b289-488c-9154-832745f35e73',
 '465b2dd5-8c21-4a95-9704-3076562bf530',
 '1a69b528-c2c6-4078-bf44-61cddd679ab7',
 'abc627c3-dc72-458c-9845-775d59f6b2ca',
 'fb25cabf-b272-4c41-98b5-23294094d250',
 'e639fb3b-f42a-4e4a-8429-baf6532f975d',
 'b0fa57b8-4833-41f3-876f-4b40704e13a2',
 '910a2e19-fc62-442f-ad0a-537f88844e7f',
 '953b4370-9161-4094-ba1d-416461300d78',
 'd42d1748-03f6-48d0-adb1-6a505e9115f4',
 '89aca7b7-4c18-4936-849a-1778f3f121a2',
 '2a6324e8-5c39-4cfb-8a4c-7bb7716415e2',
 'a76b8358-1b8f-4960-8d85-e4961aa79fcd',
 'f794f38e-2f73-40f4-bccf-8ae5d3fc8b42',
 '7d8e1000-5d9e-46b6-aa78-a8edc425d044',
 '0790351b-7211-

In [27]:
results = vector_store_chroma.similarity_search(
    "What are chances of success next year?",
    k=2
)
for res in results:
    print(f"* {res.page_content} [{res.metadata}]")

* Table of Contents
Item 7. Management’s Discussion and Analysis of Financial Condition and Results of Operations
Forward-Looking Statements
This Annual Report on Form 10-K includes forward-looking statements within the meaning of the Private Securities Litigation Reform Act of 1995. All
statements other than statements of historical fact, including statements regarding guidance, industry prospects, or future results of operations or financial
position, made in this Annual Report on Form 10-K are forward-looking. We use words such as anticipates, believes, expects, future, intends, and similar
expressions to identify forward-looking statements. Forward-looking statements reflect management’s current expectations and are inherently uncertain. Actual
results and outcomes could differ materially for a variety of reasons, including, among others, fluctuations in foreign exchange rates, changes in global [{'page': 19, 'source': '/content/data/amazon-2024-10k.pdf'}]
* may differ from the amo