In [1]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_core.vectorstores import InMemoryVectorStore
import getpass
import os

# Make sure OPENAI_API_KEY is available: when the variable is unset or empty,
# ask for it interactively via getpass and store it in the process environment.
if not os.getenv("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter API key for OpenAI: ")

In [2]:
# Location of the PDF to ingest (a relative path, so it is resolved against
# the current working directory).
file_path = "../docs/nke-10k-2023.pdf"

# Build the loader, read the PDF, and report how many entries came back
# (the message labels them as pages).
docs = (loader := PyPDFLoader(file_path)).load()
print(f"Number of pages loaded: {len(docs)}")



Number of pages loaded: 107


In [3]:
# Cut the loaded pages into smaller, overlapping chunks so that each piece is
# a manageable unit for later processing.
#   chunk_size=1000      -> size limit for a single chunk
#   chunk_overlap=200    -> amount of text shared by neighbouring chunks
#   add_start_index=True -> record a "start_index" entry in each chunk's metadata
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    add_start_index=True,
)
all_splits = text_splitter.split_documents(docs)
print(f"Number of chunks created: {len(all_splits)}")



Number of chunks created: 516


In [4]:
# Embedding model that turns text into vectors
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# In-memory vector store that uses the embedding model above
vector_store = InMemoryVectorStore(embedding=embeddings)

# Index every chunk in the store; the call returns the ids of the added documents
ids = vector_store.add_documents(all_splits)

In [5]:
# Ask the vector store for the chunks closest to a natural-language question,
# then show the first (top-ranked) hit.
results = vector_store.similarity_search(
    query="How many distribution centers does Nike have in the US?",
)
print(f"Most relevant document chunk: {results[0]}")



Most relevant document chunk: page_content='operations. We also lease an office complex in Shanghai, China, our headquarters for our Greater China geography, occupied by employees focused on implementing our
wholesale, NIKE Direct and merchandising strategies in the region, among other functions.
In the United States, NIKE has eight significant distribution centers. Five are located in or near Memphis, Tennessee, two of which are owned and three of which are
leased. Two other distribution centers, one located in Indianapolis, Indiana and one located in Dayton, Tennessee, are leased and operated by third-party logistics
providers. One distribution center for Converse is located in Ontario, California, which is leased. NIKE has a number of distribution facilities outside the United States,
some of which are leased and operated by third-party logistics providers. The most significant distribution facilities outside the United States are located in Laakdal,' metadata={'producer': 'EDGRpdf 

In [6]:
# Same kind of lookup, but each hit comes back as a (document, score) pair;
# unpack the first pair and print both parts.
results = vector_store.similarity_search_with_score(
    query="What was Nike's revenue in 2023?",
)
doc, score = results[0]
print(f"Relevance Score: {score}\n")
print(f"Document chunk: {doc}")



Relevance Score: 0.6886661237334675

Document chunk: page_content='Table of Contents
FISCAL 2023 NIKE BRAND REVENUE HIGHLIGHTSThe following tables present NIKE Brand revenues disaggregated by reportable operating segment, distribution channel and major product line:
FISCAL 2023 COMPARED TO FISCAL 2022
• NIKE, Inc. Revenues were $51.2 billion in fiscal 2023, which increased 10% and 16% compared to fiscal 2022 on a reported and currency-neutral basis, respectively.
The increase was due to higher revenues in North America, Europe, Middle East & Africa ("EMEA"), APLA and Greater China, which contributed approximately 7, 6,
2 and 1 percentage points to NIKE, Inc. Revenues, respectively.
• NIKE Brand revenues, which represented over 90% of NIKE, Inc. Revenues, increased 10% and 16% on a reported and currency-neutral basis, respectively. This
increase was primarily due to higher revenues in Men's, the Jordan Brand, Women's and Kids' which grew 17%, 35%,11% and 10%, respectively, on a wholesal

In [7]:
# Wrap the vector store in a retriever that runs a similarity search and
# keeps only one hit per query (k=1).
retriever = vector_store.as_retriever(
    search_type="similarity", search_kwargs=dict(k=1)
)

# Send several questions through the retriever in a single batch call;
# the result holds one list of documents per question.
batch_results = retriever.batch([
    "How many distribution centers does Nike have in the US?",
    "When was Nike incorporated?",
])
print(f"Batch retrieval results: {batch_results}")

Batch retrieval results: [[Document(id='4fe49eba-99ce-4d81-80ad-c40c587df2cc', metadata={'producer': 'EDGRpdf Service w/ EO.Pdf 22.0.40.0', 'creator': 'EDGAR Filing HTML Converter', 'creationdate': '2023-07-20T16:22:00-04:00', 'title': '0000320187-23-000039', 'author': 'EDGAR Online, a division of Donnelley Financial Solutions', 'subject': 'Form 10-K filed on 2023-07-20 for the period ending 2023-05-31', 'keywords': '0000320187-23-000039; ; 10-K', 'moddate': '2023-07-20T16:22:08-04:00', 'source': '../docs/nke-10k-2023.pdf', 'total_pages': 107, 'page': 26, 'page_label': '27', 'start_index': 804}, page_content='operations. We also lease an office complex in Shanghai, China, our headquarters for our Greater China geography, occupied by employees focused on implementing our\nwholesale, NIKE Direct and merchandising strategies in the region, among other functions.\nIn the United States, NIKE has eight significant distribution centers. Five are located in or near Memphis, Tennessee, two of w