In [1]:
import os
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI


# Read variables from a .env file into the process environment so that the
# endpoint and key used below are not written into the notebook.
load_dotenv()


# Chat model reached through an OpenAI-compatible endpoint.
llm = ChatOpenAI(
    # Endpoint and credential come from the environment; .get() yields None
    # for a missing variable instead of raising.
    api_key=os.environ.get("COMPATIBLE_API_KEY"),
    base_url=os.environ.get("COMPATIBLE_BASE_URL"),
    model="qwen3-32b",
    temperature=0.5,
    streaming=True,
    # Extra field sent with each request; presumably switches off the model's
    # thinking mode -- confirm against the provider's documentation.
    extra_body={"enable_thinking": False},
)

In [2]:
from langchain_pymupdf4llm import PyMuPDF4LLMLoader

# PDF location, given relative to the notebook's working directory.
file_path = "../example_data/nke-10k-2023.pdf"
loader = PyMuPDF4LLMLoader(file_path)

# load() returns a list of documents; the count printed below equals the
# PDF's total_pages metadata value, i.e. one document per page.
docs = loader.load()

print(len(docs))

107


In [3]:
# Preview the first 200 characters of the first loaded page, followed by a
# blank line, then the metadata dict attached to that page.
print(f"{docs[0].page_content[:200]}\n")
print(docs[0].metadata)

Table of Contents


**UNITED STATES**

**SECURITIES AND EXCHANGE COMMISSION**


**Washington, D.C. 20549**


**FORM 10-K**


(Mark One)
☑ **ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(D) OF THE SECURIT

{'producer': 'EDGRpdf Service w/ EO.Pdf 22.0.40.0', 'creator': 'EDGAR Filing HTML Converter', 'creationdate': '2023-07-20T16:22:00-04:00', 'source': '../example_data/nke-10k-2023.pdf', 'file_path': '../example_data/nke-10k-2023.pdf', 'total_pages': 107, 'format': 'PDF 1.4', 'title': '0000320187-23-000039', 'author': 'EDGAR Online, a division of Donnelley Financial Solutions', 'subject': 'Form 10-K filed on 2023-07-20 for the period ending 2023-05-31', 'keywords': '0000320187-23-000039; ; 10-K', 'moddate': '2023-07-20T16:22:08-04:00', 'trapped': '', 'encryption': 'Standard V2 R3 128-bit RC4', 'modDate': "D:20230720162208-04'00'", 'creationDate': "D:20230720162200-04'00'", 'page': 0}


In [4]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Break the per-page documents into overlapping chunks for embedding.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    # Stored as `start_index` in each chunk's metadata.
    add_start_index=True,
)
all_splits = text_splitter.split_documents(docs)

# Number of chunks produced.
len(all_splits)

650

In [5]:
from langchain_community.embeddings import DashScopeEmbeddings


# Embedding client; the key is read from the same environment variable that
# the chat model uses.
embeddings = DashScopeEmbeddings(
    dashscope_api_key=os.environ.get("COMPATIBLE_API_KEY"),
    model="text-embedding-v4",
)

In [6]:
# Embed the first two chunks, in order, as a quick check of the embedding client.
vector_1, vector_2 = (
    embeddings.embed_query(all_splits[idx].page_content) for idx in (0, 1)
)

# Both vectors must have the same dimensionality.
assert len(vector_1) == len(vector_2)
print(f"Generated vectors of length {len(vector_1)}\n")
print(vector_1[:10])

Generated vectors of length 1024

[-0.02253858558833599, 0.041340991854667664, 0.03292699158191681, -0.008338062092661858, 0.02232595719397068, 0.10285127907991409, 0.02208295464515686, 0.013129789382219315, -0.02392066828906536, 0.08170996606349945]


In [7]:
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams

# Connect to the Qdrant server on localhost:6333.
client = QdrantClient(host="localhost", port=6333)

# Create the collection only if it is missing. An unconditional
# create_collection() raises once the collection exists, which makes this cell
# fail whenever it is re-run against a server that still holds the collection.
if not client.collection_exists("rag_from_scratch"):
    client.create_collection(
        collection_name="rag_from_scratch",
        # size matches the 1024-length vectors printed by the embedding check;
        # cosine is the metric used to rank search results.
        vectors_config=VectorParams(size=1024, distance=Distance.COSINE),
    )

# LangChain wrapper that embeds texts with `embeddings` and reads/writes the
# collection through `client`.
vector_store = QdrantVectorStore(
    client=client,
    collection_name="rag_from_scratch",
    embedding=embeddings,
)

In [8]:
# Insert the chunks ten at a time. Each add_documents call returns the ids it
# stored, so `ids` holds one list of ids per batch.
# NOTE(review): re-running this cell presumably inserts the same chunks again
# under new ids -- confirm before executing it twice.
ids = [
    vector_store.add_documents(documents=all_splits[start : start + 10])
    for start in range(0, len(all_splits), 10)
]
len(ids)

65

In [9]:
# Ids returned for the first batch of ten chunks.
ids[0]

['53624eccf1914ac693b77a2c3ec0c79c',
 '1604653fadd74ec08427e000575228b9',
 'f3c789900901464dba51ba593322c82a',
 '4495e2ff76a54e34b8089605455eecad',
 '75889865ba2c4faaa4705dedd31b0e9e',
 '42c92042cf2040929a1d6e5b4a7a6a81',
 'cdad7b33aec84de695b8559e4f8fb99b',
 '146cebf7f8974536936b74c61eb374c4',
 'd8caf241ba684f9e8bcf8bb1c1dff0aa',
 'a0cc8da004ec4fc0a0eec73d5f0f0652']

In [10]:
# Look up the stored chunks most similar to a natural-language question.
query = "How many distribution centers does Nike have in the US?"
results = vector_store.similarity_search(query)

# Show only the top-ranked chunk.
print(results[0])

page_content='In the United States, NIKE has eight significant distribution centers. Five are located in or near Memphis, Tennessee, two of which are owned and three of which are
leased. Two other distribution centers, one located in Indianapolis, Indiana and one located in Dayton, Tennessee, are leased and operated by third-party logistics
providers. One distribution center for Converse is located in Ontario, California, which is leased. NIKE has a number of distribution facilities outside the United States,
some of which are leased and operated by third-party logistics providers. The most significant distribution facilities outside the United States are located in Laakdal,
Belgium; Taicang, China; Tomisato, Japan and Icheon, Korea, all of which we own.' metadata={'producer': 'EDGRpdf Service w/ EO.Pdf 22.0.40.0', 'creator': 'EDGAR Filing HTML Converter', 'creationdate': '2023-07-20T16:22:00-04:00', 'source': '../example_data/nke-10k-2023.pdf', 'file_path': '../example_data/nke-10k-20

In [11]:
results = await vector_store.asimilarity_search("How many distribution centers does Nike have in the US?")

print(results[0])

page_content='In the United States, NIKE has eight significant distribution centers. Five are located in or near Memphis, Tennessee, two of which are owned and three of which are
leased. Two other distribution centers, one located in Indianapolis, Indiana and one located in Dayton, Tennessee, are leased and operated by third-party logistics
providers. One distribution center for Converse is located in Ontario, California, which is leased. NIKE has a number of distribution facilities outside the United States,
some of which are leased and operated by third-party logistics providers. The most significant distribution facilities outside the United States are located in Laakdal,
Belgium; Taicang, China; Tomisato, Japan and Icheon, Korea, all of which we own.' metadata={'producer': 'EDGRpdf Service w/ EO.Pdf 22.0.40.0', 'creator': 'EDGAR Filing HTML Converter', 'creationdate': '2023-07-20T16:22:00-04:00', 'source': '../example_data/nke-10k-2023.pdf', 'file_path': '../example_data/nke-10k-20

In [12]:
# Embed the question ourselves, then search with the raw vector instead of text.
query = "How many distribution centers does Nike have in the US?"
embedding = embeddings.embed_query(query)

results = vector_store.similarity_search_by_vector(embedding)
print(results[0])

page_content='In the United States, NIKE has eight significant distribution centers. Five are located in or near Memphis, Tennessee, two of which are owned and three of which are
leased. Two other distribution centers, one located in Indianapolis, Indiana and one located in Dayton, Tennessee, are leased and operated by third-party logistics
providers. One distribution center for Converse is located in Ontario, California, which is leased. NIKE has a number of distribution facilities outside the United States,
some of which are leased and operated by third-party logistics providers. The most significant distribution facilities outside the United States are located in Laakdal,
Belgium; Taicang, China; Tomisato, Japan and Icheon, Korea, all of which we own.' metadata={'producer': 'EDGRpdf Service w/ EO.Pdf 22.0.40.0', 'creator': 'EDGAR Filing HTML Converter', 'creationdate': '2023-07-20T16:22:00-04:00', 'source': '../example_data/nke-10k-2023.pdf', 'file_path': '../example_data/nke-10k-20

In [13]:
# Note that providers implement different scores. This collection was created
# with Distance.COSINE, so the score Qdrant returns here is a cosine
# similarity: a higher value means a closer match, and results[0] is the best hit.

results = vector_store.similarity_search_with_score("How many distribution centers does Nike have in the US?")
# Each result is a (document, score) pair.
doc, score = results[0]
print(f"Score: {score}\n")
print(doc)

Score: 0.7608343

page_content='In the United States, NIKE has eight significant distribution centers. Five are located in or near Memphis, Tennessee, two of which are owned and three of which are
leased. Two other distribution centers, one located in Indianapolis, Indiana and one located in Dayton, Tennessee, are leased and operated by third-party logistics
providers. One distribution center for Converse is located in Ontario, California, which is leased. NIKE has a number of distribution facilities outside the United States,
some of which are leased and operated by third-party logistics providers. The most significant distribution facilities outside the United States are located in Laakdal,
Belgium; Taicang, China; Tomisato, Japan and Icheon, Korea, all of which we own.' metadata={'producer': 'EDGRpdf Service w/ EO.Pdf 22.0.40.0', 'creator': 'EDGAR Filing HTML Converter', 'creationdate': '2023-07-20T16:22:00-04:00', 'source': '../example_data/nke-10k-2023.pdf', 'file_path': '../examp

In [14]:
# The same question asked in Chinese; the top hit printed below is the same
# English passage returned for the English query.
query = "耐克在美国有多少个配送中心？"
results = vector_store.similarity_search(query)

print(results[0])

page_content='In the United States, NIKE has eight significant distribution centers. Five are located in or near Memphis, Tennessee, two of which are owned and three of which are
leased. Two other distribution centers, one located in Indianapolis, Indiana and one located in Dayton, Tennessee, are leased and operated by third-party logistics
providers. One distribution center for Converse is located in Ontario, California, which is leased. NIKE has a number of distribution facilities outside the United States,
some of which are leased and operated by third-party logistics providers. The most significant distribution facilities outside the United States are located in Laakdal,
Belgium; Taicang, China; Tomisato, Japan and Icheon, Korea, all of which we own.' metadata={'producer': 'EDGRpdf Service w/ EO.Pdf 22.0.40.0', 'creator': 'EDGAR Filing HTML Converter', 'creationdate': '2023-07-20T16:22:00-04:00', 'source': '../example_data/nke-10k-2023.pdf', 'file_path': '../example_data/nke-10k-20

In [15]:
# Wrap the vector store as a retriever that returns a single best match (k=1)
# per query.
retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 1})

# Run several questions in one call; the result has one list of documents per
# question, in the same order.
questions = [
    "How many distribution centers does Nike have in the US?",
    "When was Nike incorporated?",
]
retriever.batch(questions)

[[Document(metadata={'producer': 'EDGRpdf Service w/ EO.Pdf 22.0.40.0', 'creator': 'EDGAR Filing HTML Converter', 'creationdate': '2023-07-20T16:22:00-04:00', 'source': '../example_data/nke-10k-2023.pdf', 'file_path': '../example_data/nke-10k-2023.pdf', 'total_pages': 107, 'format': 'PDF 1.4', 'title': '0000320187-23-000039', 'author': 'EDGAR Online, a division of Donnelley Financial Solutions', 'subject': 'Form 10-K filed on 2023-07-20 for the period ending 2023-05-31', 'keywords': '0000320187-23-000039; ; 10-K', 'moddate': '2023-07-20T16:22:08-04:00', 'trapped': '', 'encryption': 'Standard V2 R3 128-bit RC4', 'modDate': "D:20230720162208-04'00'", 'creationDate': "D:20230720162200-04'00'", 'page': 26, 'start_index': 1074, '_id': 'ba1c21e8-4c82-4477-8fd9-6d7232cad283', '_collection_name': 'rag_from_scratch'}, page_content='In the United States, NIKE has eight significant distribution centers. Five are located in or near Memphis, Tennessee, two of which are owned and three of which are\