# langchain官方教程2 建立语义搜索

In [1]:
from langchain_core.documents import Document
from langchain_community.document_loaders import PyPDFLoader

In [2]:
pdf_file = "./nke-10k-2023.pdf"
loader = PyPDFLoader(
    file_path=pdf_file,
    images_inner_format='markdown-img',
    extract_images=False,
    mode="page"
)
document = loader.load()

In [3]:
len(document)

107

In [4]:
print(document[0].page_content)

Table of Contents
UNITED STATES
SECURITIES AND EXCHANGE COMMISSION
Washington, D.C. 20549
FORM 10-K
(Mark One)
☑  ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(D) OF THE SECURITIES EXCHANGE ACT OF 1934
FOR THE FISCAL YEAR ENDED MAY 31, 2023
OR
☐  TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(D) OF THE SECURITIES EXCHANGE ACT OF 1934
FOR THE TRANSITION PERIOD FROM                         TO                         .
Commission File No. 1-10635
NIKE, Inc.
(Exact name of Registrant as specified in its charter)
Oregon 93-0584541
(State or other jurisdiction of incorporation) (IRS Employer Identification No.)
One Bowerman Drive, Beaverton, Oregon 97005-6453
(Address of principal executive offices and zip code)
(503) 671-6453
(Registrant's telephone number, including area code)
SECURITIES REGISTERED PURSUANT TO SECTION 12(B) OF THE ACT:
Class B Common Stock NKE New York Stock Exchange
(Title of each class) (Trading symbol) (Name of each exchange on which registered)
SECURITIES REGISTERED PURSU

In [5]:
print(document[0].metadata)

{'producer': 'EDGRpdf Service w/ EO.Pdf 22.0.40.0', 'creator': 'EDGAR Filing HTML Converter', 'creationdate': '2023-07-20T16:22:00-04:00', 'title': '0000320187-23-000039', 'author': 'EDGAR Online, a division of Donnelley Financial Solutions', 'subject': 'Form 10-K filed on 2023-07-20 for the period ending 2023-05-31', 'keywords': '0000320187-23-000039; ; 10-K', 'moddate': '2023-07-20T16:22:08-04:00', 'source': './nke-10k-2023.pdf', 'total_pages': 107, 'page': 0, 'page_label': '1'}


In [6]:
from langchain_text_splitters.character import RecursiveCharacterTextSplitter

In [7]:
text_split = RecursiveCharacterTextSplitter(
    separators=[".", "。", " "],
    chunk_size=500,
    chunk_overlap=100,
    add_start_index=True
)

result = text_split.split_documents(document)
result


[Document(metadata={'producer': 'EDGRpdf Service w/ EO.Pdf 22.0.40.0', 'creator': 'EDGAR Filing HTML Converter', 'creationdate': '2023-07-20T16:22:00-04:00', 'title': '0000320187-23-000039', 'author': 'EDGAR Online, a division of Donnelley Financial Solutions', 'subject': 'Form 10-K filed on 2023-07-20 for the period ending 2023-05-31', 'keywords': '0000320187-23-000039; ; 10-K', 'moddate': '2023-07-20T16:22:08-04:00', 'source': './nke-10k-2023.pdf', 'total_pages': 107, 'page': 0, 'page_label': '1', 'start_index': 0}, page_content='Table of Contents\nUNITED STATES\nSECURITIES AND EXCHANGE COMMISSION\nWashington, D.C. 20549\nFORM 10-K\n(Mark One)\n☑  ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(D) OF THE SECURITIES EXCHANGE ACT OF 1934\nFOR THE FISCAL YEAR ENDED MAY 31, 2023\nOR\n☐  TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(D) OF THE SECURITIES EXCHANGE ACT OF 1934\nFOR THE TRANSITION PERIOD FROM                         TO                         .\nCommission File No. 1-10635\nNIKE,

In [8]:
print(len(result))

1039


In [9]:
print(result[0].page_content)

Table of Contents
UNITED STATES
SECURITIES AND EXCHANGE COMMISSION
Washington, D.C. 20549
FORM 10-K
(Mark One)
☑  ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(D) OF THE SECURITIES EXCHANGE ACT OF 1934
FOR THE FISCAL YEAR ENDED MAY 31, 2023
OR
☐  TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(D) OF THE SECURITIES EXCHANGE ACT OF 1934
FOR THE TRANSITION PERIOD FROM                         TO                         .
Commission File No. 1-10635
NIKE, Inc


In [10]:
print(result[1].page_content)

.
Commission File No. 1-10635
NIKE, Inc.
(Exact name of Registrant as specified in its charter)
Oregon 93-0584541
(State or other jurisdiction of incorporation) (IRS Employer Identification No


In [11]:
print(result[3].page_content)

SECTION 12(G) OF THE ACT:
NONE
Indicate by check mark: YES NO
• if the registrant is a well-known seasoned issuer, as defined in Rule 405 of the Securities Act


In [12]:
print(result[0].metadata["start_index"])

0


In [13]:
print(result[1].metadata["start_index"])

415


In [14]:
import requests
from langchain_openai import OpenAIEmbeddings

In [15]:
model_name = requests.get("http://localhost:8001/v1/models").json()["data"][0]["id"]

embedding_model = OpenAIEmbeddings(
    base_url="http://localhost:8001/v1",
    api_key="empty",
    model=model_name,
)

In [16]:
embedding_model.embed_query("你好")

[-0.015472412109375,
 0.00437164306640625,
 -0.025909423828125,
 0.0003514289855957031,
 -0.04156494140625,
 -0.0855712890625,
 0.0143585205078125,
 0.040863037109375,
 0.0172271728515625,
 -0.0147857666015625,
 0.04638671875,
 -0.01187896728515625,
 -0.0017328262329101562,
 0.0107421875,
 -0.034576416015625,
 0.023590087890625,
 0.01763916015625,
 -0.0192108154296875,
 -0.0199432373046875,
 -0.029327392578125,
 -0.04840087890625,
 -0.01488494873046875,
 0.00759124755859375,
 -0.02142333984375,
 -0.0029964447021484375,
 0.010223388671875,
 0.051849365234375,
 -0.00595855712890625,
 -0.0257568359375,
 -0.01457977294921875,
 -0.0302886962890625,
 -0.01259613037109375,
 0.024627685546875,
 0.00482940673828125,
 -0.014068603515625,
 -0.00496673583984375,
 -0.017822265625,
 0.0085906982421875,
 -0.03302001953125,
 0.004360198974609375,
 0.01071929931640625,
 0.03192138671875,
 -0.0255279541015625,
 0.0252838134765625,
 -0.027587890625,
 -0.0276031494140625,
 -0.00811767578125,
 0.0256500244

> 这里是不对的  他会使用tiktoken修改你的输入

In [17]:
%pip install transformers

Note: you may need to restart the kernel to use updated packages.


In [18]:
from langchain_openai.embeddings import OpenAIEmbeddings

In [19]:
model_name = requests.get("http://localhost:8001/v1/models").json()["data"][0]["id"]

embedding_model = OpenAIEmbeddings(
    base_url="http://localhost:8001/v1",
    api_key="empty",
    model=model_name,
    tiktoken_enabled=False
)

In [20]:
embedding_model.embed_query("你好")

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


[0.04132080078125,
 -8.249282836914062e-05,
 -0.001316070556640625,
 -0.0025806427001953125,
 -0.032562255859375,
 -0.0205535888671875,
 0.00865936279296875,
 -0.0209503173828125,
 -0.01702880859375,
 -0.0093994140625,
 -0.0029125213623046875,
 -0.0440673828125,
 0.07647705078125,
 -0.04266357421875,
 -0.022308349609375,
 0.0297393798828125,
 0.01082611083984375,
 0.0134735107421875,
 0.037567138671875,
 -0.051361083984375,
 0.06072998046875,
 0.02105712890625,
 -0.005611419677734375,
 -0.0211334228515625,
 0.07171630859375,
 -0.032623291015625,
 0.01068878173828125,
 0.032135009765625,
 -0.0179595947265625,
 0.0165252685546875,
 -0.0053558349609375,
 0.0013446807861328125,
 0.00372314453125,
 0.0251007080078125,
 -0.0703125,
 0.015960693359375,
 0.03021240234375,
 0.007755279541015625,
 0.10577392578125,
 0.045135498046875,
 -0.0281219482421875,
 0.03875732421875,
 0.0169525146484375,
 -0.0063018798828125,
 0.00010037422180175781,
 0.0023632049560546875,
 -0.0011005401611328125,
 -0.0

> 这个是对的 但是不合理

In [21]:
body = {
    "model": "maidalun1020/bce-embedding-base_v1",
    "input": [
        "你好"
    ],
    # "encoding_format": "float",
    # "user": "string",
    # "truncate_prompt_tokens": 1,
    # "additional_data": "string",
    # "add_special_tokens": True,
    # "priority": 0,
    # "additionalProp1": {}
}
print(requests.post("http://localhost:8001/v1/embeddings", json=body).json()["data"][0]["embedding"])

[0.04132080078125, -8.249282836914062e-05, -0.001316070556640625, -0.0025806427001953125, -0.032562255859375, -0.0205535888671875, 0.00865936279296875, -0.0209503173828125, -0.01702880859375, -0.0093994140625, -0.0029125213623046875, -0.0440673828125, 0.07647705078125, -0.04266357421875, -0.022308349609375, 0.0297393798828125, 0.01082611083984375, 0.0134735107421875, 0.037567138671875, -0.051361083984375, 0.06072998046875, 0.02105712890625, -0.005611419677734375, -0.0211334228515625, 0.07171630859375, -0.032623291015625, 0.01068878173828125, 0.032135009765625, -0.0179595947265625, 0.0165252685546875, -0.0053558349609375, 0.0013446807861328125, 0.00372314453125, 0.0251007080078125, -0.0703125, 0.015960693359375, 0.03021240234375, 0.007755279541015625, 0.10577392578125, 0.045135498046875, -0.0281219482421875, 0.03875732421875, 0.0169525146484375, -0.0063018798828125, 0.00010037422180175781, 0.0023632049560546875, -0.0011005401611328125, -0.0018787384033203125, -0.0032806396484375, 0.0276

In [22]:
from openai import OpenAI

In [23]:
client = OpenAI(
    base_url="http://localhost:8001/v1",
    api_key="empty",
)

client.embeddings.create(
    model=model_name,
    input="你好"
)

CreateEmbeddingResponse(data=[Embedding(embedding=[0.04132080078125, -8.249282836914062e-05, -0.001316070556640625, -0.0025806427001953125, -0.032562255859375, -0.0205535888671875, 0.00865936279296875, -0.0209503173828125, -0.01702880859375, -0.0093994140625, -0.0029125213623046875, -0.0440673828125, 0.07647705078125, -0.04266357421875, -0.022308349609375, 0.0297393798828125, 0.01082611083984375, 0.0134735107421875, 0.037567138671875, -0.051361083984375, 0.06072998046875, 0.02105712890625, -0.005611419677734375, -0.0211334228515625, 0.07171630859375, -0.032623291015625, 0.01068878173828125, 0.032135009765625, -0.0179595947265625, 0.0165252685546875, -0.0053558349609375, 0.0013446807861328125, 0.00372314453125, 0.0251007080078125, -0.0703125, 0.015960693359375, 0.03021240234375, 0.007755279541015625, 0.10577392578125, 0.045135498046875, -0.0281219482421875, 0.03875732421875, 0.0169525146484375, -0.0063018798828125, 0.00010037422180175781, 0.0023632049560546875, -0.0011005401611328125, -

In [24]:
response = embedding_model.embed_query(result[0].page_content)
print(response)
print(len(response))

[-0.0011005401611328125, 0.05609130859375, -0.0335693359375, 0.002895355224609375, -0.0134735107421875, 0.0012025833129882812, 0.02105712890625, -0.0308074951171875, -0.0287017822265625, -0.03070068359375, 0.06817626953125, -0.0304107666015625, -0.0240936279296875, -0.01422119140625, -0.006664276123046875, 0.022979736328125, 0.07293701171875, 0.00327301025390625, -0.039581298828125, 0.036590576171875, 0.0211639404296875, 0.0221405029296875, 0.02838134765625, -0.03692626953125, -0.0299530029296875, -0.0002696514129638672, -0.0014095306396484375, 0.0183868408203125, 0.012664794921875, -0.0027980804443359375, -0.052398681640625, -0.004207611083984375, -0.0171051025390625, -0.028717041015625, 0.0352783203125, -0.025634765625, 0.00916290283203125, 0.0170745849609375, -0.032440185546875, 0.01174163818359375, 0.0118255615234375, -0.0010576248168945312, -0.003276824951171875, 0.059539794921875, -0.0301513671875, -0.007221221923828125, -0.0171661376953125, 0.041107177734375, -0.026519775390625,

In [25]:
%pip install langchain-milvus

Note: you may need to restart the kernel to use updated packages.


In [26]:
from langchain_milvus import Milvus

In [27]:
# 启动先先确定是否已经存在这个collection_name
from pymilvus import connections, utility

milvus = connections.connect()
if utility.has_collection("langchain_study"):
    
    utility.drop_collection(collection_name="langchain_study")

In [28]:
# 这个没有会自己创建，如果有则直接使用
uri = "http://localhost:19530"
vector_store = Milvus(
    embedding_function=embedding_model,
    collection_name="langchain_study",
    collection_description="langchain学习使用",
    connection_args={
        "uri": uri
    }
)

In [29]:
# 这个是不存的
vector_store_saved = vector_store.from_documents(result, embedding_model)

In [30]:
vector_store_saved.similarity_search("How many distribution centers does Nike have in the US?")

[Document(metadata={'moddate': '2023-07-20T16:22:08-04:00', 'total_pages': 107, 'keywords': '0000320187-23-000039; ; 10-K', 'producer': 'EDGRpdf Service w/ EO.Pdf 22.0.40.0', 'page_label': '27', 'creator': 'EDGAR Filing HTML Converter', 'creationdate': '2023-07-20T16:22:00-04:00', 'author': 'EDGAR Online, a division of Donnelley Financial Solutions', 'subject': 'Form 10-K filed on 2023-07-20 for the period ending 2023-05-31', 'title': '0000320187-23-000039', 'page': 26, 'source': './nke-10k-2023.pdf', 'start_index': 1056, 'pk': 457030759203474935}, page_content='.\nIn the United States, NIKE has eight significant distribution centers. Five are located in or near Memphis, Tennessee, two of which are owned and three of which are\nleased. Two other distribution centers, one located in Indianapolis, Indiana and one located in Dayton, Tennessee, are leased and operated by third-party logistics\nproviders. One distribution center for Converse is located in Ontario, California, which is lease

In [31]:
vector_store_saved.similarity_search_with_score("How many distribution centers does Nike have in the US?")


[(Document(metadata={'title': '0000320187-23-000039', 'author': 'EDGAR Online, a division of Donnelley Financial Solutions', 'subject': 'Form 10-K filed on 2023-07-20 for the period ending 2023-05-31', 'pk': 457030759203474935, 'page_label': '27', 'producer': 'EDGRpdf Service w/ EO.Pdf 22.0.40.0', 'total_pages': 107, 'source': './nke-10k-2023.pdf', 'keywords': '0000320187-23-000039; ; 10-K', 'moddate': '2023-07-20T16:22:08-04:00', 'creationdate': '2023-07-20T16:22:00-04:00', 'page': 26, 'creator': 'EDGAR Filing HTML Converter', 'start_index': 1056}, page_content='.\nIn the United States, NIKE has eight significant distribution centers. Five are located in or near Memphis, Tennessee, two of which are owned and three of which are\nleased. Two other distribution centers, one located in Indianapolis, Indiana and one located in Dayton, Tennessee, are leased and operated by third-party logistics\nproviders. One distribution center for Converse is located in Ontario, California, which is leas

In [32]:
retriever = vector_store_saved.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={
        "k": 5,
        "score_threshold": 0.5
    }
)

In [33]:
retriever.invoke("How many distribution centers does Nike have in the US?")

No index params provided. Could not determine relevance function. Use L2 distance as default.


[Document(metadata={'keywords': '0000320187-23-000039; ; 10-K', 'source': './nke-10k-2023.pdf', 'creator': 'EDGAR Filing HTML Converter', 'page_label': '27', 'moddate': '2023-07-20T16:22:08-04:00', 'subject': 'Form 10-K filed on 2023-07-20 for the period ending 2023-05-31', 'producer': 'EDGRpdf Service w/ EO.Pdf 22.0.40.0', 'creationdate': '2023-07-20T16:22:00-04:00', 'page': 26, 'author': 'EDGAR Online, a division of Donnelley Financial Solutions', 'title': '0000320187-23-000039', 'total_pages': 107, 'start_index': 1056, 'pk': 457030759203474935}, page_content='.\nIn the United States, NIKE has eight significant distribution centers. Five are located in or near Memphis, Tennessee, two of which are owned and three of which are\nleased. Two other distribution centers, one located in Indianapolis, Indiana and one located in Dayton, Tennessee, are leased and operated by third-party logistics\nproviders. One distribution center for Converse is located in Ontario, California, which is lease

In [34]:
from langchain_openai.chat_models import ChatOpenAI

In [35]:
llm_model_name = requests.get("http://localhost:8000/v1/models").json()["data"][0]["id"]

llm = ChatOpenAI(
    base_url="http://localhost:8000/v1",
    model=llm_model_name,
    api_key="empty"
)

In [36]:
rag_chain = retriever | llm

In [37]:
llm.invoke("How many distribution centers does Nike have in the US?").content

"Nike has a network of distribution centers across the United States to support its operations. The exact number can vary over time due to business needs and strategic decisions. As of the latest available information, Nike operates several major distribution centers in the U.S., but the precise count is not publicly disclosed. To get the most current and accurate number, you might want to check Nike's official corporate website or their annual reports for the most up-to-date information."

In [38]:
uri = "http://localhost:19530"
vector_store = Milvus(
    embedding_function=embedding_model,
    collection_name="langchain_study",
    collection_description="langchain学习使用",
    connection_args={
        "uri": uri
    },
    auto_id=True  # 加这个参数是因为 我add_documents时，需要给id，用这个参数就不用给id了 自动生成
)

In [39]:
vector_store.add_documents(result)

[457030759203477681,
 457030759203477682,
 457030759203477683,
 457030759203477684,
 457030759203477685,
 457030759203477686,
 457030759203477687,
 457030759203477688,
 457030759203477689,
 457030759203477690,
 457030759203477691,
 457030759203477692,
 457030759203477693,
 457030759203477694,
 457030759203477695,
 457030759203477696,
 457030759203477697,
 457030759203477698,
 457030759203477699,
 457030759203477700,
 457030759203477701,
 457030759203477702,
 457030759203477703,
 457030759203477704,
 457030759203477705,
 457030759203477706,
 457030759203477707,
 457030759203477708,
 457030759203477709,
 457030759203477710,
 457030759203477711,
 457030759203477712,
 457030759203477713,
 457030759203477714,
 457030759203477715,
 457030759203477716,
 457030759203477717,
 457030759203477718,
 457030759203477719,
 457030759203477720,
 457030759203477721,
 457030759203477722,
 457030759203477723,
 457030759203477724,
 457030759203477725,
 457030759203477726,
 457030759203477727,
 457030759203

In [40]:
vector_store.search("How many distribution centers does Nike have in the US?", search_type="similarity")

[Document(metadata={'total_pages': 107, 'author': 'EDGAR Online, a division of Donnelley Financial Solutions', 'page': 26, 'keywords': '0000320187-23-000039; ; 10-K', 'source': './nke-10k-2023.pdf', 'start_index': 1056, 'title': '0000320187-23-000039', 'creationdate': '2023-07-20T16:22:00-04:00', 'producer': 'EDGRpdf Service w/ EO.Pdf 22.0.40.0', 'subject': 'Form 10-K filed on 2023-07-20 for the period ending 2023-05-31', 'moddate': '2023-07-20T16:22:08-04:00', 'page_label': '27', 'creator': 'EDGAR Filing HTML Converter', 'pk': 457030759203478061}, page_content='.\nIn the United States, NIKE has eight significant distribution centers. Five are located in or near Memphis, Tennessee, two of which are owned and three of which are\nleased. Two other distribution centers, one located in Indianapolis, Indiana and one located in Dayton, Tennessee, are leased and operated by third-party logistics\nproviders. One distribution center for Converse is located in Ontario, California, which is lease

In [41]:
uri = "http://localhost:19530"
vector_store = Milvus(
    embedding_function=embedding_model,
    collection_name="langchain_study",
    collection_description="langchain学习使用",
    connection_args={
        "uri": uri
    },
    auto_id=True
)

In [42]:
vector_store.search("How many distribution centers does Nike have in the US?", search_type="similarity")

[Document(metadata={'creator': 'EDGAR Filing HTML Converter', 'producer': 'EDGRpdf Service w/ EO.Pdf 22.0.40.0', 'subject': 'Form 10-K filed on 2023-07-20 for the period ending 2023-05-31', 'title': '0000320187-23-000039', 'start_index': 1056, 'pk': 457030759203478061, 'author': 'EDGAR Online, a division of Donnelley Financial Solutions', 'creationdate': '2023-07-20T16:22:00-04:00', 'moddate': '2023-07-20T16:22:08-04:00', 'page_label': '27', 'keywords': '0000320187-23-000039; ; 10-K', 'source': './nke-10k-2023.pdf', 'page': 26, 'total_pages': 107}, page_content='.\nIn the United States, NIKE has eight significant distribution centers. Five are located in or near Memphis, Tennessee, two of which are owned and three of which are\nleased. Two other distribution centers, one located in Indianapolis, Indiana and one located in Dayton, Tennessee, are leased and operated by third-party logistics\nproviders. One distribution center for Converse is located in Ontario, California, which is lease