## 相似性分数
****
- 根据相似性打分过滤
- 为文档添加分数

### 相似性分数搜索
***
以Chrom为例

In [1]:
! pip install -qU "langchain-chroma>=0.1.2"


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
from langchain_openai import OpenAIEmbeddings
import os
embeddings_model = OpenAIEmbeddings(
    model="BAAI/bge-m3",
    api_key=os.environ.get("DEEPSEEK_API_KEY"),
    base_url=os.environ.get("DEEPSEEK_API_BASE")+"/v1",
)

初始化chroma客户端

In [3]:
from langchain_chroma import Chroma

vector_store = Chroma(
    collection_name="example_collection",
    embedding_function=embeddings_model,
    persist_directory="chroma_langchain_db",  # 可选参数，指定持久化目录
)

In [4]:
import chromadb

persistent_client = chromadb.PersistentClient()
collection = persistent_client.get_or_create_collection("collection_name")
collection.add(ids=["1", "2", "3"], documents=["a", "b", "c"])

vector_store_from_client = Chroma(
    client=persistent_client,
    collection_name="collection_name",
    embedding_function=embeddings_model,
)

Add of existing embedding ID: 1
Add of existing embedding ID: 2
Add of existing embedding ID: 3
Insert of existing embedding ID: 1
Insert of existing embedding ID: 2
Insert of existing embedding ID: 3


添加一组文档

In [5]:
from uuid import uuid4

from langchain_core.documents import Document

document_1 = Document(
    page_content="I had chocolate chip pancakes and scrambled eggs for breakfast this morning.",
    metadata={"source": "tweet"},
    id=1,
)

document_2 = Document(
    page_content="The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.",
    metadata={"source": "news"},
    id=2,
)

document_3 = Document(
    page_content="Building an exciting new project with LangChain - come check it out!",
    metadata={"source": "tweet"},
    id=3,
)

document_4 = Document(
    page_content="Robbers broke into the city bank and stole $1 million in cash.",
    metadata={"source": "news"},
    id=4,
)

document_5 = Document(
    page_content="Wow! That was an amazing movie. I can't wait to see it again.",
    metadata={"source": "tweet"},
    id=5,
)

document_6 = Document(
    page_content="Is the new iPhone worth the price? Read this review to find out.",
    metadata={"source": "website"},
    id=6,
)

document_7 = Document(
    page_content="The top 10 soccer players in the world right now.",
    metadata={"source": "website"},
    id=7,
)

document_8 = Document(
    page_content="LangGraph is the best framework for building stateful, agentic applications!",
    metadata={"source": "tweet"},
    id=8,
)

document_9 = Document(
    page_content="The stock market is down 500 points today due to fears of a recession.",
    metadata={"source": "news"},
    id=9,
)

document_10 = Document(
    page_content="I have a bad feeling I am going to get deleted :(",
    metadata={"source": "tweet"},
    id=10,
)

documents = [
    document_1,
    document_2,
    document_3,
    document_4,
    document_5,
    document_6,
    document_7,
    document_8,
    document_9,
    document_10,
]
uuids = [str(uuid4()) for _ in range(len(documents))]

vector_store.add_documents(documents=documents, ids=uuids)

['19cc65c7-bdad-44fd-8012-35f920e693fe',
 'c6a547e1-833f-4b2d-ad16-69079886464b',
 'd2db9277-98f6-4317-ae8f-a24afa62976c',
 'b0410821-0e3a-47e0-8d4a-59c5d6025440',
 'a59cd9c6-48a6-40ca-a190-2efabbe41c79',
 'da0268aa-431d-4e64-a500-11cea860c6e0',
 '6384556c-1d8f-400e-af4f-932457205c6e',
 'd95a31ac-d5fc-4bbb-ab25-59b9b414a948',
 '9922fb95-702c-4172-9300-f41db3270677',
 '99895a8e-57f3-439f-8261-6f560e37b2bc']

使用相似性分数检索

In [6]:
results = vector_store.similarity_search_with_score(
    "Will it be hot tomorrow?", k=1, filter={"source": "news"}
)
for res, score in results:
    print(f"* [SIM={score:3f}] {res.page_content} [{res.metadata}]")

* [SIM=0.837352] Robbers broke into the city bank and stole $1 million in cash. [{'source': 'news'}]


#### 为文档添加分数
****
通过一个自定义链，可以为原始文档增加相关性评分

In [7]:
from typing import List

from langchain_core.documents import Document
from langchain_core.runnables import chain


@chain
def retriever(query: str) -> List[Document]:
    docs, scores = zip(*vector_store.similarity_search_with_score(query))
    for doc, score in zip(docs, scores):
        doc.metadata["score"] = score

    return docs

In [8]:
result = retriever.invoke("Robbers")
result

(Document(id='19cc65c7-bdad-44fd-8012-35f920e693fe', metadata={'source': 'tweet', 'score': 0.7794285171572265}, page_content='I had chocolate chip pancakes and scrambled eggs for breakfast this morning.'),
 Document(id='c92c7a2d-2c76-4064-a5df-bfcb340990aa', metadata={'source': 'tweet', 'score': 0.779443091410297}, page_content='I had chocolate chip pancakes and scrambled eggs for breakfast this morning.'),
 Document(id='99895a8e-57f3-439f-8261-6f560e37b2bc', metadata={'source': 'tweet', 'score': 0.8598275718992796}, page_content='I have a bad feeling I am going to get deleted :('),
 Document(id='a9734ba5-6331-43d0-886f-1c381cbfa123', metadata={'source': 'tweet', 'score': 0.8601369578337088}, page_content='I have a bad feeling I am going to get deleted :('))