In [19]:
from pymilvus import MilvusClient
from pymilvus import connections, MilvusClient, DataType
from pymilvus import AnnSearchRequest

# 云端向量数据库
CLUSTER_ENDPOINT = "https://in03-5cb3b56f3af9ebc.serverless.ali-cn-hangzhou.cloud.zilliz.com.cn"
TOKEN = "9027d285f74e5ce113bf24162fc5cabe04b67db3ee25055f4748ea23785f00d0fa9b8217c108a04dc77c4a703b5860a7d39d7a7b"

client = MilvusClient(
    uri=CLUSTER_ENDPOINT,
    token=TOKEN 
)

In [111]:
client.list_collections()

['rag_demo']

In [None]:
client.drop_collection(collection_name="rag_demo")

## 基础案例

In [3]:
schema = MilvusClient.create_schema(
    auto_id=True,
    enable_dynamic_field=True,
)
schema.add_field(field_name="id", datatype=DataType.INT64, is_primary=True)
schema.add_field(field_name="pdf_title", datatype=DataType.VARCHAR, max_length=128) # pdf的标题
schema.add_field(field_name="page_idx", datatype=DataType.INT16) # pdf页码
schema.add_field(field_name="vector", datatype=DataType.FLOAT_VECTOR, dim=512) # pdf语义特征向量

index_params = MilvusClient.prepare_index_params()
index_params.add_index(
    field_name="vector",
    metric_type="COSINE",
    index_type="AUTOINDEX",
    index_name="vector_index"
)

client.create_collection(
    collection_name="rag_demo", 
    schema=schema, 
)

['rag_demo']

In [13]:
for idx in range(100):
    res = client.insert(
        collection_name="rag_demo",
        data=[
            {
                "pdf_title": "test-" + str(idx),
                "page_idx": 0,
                "vector": [0] * 512
            }
        ]
    )

In [14]:
query_vector = [0] * 512

res = client.search(
    collection_name="rag_demo",
    data=[query_vector],
    limit=30,
    filter='pdf_title like "%0"',
    output_fields=["pdf_title"]

)
res

data: ["[{'id': 452810533752390429, 'distance': 0.0, 'entity': {'pdf_title': 'test-0'}}, {'id': 452810533752388407, 'distance': 0.0, 'entity': {'pdf_title': 'test-0'}}, {'id': 452810533752388387, 'distance': 0.0, 'entity': {'pdf_title': 'test-0'}}, {'id': 452810533752388367, 'distance': 0.0, 'entity': {'pdf_title': 'test-0'}}, {'id': 454417515764655219, 'distance': 0.0, 'entity': {'pdf_title': 'test-90'}}, {'id': 454417515763884124, 'distance': 0.0, 'entity': {'pdf_title': 'test-80'}}, {'id': 454417515763884102, 'distance': 0.0, 'entity': {'pdf_title': 'test-70'}}, {'id': 454417515763884082, 'distance': 0.0, 'entity': {'pdf_title': 'test-60'}}, {'id': 454417515764655112, 'distance': 0.0, 'entity': {'pdf_title': 'test-50'}}, {'id': 454417515764655090, 'distance': 0.0, 'entity': {'pdf_title': 'test-40'}}, {'id': 454417515764655070, 'distance': 0.0, 'entity': {'pdf_title': 'test-30'}}, {'id': 454417515764655048, 'distance': 0.0, 'entity': {'pdf_title': 'test-20'}}, {'id': 4544175157646550

In [11]:
res = client.get(
    collection_name="rag_demo",
    ids=[452810533752388407]
)
res

data: ["{'id': 452810533752388407, 'pdf_title': 'test-0', 'page_idx': 0, 'vector': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0

## 进阶案例

In [136]:
client.drop_collection(collection_name="rag_demo")

In [137]:
schema = MilvusClient.create_schema(
    auto_id=True,
    enable_dynamic_field=True,
)
schema.add_field(field_name="id", datatype=DataType.INT64, is_primary=True)
schema.add_field(field_name="page_idx", datatype=DataType.INT16) # pdf页码
schema.add_field(field_name="chunk_content", datatype=DataType.VARCHAR, max_length=1024)
schema.add_field(field_name="chunk_sparse_embedding", datatype=DataType.SPARSE_FLOAT_VECTOR)
schema.add_field(field_name="chunk_embedding", datatype=DataType.FLOAT_VECTOR, dim=512)

index_params = MilvusClient.prepare_index_params()
index_params.add_index(
    field_name="chunk_embedding",
    metric_type="COSINE",
    index_type="AUTOINDEX",
    index_name="vector_index"
)
index_params.add_index(
    field_name="chunk_sparse_embedding",
    index_name="sparse_inverted_index",
    index_type="SPARSE_INVERTED_INDEX",
    metric_type="IP",
    params={"drop_ratio_build": 0.2},
)


client.create_collection(
    collection_name="rag_demo", 
    schema=schema, 
    index_params=index_params
)

In [138]:
from pymilvus.model.sparse.bm25.tokenizers import build_default_analyzer
from pymilvus.model.sparse import BM25EmbeddingFunction
 
# 有一些内置的分析器适用于几种语言，现在我们使用英语'en'。
analyzer = build_default_analyzer(language="en")
 
corpus = [
    "Artificial intelligence was founded as an academic discipline in 1956.",
    "Alan Turing was the first person to conduct substantial research in AI.",
    "Born in Maida Vale, London, Turing was raised in southern England.",
]
 
# 分析器可以将文本分词为标记
tokens = analyzer(corpus[0])
print("tokens:", tokens)

tokens: ['artifici', 'intellig', 'found', 'academ', 'disciplin', '1956']


In [139]:
docs = [
    "机器学习正在改变我们的生活方式。",
    "深度学习在图像识别中表现出色。",
    "自然语言处理是计算机科学的重要领域。",
    "自动驾驶依赖于先进的算法。",
    "AI可以帮助医生诊断疾病。",
    "金融领域广泛应用数据分析技术。",
    "生产效率可以通过自动化技术提高。",
    "机器智能的未来充满潜力。",
    "大数据支持是机器智能发展的关键。",
    "量子隧穿效应使得电子能够穿过经典力学认为无法穿过的势垒，这在半导体器件中有着重要的应用。"
]

# 使用支持中文的分析器
analyzer = build_default_analyzer(language="zh")

# 分析器对文本做分词处理
tokens1 = analyzer(docs[0])
print(tokens1)

['机器', '学习', '改变', '生活', '方式']


In [140]:
# 创建BM25EmbeddingFunction实例，传入分词器，以及其他参数
bm25_ef = BM25EmbeddingFunction(analyzer)

# 计算文档集合的参数
bm25_ef.fit(docs)

In [141]:
result = bm25_ef.encode_queries(["机器学习是未来的趋势"]).reshape(1, -1)

In [142]:
{index: value for index, value in zip(result.indices, result.data)}

{0: 0.76214004, 1: 1.2237754, 30: 1.8458267}

In [143]:
for doc in docs:
    result = bm25_ef.encode_queries([doc]).reshape(1, -1)
    res = client.insert(
        collection_name="rag_demo",
        data=[
            {
                "chunk_content": doc,
                "chunk_sparse_embedding": {index: value for index, value in zip(result.indices, result.data)},
                "page_idx": 0,
                "chunk_embedding": [0.1] * 512
            }
        ]
    )

In [144]:
from pymilvus import (
    utility,
    FieldSchema, CollectionSchema, DataType,
    Collection, AnnSearchRequest, RRFRanker, connections,
)

In [148]:
result = bm25_ef.encode_queries(['机器学习']).reshape(1, -1)

sparse_search_params = {"metric_type": "IP"}
sparse_req = AnnSearchRequest([{index: float(value) for index, value in zip(result.indices, result.data)}],
                              "chunk_sparse_embedding", sparse_search_params, limit=2)

dense_search_params = {"metric_type": "COSINE"}
dense_req = AnnSearchRequest([[0.0] * 512],
                             "chunk_embedding", dense_search_params, limit=2)

reqs = [sparse_req, dense_req]

In [149]:
from pymilvus import WeightedRanker

rerank = WeightedRanker(0.8, 0.2)  

In [151]:
result = client.hybrid_search("rag_demo", reqs, ranker=rerank, limit=2)

In [152]:
result

data: ["[{'id': 454417515799994115, 'distance': 0.7858083248138428, 'entity': {}}, {'id': 454417515799994117, 'distance': 0.7500801682472229, 'entity': {}}]"] , extra_info: {'cost': 6}

In [153]:
res = client.get(
    collection_name="rag_demo",
    ids=[454417515799994115]
)
res

data: ["{'chunk_sparse_embedding': {0: 0.7621400356292725, 1: 1.2237753868103027, 2: 1.8458267450332642, 3: 1.8458267450332642, 4: 1.8458267450332642}, 'chunk_embedding': [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1

## RAG案例

In [215]:
import pandas as pd
from tqdm import tqdm
news = pd.read_csv("http://mirror.coggle.club/news-title.txt", sep='\t', header=None)[0].drop_duplicates().values

In [216]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("BAAI/bge-small-zh-v1.5")

In [217]:
client.drop_collection(collection_name="rag_demo")

schema = MilvusClient.create_schema(
    auto_id=True,
    enable_dynamic_field=True,
)
schema.add_field(field_name="id", datatype=DataType.INT64, is_primary=True)
schema.add_field(field_name="chunk_content", datatype=DataType.VARCHAR, max_length=1024)
schema.add_field(field_name="chunk_sparse_embedding", datatype=DataType.SPARSE_FLOAT_VECTOR)
schema.add_field(field_name="chunk_embedding", datatype=DataType.FLOAT_VECTOR, dim=512)

index_params = MilvusClient.prepare_index_params()
index_params.add_index(
    field_name="chunk_embedding",
    metric_type="COSINE",
    index_type="AUTOINDEX",
    index_name="vector_index"
)
index_params.add_index(
    field_name="chunk_sparse_embedding",
    index_name="sparse_inverted_index",
    index_type="SPARSE_INVERTED_INDEX",
    metric_type="IP",
    params={"drop_ratio_build": 0.2},
)


client.create_collection(
    collection_name="rag_demo", 
    schema=schema, 
    index_params=index_params
)

In [218]:
from pymilvus.model.sparse.bm25.tokenizers import build_default_analyzer
from pymilvus.model.sparse import BM25EmbeddingFunction
 
analyzer = build_default_analyzer(language="zh") 
bm25_ef = BM25EmbeddingFunction(analyzer)
bm25_ef.fit(news)

In [219]:
for title in tqdm(news):
    encode = model.encode(title)
    result = bm25_ef.encode_queries([doc]).reshape(1, -1)
    
    res = client.insert(
        collection_name="rag_demo",
        data=[
            {
                "chunk_content": title,
                "chunk_sparse_embedding": {index: value for index, value in zip(result.indices, result.data)},
                "chunk_embedding": encode
            }
        ]
    )

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 489/489 [01:58<00:00,  4.14it/s]


In [220]:
from pymilvus import (
    utility,
    FieldSchema, CollectionSchema, DataType,
    Collection, AnnSearchRequest, RRFRanker, connections,
)
from pymilvus import WeightedRanker

query = "最近北京相关的新闻"
result = bm25_ef.encode_queries([query]).reshape(1, -1)

sparse_search_params = {"metric_type": "IP"}
sparse_req = AnnSearchRequest([{index: float(value) for index, value in zip(result.indices, result.data)}],
                              "chunk_sparse_embedding", sparse_search_params, limit=10)

dense_search_params = {"metric_type": "COSINE"}
dense_req = AnnSearchRequest([list(model.encode(query))],
                             "chunk_embedding", dense_search_params, limit=10)

reqs = [sparse_req, dense_req]

rerank = WeightedRanker(0.8, 0.2)  
result = client.hybrid_search("rag_demo", reqs, ranker=rerank, limit=10, output_fields=["chunk_content"])

In [221]:
related_news = "\n".join([x["entity"]["chunk_content"] for x in result[0]])
prompt = f"""请对用户的提问进行回答：{query}

相关资料：{related_news}
"""

In [222]:
print(prompt)

请对用户的提问进行回答：最近北京相关的新闻

相关资料：网友晚间继续供稿 北京夜空被盛大绚丽烟火照亮
百盛在京最后门店将休整 外资第一店不敌新商业
每日易乐:窗外飞过的那辆车 速度太快竟没认出来
马云最后一天上班看望老同事：人气太旺 堪比追星现
中国国家博物馆10月3日至7日对社会正常开放
每日易乐:看什么看 没见过如此炫酷的二八大杠么
每日易乐:预感会有一大波人因错过下车而迟到
香港运输局:港铁出轨事件非常严重 不排除任何可能
牛津、剑桥大学贫富生录取人数相差14倍
被曝出大尺度聊天音频 张天回应：不知情时被录 

