# 安装 DashScope 和 DashVector Python SDK

In [1]:
!pip3 install dashvector dashscope

Collecting dashvector
  Downloading dashvector-1.0.1-py3-none-any.whl (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.3/91.3 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dashscope
  Downloading dashscope-1.7.2-py3-none-any.whl (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dashvector, dashscope
Successfully installed dashscope-1.7.2 dashvector-1.0.1


# 下载数据

In [1]:
!git clone https://github.com/CLUEbenchmark/QBQTC.git
!wc -l QBQTC/dataset/train.json

Cloning into 'QBQTC'...
remote: Enumerating objects: 228, done.[K
remote: Counting objects: 100% (228/228), done.[K
remote: Compressing objects: 100% (157/157), done.[K
remote: Total 228 (delta 93), reused 180 (delta 58), pack-reused 0[Kobjects:  34% (79/228), 9.84 MiB | 2.82 MiB/sReceiving objects:  36% (83/228), 9.84 MiB | 2.82 MiB/sReceiving objects:  49% (112/228), 9.84 MiB | 2.82 MiB/s
Receiving objects: 100% (228/228), 10.80 MiB | 2.88 MiB/s, done.
Resolving deltas: 100% (93/93), done.
  180000 QBQTC/dataset/train.json


# 利用DashScope Text-Embedding API生成向量

In [None]:
import json

def prepare_data(path, batch_size=25):
    """Lazily read a JSON-lines file and yield its records in batches.

    Args:
        path: Path to a UTF-8 text file holding one JSON document per line.
        batch_size: Number of parsed documents per yielded batch.

    Yields:
        Lists of parsed JSON documents. Every batch holds ``batch_size``
        documents except possibly the last one, which holds the remainder.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as f:
        batch_docs = []
        for line in f:
            line = line.strip()
            # Blank lines (e.g. stray blank lines at the end of the file)
            # carry no record; skipping them avoids a JSONDecodeError.
            if not line:
                continue
            batch_docs.append(json.loads(line))
            if len(batch_docs) == batch_size:
                yield batch_docs
                batch_docs = []

        # Flush the final, possibly shorter, batch.
        if batch_docs:
            yield batch_docs

import os

import dashscope
from dashscope import TextEmbedding

# [Note: get your DashScope API key here first: https://dashscope.console.aliyun.com/apiKey]
# Prefer the DASHSCOPE_API_KEY environment variable so the real key never has to
# be saved inside the notebook; the placeholder is kept as the fallback value.
dashscope.api_key = os.environ.get('DASHSCOPE_API_KEY', 'YOUR-DASHSCOPE-API-KEY')

def generate_embedding(texts):
    """Embed text with the DashScope ``text_embedding_v1`` model.

    Args:
        texts: A single string, or a list of strings embedded in one request.

    Returns:
        When ``texts`` is a list: a list with one embedding vector per record
        in the response (an empty list for an empty input list).
        Otherwise: the embedding vector of the single input.

    Raises:
        RuntimeError: If the DashScope request did not succeed.
    """
    from http import HTTPStatus

    # Nothing to embed; avoid a pointless remote call.
    if isinstance(texts, list) and not texts:
        return []

    # batch embedding
    rsp = TextEmbedding.call(model=TextEmbedding.Models.text_embedding_v1,
                             input=texts)
    # A failed request has no usable output; report the service error instead
    # of failing later with an opaque TypeError while indexing the output.
    if rsp.status_code != HTTPStatus.OK or not rsp.output:
        raise RuntimeError(
            'DashScope TextEmbedding request failed: '
            f'status_code={rsp.status_code}, '
            f"code={getattr(rsp, 'code', None)}, "
            f"message={getattr(rsp, 'message', None)}"
        )
    embeddings = [record['embedding'] for record in rsp.output['embeddings']]
    return embeddings if isinstance(texts, list) else embeddings[0]

# Check the dimensionality of the embedding vector; it is needed later when
# creating the DashVector collection (currently 1536).
print(len(generate_embedding('hello')))

# 向量入库

In [16]:
import os
from itertools import islice

from dashvector import Client, Doc

# Initialize the DashVector client.
# [Note: get your DashVector API key here first: https://dashvector.console.aliyun.com/cn-hangzhou/api-key]
# Prefer the DASHVECTOR_API_KEY environment variable so the real key never has
# to be saved inside the notebook; the placeholder is kept as the fallback value.
client = Client(api_key=os.environ.get('DASHVECTOR_API_KEY', 'YOUR-DASHVECTOR-API-KEY'))

# Collection name and vector dimension.
# The DashScope text_embedding_v1 model outputs vectors of fixed dimension 1536.
dimension = 1536
client.create('sample', dimension)

# Write the data.
collection = client.get('sample')
# islice consumes only the first 10 batches from the lazy generator, instead of
# parsing the entire training file into memory just to keep 10 batches.
for docs in islice(prepare_data('QBQTC/dataset/train.json'), 10):
    embeddings = generate_embedding([doc['title'] for doc in docs])

    # One Doc per record: id and title come from the record, the vector from DashScope.
    rsp = collection.insert(
        [
            Doc(id=str(doc['id']), vector=embedding, fields={"title": doc['title']})
            for doc, embedding in zip(docs, embeddings)
        ]
    )
    # Stop immediately if an insert request did not succeed.
    assert rsp

# 基于向量查询的语义搜索

In [18]:
# Semantic search based on vector retrieval.
collection = client.get('sample')
query = '应届生 招聘'
# Single source of truth for the result count: used by the query and the header.
topk = 5
rsp = collection.query(generate_embedding(query), topk=topk, output_fields=['title'])
# Fail loudly on an unsuccessful query, matching the check used for inserts.
assert rsp
print(f'与查询[{query}] 语义相近的数据(top {topk})：')
for doc in rsp:
    print(f"id: {doc.id}, title: {doc.fields['title']}, score: {doc.score}")

与查询[应届生 招聘] 语义相近的数据(top 5)：
id: 0, title: 实习生招聘-应届生求职网, score: 0.2136
id: 25, title: 中国银行2016年校园招聘, score: 0.4792
id: 233, title: 广西环球集团2012招聘广西校园招聘, score: 0.662
id: 107, title: 2013上半年九江事业单位批入围入围面试名单九江中公教育, score: 0.665
id: 104, title: 创意招聘海报-足彩310, score: 0.6678
