#  安装DashScope和DashVector Python SDK

In [1]:
!pip3 install dashvector dashscope

Collecting dashvector
  Downloading dashvector-1.0.1-py3-none-any.whl (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.3/91.3 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dashscope
  Downloading dashscope-1.7.2-py3-none-any.whl (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dashvector, dashscope
Successfully installed dashscope-1.7.2 dashvector-1.0.1


# 下载数据

In [2]:
!git clone https://github.com/CLUEbenchmark/QBQTC.git
!wc -l QBQTC/dataset/train.json

Cloning into 'QBQTC'...
remote: Enumerating objects: 228, done.[K
remote: Counting objects: 100% (228/228), done.[K
remote: Compressing objects: 100% (157/157), done.[K
remote: Total 228 (delta 93), reused 180 (delta 58), pack-reused 0[K
Receiving objects: 100% (228/228), 10.80 MiB | 11.40 MiB/s, done.
Resolving deltas: 100% (93/93), done.
180000 QBQTC/dataset/train.json


# 利用DashScope Text-Embedding API生成向量

In [3]:
import json
import os
from http import HTTPStatus
from itertools import islice

def prepare_data(path):
    """Lazily yield one parsed JSON record per non-blank line of a JSON Lines file.

    Args:
        path: Path of a UTF-8 encoded text file holding one JSON document per line.

    Yields:
        The object decoded from each non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            # A blank line carries no record and json.loads would raise on it,
            # so skip it instead of aborting the whole read.
            if not line.strip():
                continue
            yield json.loads(line)

import dashscope
from dashscope import TextEmbedding

# [Note: get your DashScope API key here first: https://dashscope.console.aliyun.com/apiKey]
# Prefer the DASHSCOPE_API_KEY environment variable so the real key never has to be
# pasted into the notebook; the placeholder is only used when the variable is unset.
dashscope.api_key = os.environ.get('DASHSCOPE_API_KEY', 'YOUR-DASHSCOPE-API-KEY')

def generate_embedding(text):
    """Return the DashScope text_embedding_v1 embedding vector for ``text``.

    Args:
        text: The input passed to the embedding model.

    Returns:
        The ``embedding`` entry of the first element of the response's
        ``embeddings`` list.

    Raises:
        RuntimeError: If the service answers with a status other than HTTP 200
            (for example because the API key is missing or invalid).
    """
    # Note: for simplicity, embeddings are requested one at a time;
    # a more efficient implementation may use batch embedding instead.
    rsp = TextEmbedding.call(model=TextEmbedding.Models.text_embedding_v1,
                             input=text)
    # On failure the response carries no usable output, so indexing into it
    # would only produce an opaque TypeError; report the service error instead.
    if rsp.status_code != HTTPStatus.OK:
        raise RuntimeError(
            f'DashScope embedding request failed: status={rsp.status_code}, '
            f'code={rsp.code}, message={rsp.message}')
    return rsp.output['embeddings'][0]['embedding']

# Check the dimensionality of the embedding vector; it is needed later as the
# collection dimension for the DashVector retrieval service. Currently 1536.
print(len(generate_embedding('hello')))

1536


# 向量入库

In [4]:
from dashvector import Client, Doc

# Initialise the DashVector client.
# [Note: get your DashVector API key here first: https://dashvector.console.aliyun.com/cn-hangzhou/api-key]
# This is the DashVector key, not the DashScope one. Prefer the DASHVECTOR_API_KEY
# environment variable; the placeholder is only used when the variable is unset.
client = Client(api_key=os.environ.get('DASHVECTOR_API_KEY', 'YOUR-DASHVECTOR-API-KEY'))

# Choose the collection name and the vector dimension.
# The text_embedding_v1 vectors produced above have 1536 dimensions.
dimension = 1536
client.create('sample', dimension)

# Write the data.
collection = client.get('sample')
# islice streams records 1..199 from the generator instead of parsing the whole
# file into a list first; the selected records are the same as with [1:200].
# NOTE(review): the slice starts at 1, so the first record of the file is skipped
# and 199 documents are inserted -- confirm whether [0:200] was intended.
for doc in islice(prepare_data('QBQTC/dataset/train.json'), 1, 200):
    title = doc['title']
    # DashVector document ids are presumably strings (verify against the SDK
    # reference), so convert explicitly in case the dataset stores numeric ids.
    rsp = collection.insert(Doc(id=str(doc['id']), vector=generate_embedding(title),
                                fields={'title': title}))
    # Surface a failed write instead of silently continuing with a partial index.
    if not rsp:
        raise RuntimeError(f"DashVector insert failed for doc {doc['id']}: {rsp}")

# 基于向量查询的语义搜索

In [6]:
# Semantic search backed by vector retrieval: embed the query text and ask the
# collection for the nearest stored vectors, returning their 'title' field.
collection = client.get('sample')
query = '应届生 招聘'
rsp = collection.query(
    generate_embedding(query),
    topk=5,
    output_fields=['title'],
)
# Header line, followed by one line per returned document.
print('与查询[', query, '] 语义相近的数据(top 5)：', sep='')
for doc in rsp:
    print(f"id: {doc.id}, title: {doc.fields['title']}, score: {doc.score}")

与查询[应届生 招聘] 语义相近的数据(top 5)：
id: 0, title: 实习生招聘-应届生求职网, score: 2523.1582
id: 4412, title: 应届毕业生是什么意思和非应届生有什么不同高三网, score: 3843.3354
id: 12851, title: 实习总结范文-百度经验, score: 6554.0444
id: 25, title: 中国银行2016年校园招聘, score: 6891.6582
id: 8884, title: 中国电子工程设计院招聘信息招聘岗位最新职位信息-智联招聘官网, score: 6956.5229
