## 安装 DashScope 和 DashVector Python SDK

In [2]:
# install required packages
!pip install dashvector dashscope



## 下载数据

In [4]:
# prepare news corpus as knowledge source
!git clone https://github.com/shijiebei2009/CEC-Corpus.git

Cloning into 'CEC-Corpus'...
remote: Enumerating objects: 1278, done.[K
remote: Total 1278 (delta 0), reused 0 (delta 0), pack-reused 1278[K
Receiving objects: 100% (1278/1278), 1.19 MiB | 4.10 MiB/s, done.
Resolving deltas: 100% (371/371), done.


## 配置DashScope API key以及初始化向量数据库DashVector

In [3]:
import dashscope
import os
from dashscope import TextEmbedding
from dashvector import Client, Doc

# Credentials are read from environment variables so that real keys never have
# to be written into (and accidentally shared with) the notebook. The original
# placeholders are kept as fallbacks and can still be edited in place.
# [Note: get your DashScope API key here first: https://dashscope.console.aliyun.com/apiKey]
dashscope.api_key = os.environ.get('DASHSCOPE_API_KEY', 'YOUR-DASHSCOPE-API-KEY')


# initialize the DashVector client
# [Note: get your DashVector API key here first: https://dashvector.console.aliyun.com/cn-hangzhou/api-key]
dashvector_client = Client(
    api_key=os.environ.get('DASHVECTOR_API_KEY', 'YOUR-DASHVECTOR-API-KEY')
)

# define collection name
collection_name = 'news_embeddings'

# delete if already exist, so every run starts from an empty collection
# (the response of this call is intentionally ignored)
dashvector_client.delete(collection_name)

# create a collection with embedding size of 1536
collection_dimension = 1536
rsp = dashvector_client.create(collection_name, collection_dimension)
# DashVector responses printed in this notebook report code 0 on success;
# stop here instead of failing later with a less obvious error.
if rsp.code != 0:
    raise RuntimeError(f'Failed to create collection {collection_name!r}: {rsp}')
collection = dashvector_client.get(collection_name)


## 数据读取

In [5]:
def prepare_data_from_dir(path, size):
    """Yield the contents of the files in a directory, grouped into batches.

    Every regular file directly inside ``path`` is read as UTF-8 text and
    becomes one document. Documents are yielded as lists of at most ``size``
    items so they can be embedded and upserted to DashVector batch by batch.

    Args:
        path: Directory that contains the text files.
        size: Maximum number of documents per yielded batch.

    Yields:
        list[str]: The next batch of document texts; the last batch may be
        shorter than ``size``.
    """
    batch_docs = []
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # batches (and any ids derived from document position) reproducible.
    for file in sorted(os.listdir(path)):
        file_path = os.path.join(path, file)
        # Skip sub-directories and other non-regular entries: open() would fail.
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r', encoding='utf-8') as f:
            batch_docs.append(f.read())
        # Yield outside the `with` so no file stays open while the generator
        # is suspended.
        if len(batch_docs) == size:
            yield batch_docs
            batch_docs = []

    if batch_docs:
        yield batch_docs

In [6]:
def prepare_data_from_file(path, size, chunk_size=12):
    """Split one text file into chunks of lines and yield them in batches.

    Blank (whitespace-only) lines are skipped. Every ``chunk_size``
    consecutive non-blank lines are concatenated into one document, and
    documents are yielded as lists of at most ``size`` items so they can be
    upserted to DashVector batch by batch. The lines left over at the end of
    the file form a final, shorter document instead of being discarded.

    Args:
        path: Path of the UTF-8 text file to read.
        size: Maximum number of documents per yielded batch.
        chunk_size: Number of non-blank lines per document (default 12).

    Yields:
        list[str]: The next batch of documents; the last batch may be shorter
        than ``size``.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1 (raised when the
            generator is first advanced).
    """
    if chunk_size < 1:
        raise ValueError('chunk_size must be at least 1')

    batch_docs = []
    chunk_lines = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip() == '':
                continue
            chunk_lines.append(line)
            if len(chunk_lines) == chunk_size:
                batch_docs.append(''.join(chunk_lines))
                chunk_lines = []
                if len(batch_docs) == size:
                    yield batch_docs
                    batch_docs = []

    # Keep the trailing partial chunk: dropping it would silently lose the
    # end of every file whose line count is not a multiple of chunk_size.
    if chunk_lines:
        batch_docs.append(''.join(chunk_lines))

    if batch_docs:
        yield batch_docs

## 利用Embedding API生成文本向量

In [7]:
def generate_embeddings(docs):
    """Create embeddings via DashScope's TextEmbedding model API.

    Args:
        docs: Either a single text or a list of texts to embed.

    Returns:
        A list with one embedding per input when ``docs`` is a list,
        otherwise the single embedding of the given text.

    Raises:
        RuntimeError: If the embedding request does not return HTTP 200.
    """
    # Imported locally so this function does not depend on another cell.
    from http import HTTPStatus

    rsp = TextEmbedding.call(model=TextEmbedding.Models.text_embedding_v1,
                             input=docs)
    # Report a failed request (bad API key, quota, oversized batch, ...)
    # explicitly instead of failing below when indexing into rsp.output.
    if rsp.status_code != HTTPStatus.OK:
        raise RuntimeError(
            f'TextEmbedding request failed: status={rsp.status_code}, '
            f'code={rsp.code}, message={rsp.message}'
        )
    embeddings = [record['embedding'] for record in rsp.output['embeddings']]
    return embeddings if isinstance(docs, list) else embeddings[0]

## 向量入库（使用 batch 方式批量写入 DashVector）

In [8]:
# create embeddings and insert them into DashVector.
# Note: this may take a while to run.

dir_name = 'CEC-Corpus/raw corpus/allSourceText'

# indexing the raw docs with index to DashVector
collection = dashvector_client.get(collection_name)

# choose batch size of 20
batch_size = 20

# Running document counter used as the DashVector doc id. It is deliberately
# not called `id`, which would shadow the builtin for the rest of the session.
next_doc_id = 0

# Consume the generator lazily: each batch is embedded and upserted before the
# next files are read, instead of loading the whole corpus into memory first.
for news in prepare_data_from_dir(dir_name, batch_size):
    # generate embedding from raw docs
    vectors = generate_embeddings(news)
    # upsert and index
    ret = collection.upsert(
        [
            Doc(id=str(doc_id), vector=vector, fields={"raw": doc})
            for doc_id, (doc, vector) in enumerate(zip(news, vectors),
                                                   start=next_doc_id)
        ]
    )
    next_doc_id += len(news)
    print(ret)


{"code": 0, "message": "Success", "requests_id": "30471040-aebb-49ee-a77f-ca833975e5a0"}
{"code": 0, "message": "Success", "requests_id": "166094f7-70fd-4bf9-8eac-cd575db068ec"}
{"code": 0, "message": "Success", "requests_id": "b625db59-c937-4bad-897f-c5e33a1cdd86"}
{"code": 0, "message": "Success", "requests_id": "b984aa46-625d-42f9-b649-1eb3bfa938d5"}
{"code": 0, "message": "Success", "requests_id": "693c28c3-924a-42a6-8ce4-2b1911e86e9c"}
{"code": 0, "message": "Success", "requests_id": "1de5b46b-dad4-4859-ab0f-5090281cc709"}
{"code": 0, "message": "Success", "requests_id": "fa3e95f9-a5fc-4aae-a2b0-cff157c3b7df"}
{"code": 0, "message": "Success", "requests_id": "b3a6d8a1-db05-4b0a-8312-80d836ff0429"}
{"code": 0, "message": "Success", "requests_id": "e5748bc4-ec93-4e6c-a99f-c7cb2b8ea54f"}
{"code": 0, "message": "Success", "requests_id": "ad203598-f416-4a93-a7bd-4c38cfe37bc2"}
{"code": 0, "message": "Success", "requests_id": "477fd992-5c7d-4107-a5c2-7b0c874719f7"}
{"code": 0, "message"

In [9]:
# check the collection status
# Fetch the collection handle again and print its stats response; the captured
# output of this cell shows fields such as total_doc_count and
# index_completeness, which confirm that the upserted docs were indexed.
collection = dashvector_client.get(collection_name)
rsp = collection.stats()
print(rsp)

{"code": 0, "message": "Success", "requests_id": "24d49ebf-179f-44d7-97ea-ba93791b3f83", "output": {"total_doc_count": 332, "index_completeness": 1.0, "partitions": {"default": {"total_doc_count": 332}}}}


## 向量检索(知识库检索)

In [10]:
def search_relevant_context(question, topk=1, client=dashvector_client):
    """Recall the stored documents most similar to a question.

    The question is embedded with ``generate_embeddings`` and used to query
    the collection named ``collection_name``.

    Args:
        question: The question text to search with.
        topk: Number of most similar documents to recall.
        client: DashVector client used to access the collection.

    Returns:
        str: The ``raw`` field of the recalled documents, concatenated
        without a separator.

    Raises:
        RuntimeError: If the query response reports a non-zero code.
    """
    # query and recall the relevant information
    collection = client.get(collection_name)

    # recall the top k similarity results from DashVector
    rsp = collection.query(generate_embeddings(question), output_fields=['raw'],
                           topk=topk)
    # DashVector responses printed in this notebook report code 0 on success;
    # surface a failed query explicitly rather than iterating over its output.
    if rsp.code != 0:
        raise RuntimeError(f'DashVector query failed: {rsp}')
    return "".join([item.fields['raw'] for item in rsp.output])

## 基于LLM的问答

In [11]:
import dashscope
import textwrap
from dashscope import Generation

# define a prompt template for the vectorDB-enhanced LLM generation
def answer_question(model_name, question, context):
    """Ask a DashScope generation model a question grounded in a context.

    The context is placed between triple-backtick fences in the prompt and the
    model is instructed (in Chinese) to answer based on that content.

    Args:
        model_name: Name of the DashScope generation model to call.
        question: The user's question.
        context: Reference text the answer should be based on; may be empty.

    Returns:
        The ``text`` entry of the model response output.

    Raises:
        RuntimeError: If the generation request does not return HTTP 200.
    """
    # Imported locally so this function does not depend on another cell.
    from http import HTTPStatus

    # The first template line used to end with a stray '"' (a typo that was
    # sent to the model as part of the instruction); it has been removed.
    prompt = f'''请基于```内的内容回答问题。
	```
	{context}
	```
	我的问题是：{question}。
    '''
    response = Generation.call(
      model= model_name,
      prompt=prompt
    )
    # Report a failed request explicitly instead of failing on response.output.
    if response.status_code != HTTPStatus.OK:
        raise RuntimeError(
            f'Generation request failed: status={response.status_code}, '
            f'code={response.code}, message={response.message}'
        )
    return response.output['text']

## 原生LLM问题问答（无知识库检索增强）

In [12]:
# Baseline run: the question goes to the LLM with an empty context, i.e.
# without any knowledge recalled from the vector database.
model_name = 'qwen-7b-chat-v1'
question = '海南安定追尾事故，发生在哪里？原因是什么？人员伤亡情况如何？'
answer = answer_question(model_name, question, '')
# Wrap the answer at 50 characters per line for readable notebook output.
print(f'问题: {question}\n回答: {textwrap.fill(answer, width=50)}')

问题: 海南安定追尾事故，发生在哪里？原因是什么？人员伤亡情况如何？
回答: 很抱歉，我无法提供关于该事故的最新信息。请您查阅可靠的新闻来源以获取最新信息。


## 基于知识库的LLM问题问答（利用DashVector的检索增强）

In [13]:
# Retrieval-augmented run: recall the two most similar documents from
# DashVector and hand them to the LLM as context for the same question.
context = search_relevant_context(question, topk=2)
answer = answer_question(model_name, question, context)
# Wrap the answer at 50 characters per line for readable notebook output.
print(f'问题: {question}\n回答: {textwrap.fill(answer, width=50)}')

问题: 海南安定追尾事故，发生在哪里？原因是什么？人员伤亡情况如何？
回答: 海南安定追尾事故发生在海南省定安县境内，环岛东线高速公路海口往三亚方向53公里处。原因是琼AB711
9小轿车驾驶人追尾所致。该事故造成小轿车人员5人当场死亡，其中一人为未成年人。
