##  安装DashScope和DashVector Python SDK

In [1]:
# install required packages
!pip install dashvector dashscope

Collecting dashvector
  Downloading dashvector-1.0.1-py3-none-any.whl (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.3/91.3 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dashscope
  Downloading dashscope-1.7.2-py3-none-any.whl (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dashvector, dashscope
Successfully installed dashscope-1.7.2 dashvector-1.0.1


## 下载数据

In [7]:
# Prepare the news corpus used as the knowledge source: clone the CEC-Corpus
# repository into ./CEC-Corpus (needs `git` on PATH and network access).
!git clone https://github.com/shijiebei2009/CEC-Corpus.git

Cloning into 'CEC-Corpus'...
remote: Enumerating objects: 1278, done.[K
remote: Total 1278 (delta 0), reused 0 (delta 0), pack-reused 1278[K
Receiving objects: 100% (1278/1278), 1.19 MiB | 3.35 MiB/s, done.
Resolving deltas: 100% (371/371), done.


## 配置DashScope API key以及初始化向量数据库DashVector

In [2]:
import dashscope
import os
from dashscope import TextEmbedding
from dashvector import Client, Doc

# Credentials are read from environment variables so that real keys never have
# to be typed into (and saved with) the notebook. The original placeholders are
# kept as fallbacks, so editing them in place still works.
# [Note: get your DashScope API key here first: https://dashscope.console.aliyun.com/apiKey]
dashscope.api_key = os.environ.get('DASHSCOPE_API_KEY', 'YOUR-DASHSCOPE-API-KEY')


# Initialise the DashVector client
# [Note: get your DashVector API key here first: https://dashvector.console.aliyun.com/cn-hangzhou/api-key]
dashvector_client = Client(
    api_key=os.environ.get('DASHVECTOR_API_KEY', 'YOUR-DASHVECTOR-API-KEY')
)


# define collection name
collection_name = 'news_embeddings'

# Delete the collection if it already exists so the notebook can be re-run.
# The response is intentionally not checked: on a first run there is nothing
# to delete.
dashvector_client.delete(collection_name)

# Create a collection with embedding size of 1536; this must equal the length
# of the vectors that are later upserted into it.
collection_dimension = 1536
rsp = dashvector_client.create(collection_name, collection_dimension)
if rsp.code != 0:
    # Fail here with the service's own message instead of failing later on a
    # collection handle that was never created.
    raise RuntimeError(
        f'Failed to create collection {collection_name!r}: '
        f'code={rsp.code}, message={rsp.message}'
    )
collection = dashvector_client.get(collection_name)


## 数据读取

In [3]:
def prepare_data_from_dir(path, size):
    """Yield the contents of the files in ``path`` in batches.

    Every regular file directly inside ``path`` is read as UTF-8 text and its
    whole content becomes one document. Documents are grouped into lists of
    ``size`` items so they can be upserted to DashVector in reasonably sized
    requests; the last batch may be shorter.

    Args:
        path: Directory that holds the documents.
        size: Number of documents per yielded batch.

    Yields:
        list[str]: The next batch of document texts.
    """
    batch_docs = []
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # document order (and therefore the ids assigned by the caller) reproducible.
    for file in sorted(os.listdir(path)):
        file_path = os.path.join(path, file)
        # Sub-directories and other non-file entries cannot be opened as text.
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r', encoding='utf-8') as f:
            batch_docs.append(f.read())
        # Yield only after the file has been closed so no handle stays open
        # while the generator is suspended.
        if len(batch_docs) == size:
            yield batch_docs
            batch_docs = []

    if batch_docs:
        yield batch_docs

In [4]:
def prepare_data_from_file(path, size, chunk_size=12):
    """Split one text file into documents and yield them in batches.

    The file is read as UTF-8. Blank lines are skipped; every ``chunk_size``
    consecutive non-blank lines are concatenated into one document. The
    remaining lines at the end of the file (fewer than ``chunk_size``) form a
    final, shorter document instead of being discarded.

    Args:
        path: Path of the text file to read.
        size: Number of documents per yielded batch.
        chunk_size: Number of non-blank lines per document (default 12).

    Yields:
        list[str]: The next batch of documents; the last batch may be shorter.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError('chunk_size must be at least 1')

    batch_docs = []
    doc = ''
    count = 0
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip() == '':
                continue
            doc += line
            count += 1
            if count == chunk_size:
                batch_docs.append(doc)
                doc = ''
                count = 0
                if len(batch_docs) == size:
                    yield batch_docs
                    batch_docs = []

    # Keep the trailing partial chunk: without this the tail of the file (or a
    # whole file shorter than chunk_size lines) would never be indexed.
    if doc:
        batch_docs.append(doc)

    if batch_docs:
        yield batch_docs

## 利用Embedding API生成文本向量

In [5]:
def generate_embeddings(docs):
    """Create embeddings with DashScope's TextEmbedding model API.

    Args:
        docs: Either a single text or a list of texts to embed.

    Returns:
        A list of embedding vectors when ``docs`` is a list, otherwise the
        single embedding vector for the given text.

    Raises:
        RuntimeError: If the DashScope request does not return HTTP 200.
    """
    from http import HTTPStatus  # local import: only this function needs it

    rsp = TextEmbedding.call(model=TextEmbedding.Models.text_embedding_v1,
                             input=docs)
    # A failed call would otherwise surface as an opaque TypeError when
    # indexing rsp.output below; report the service's error instead.
    if rsp.status_code != HTTPStatus.OK:
        raise RuntimeError(
            f'DashScope embedding request failed: '
            f'code={rsp.code}, message={rsp.message}'
        )
    embeddings = [record['embedding'] for record in rsp.output['embeddings']]
    return embeddings if isinstance(docs, list) else embeddings[0]

## 向量入库(使用batch 插入方式进入DashVector)

In [8]:
# create embeddings and insert them into DashVector.
# Note: this may take a while to run.

# Running counter for document ids. Deliberately not called `id`: rebinding the
# builtin at notebook scope would break any later `id(...)` call in the kernel.
next_id = 0
dir_name = 'CEC-Corpus/raw corpus/allSourceText'

# indexing the raw docs with index to DashVector
collection = dashvector_client.get(collection_name)

# choose batch size of 20
batch_size = 20

# Iterate the generator lazily so only one batch of documents is held in
# memory at a time (no need to materialise every batch up front).
for news in prepare_data_from_dir(dir_name, batch_size):
    ids = list(range(next_id, next_id + len(news)))
    next_id += len(news)
    # generate embedding from raw docs
    vectors = generate_embeddings(news)
    # upsert and index; the raw text is stored alongside the vector so it can
    # be returned by later queries
    ret = collection.upsert(
        [
            Doc(id=str(doc_id), vector=vector, fields={"raw": doc})
            for doc_id, doc, vector in zip(ids, news, vectors)
        ]
    )
    print(ret)


{"code": 0, "message": "Success", "requests_id": "2b7b915f-9f7a-48b4-93a3-33668fe13ff1"}
{"code": 0, "message": "Success", "requests_id": "e2243189-8bf3-4091-880d-407e1edde153"}
{"code": 0, "message": "Success", "requests_id": "75899f5e-7479-4030-b50a-c5600a38d090"}
{"code": 0, "message": "Success", "requests_id": "c2eef624-18be-44e1-ae14-80abf40c5c64"}
{"code": 0, "message": "Success", "requests_id": "9ce08195-6a42-4dc4-ade2-eebe63c9a63b"}
{"code": 0, "message": "Success", "requests_id": "0c07b224-29a6-48b4-b64d-d6fb22a4111a"}
{"code": 0, "message": "Success", "requests_id": "e503f192-9200-47a1-90ba-a408eacac732"}
{"code": 0, "message": "Success", "requests_id": "6a215d35-e0a3-49d6-ba06-9e99a5a2b084"}
{"code": 0, "message": "Success", "requests_id": "2af5cfcd-8f22-4120-b023-fb088b78de2e"}
{"code": 0, "message": "Success", "requests_id": "ca9420cc-24db-4140-95b9-923bb02905ee"}
{"code": 0, "message": "Success", "requests_id": "a364b991-101d-4d97-86aa-b3757c2564a1"}
{"code": 0, "message"

In [9]:
# Check the collection status: fetch the collection handle again by name and
# print the response of its stats() call.
collection = dashvector_client.get(collection_name)
rsp = collection.stats()
print(rsp)

{"code": 0, "message": "Success", "requests_id": "60322db2-259f-46ec-81de-de64f1e1e190", "output": {"total_doc_count": 332, "index_completeness": 1.0, "partitions": {"default": {"total_doc_count": 332}}}}


## 向量检索(知识库检索)

In [10]:
def search_relevant_context(question, topk=1, client=dashvector_client):
    """Recall the stored texts most similar to ``question`` from DashVector.

    Args:
        question: Text to embed and use as the query.
        topk: Number of nearest documents to recall.
        client: DashVector client to query; the default is bound to the
            ``dashvector_client`` that exists when this function is defined.

    Returns:
        The ``raw`` field of every recalled document, concatenated without a
        separator.
    """
    target_collection = client.get(collection_name)

    # Embed the question, then ask for the top-k nearest documents, requesting
    # only the stored raw text.
    hits = target_collection.query(
        generate_embeddings(question),
        output_fields=['raw'],
        topk=topk,
    )

    pieces = []
    for hit in hits.output:
        pieces.append(hit.fields['raw'])
    return "".join(pieces)

## 基于LLM的问答

In [34]:
import textwrap
from dashscope import Generation

# Adapt the prompt text to the conventions of the target LLM.
def format_prompt(model_name, prompt):
    """Return ``prompt`` in the form expected by ``model_name``.

    Models whose name starts with ``ziya`` get the prompt wrapped in
    ``<human>:``/``<bot>:`` markers; every other model gets it unchanged.
    """
    needs_chat_markers = model_name.startswith('ziya')
    return f'<human>:{prompt}\n<bot>:' if needs_chat_markers else prompt

# Build the initial conversation history for the target LLM.
def format_history(model_name):
    """Return the ``history`` value to pass along for ``model_name``.

    Models whose name starts with ``chatglm`` get a fresh empty list; every
    other model gets ``None``.
    """
    return [] if model_name.startswith('chatglm') else None

def format_answer(model_name, response):
    """Extract the answer text from a generation ``response``.

    For models whose name starts with ``chatglm`` the answer is read from
    ``response.output['text']['response']``; for all other models it is
    ``response.output['text']``. If extraction fails for any reason, the
    response is printed for inspection and the error is re-raised unchanged.
    """
    try:
        text = response.output['text']
        # special treatment for chatglm-family of models: the text is nested
        return text['response'] if model_name.startswith('chatglm') else text
    except BaseException:
        # Show the full response to make the failure diagnosable, then let
        # the original exception propagate.
        print(response)
        raise


# define a prompt template for the vectorDB-enhanced LLM generation
def answer_question(model_name, question, context):
    """Ask ``model_name`` to answer ``question`` based on ``context``.

    The context is placed between triple-backtick fences in the prompt,
    followed by the question. The prompt and history are adapted to the model
    family via ``format_prompt``/``format_history`` and the reply is unpacked
    with ``format_answer``.

    Args:
        model_name: Name of the DashScope generation model to call.
        question: The user's question.
        context: Reference text the model should base its answer on; may be
            an empty string.

    Returns:
        The answer extracted from the model response by ``format_answer``.
    """
    # The template is spelled out line by line so its exact whitespace is
    # visible. The stray, unbalanced double quote that used to follow the
    # first sentence has been removed; everything else is unchanged.
    prompt = (
        '请基于```内的内容回答问题。\n'
        '\t```\n'
        f'\t{context}\n'
        '\t```\n'
        f'\t我的问题是：{question}\n'
        '    '
    )
    formatted_prompt = format_prompt(model_name=model_name, prompt=prompt)
    history = format_history(model_name)

    response = Generation.call(
        model=model_name,
        prompt=formatted_prompt,
        history=history,
    )
    answer = format_answer(model_name, response)
    return answer

## 原生LLM问题问答（无知识库检索增强）

In [41]:
# test Q&A on plain LLM without vectorDB enhancement
# candidate LLMs on DashScope include:
# qwen-turbo, chatglm-6b-v2, ziya-llama-13b-v1, baichuan-7b-v1, and more...
# (the model name was previously misspelled as 'qwen-turo')
model_name = 'qwen-turbo'
question = '海南安定追尾事故，发生在哪里？原因是什么？人员伤亡情况如何？'
# An empty context means the model answers from its own knowledge only.
answer = answer_question(model_name, question, '')
print(f'问题: {question}\n' f'回答: {textwrap.fill(answer, width=50)}')

问题: 海南安定追尾事故，发生在哪里？原因是什么？人员伤亡情况如何？
回答: 很抱歉，我无法提供关于该事故的最新信息。请您查阅可靠的新闻来源以获取最新信息。


## 基于知识库的LLM问题问答（利用DashVector的检索增强）

In [42]:
# Retrieval-augmented run: recall the two most similar news items from
# DashVector and hand them to the same model as context for the same question.
context = search_relevant_context(question, topk=2)
answer = answer_question(model_name, question, context)
print('问题: {}\n回答: {}'.format(question, textwrap.fill(answer, width=50)))

问题: 海南安定追尾事故，发生在哪里？原因是什么？人员伤亡情况如何？
回答: 海南安定追尾事故发生在海南省定安县境内，环岛东线高速公路海口往三亚方向53公里处。原因是琼AB711
9小轿车驾驶人追尾所致。该事故造成小轿车人员5人当场死亡，其中一人为未成年人。
