In [1]:
# Install the packages this notebook depends on. The first line installs the
# DashVector and DashScope SDKs that are imported below; the second installs
# transformers_stream_generator and python-dotenv.
# NOTE(review): versions are not pinned here, so a fresh run may install newer
# releases than the ones shown in the captured output.
!pip install dashvector dashscope
!pip install transformers_stream_generator python-dotenv

Collecting dashvector
  Downloading dashvector-1.0.1-py3-none-any.whl (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.3/91.3 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dashscope
  Downloading dashscope-1.7.2-py3-none-any.whl (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dashvector, dashscope
Successfully installed dashscope-1.7.2 dashvector-1.0.1
Collecting transformers_stream_generator
  Downloading transformers-stream-generator-0.0.4.tar.gz (12 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting python-dotenv
  Downloading python_dotenv-1.0.0-py3-none-any.whl (19 kB)
Collecting transformers>=4.26.1 (from transformers_stream_generator)
  Downloading transformers-4.32.1-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m56.1 MB/s[0m eta [36m0:00:00[0m


In [2]:
# Prepare the news corpus used as the knowledge source: clone the CEC-Corpus
# repository into ./CEC-Corpus. The indexing cell further down reads the raw
# text files from 'CEC-Corpus/raw corpus/allSourceText'.
!git clone https://github.com/shijiebei2009/CEC-Corpus.git

Cloning into 'CEC-Corpus'...
remote: Enumerating objects: 1278, done.[K
remote: Total 1278 (delta 0), reused 0 (delta 0), pack-reused 1278[K
Receiving objects: 100% (1278/1278), 1.19 MiB | 17.35 MiB/s, done.
Resolving deltas: 100% (371/371), done.


In [3]:
import dashscope
import os
from dashscope import TextEmbedding
from dashvector import Client, Doc

# Credentials are read from environment variables so that real keys never have
# to be saved inside the notebook. The placeholder strings are kept as the
# fallback, so pasting a key in place of the placeholder still works.
# [Note: get your DashScope API key here first: https://dashscope.console.aliyun.com/apiKey]
dashscope.api_key = os.environ.get('DASHSCOPE_API_KEY', 'YOUR-DASHSCOPE-API-KEY')

# Initialize the DashVector client
# [Note: get your DashVector API key here first: https://dashvector.console.aliyun.com/cn-hangzhou/api-key]
dashvector_client = Client(
    api_key=os.environ.get('DASHVECTOR_API_KEY', 'YOUR-DASHVECTOR-API-KEY')
)

# define collection name
collection_name = 'news_embeddings'

# delete the collection if it already exists, so re-running this cell starts clean
dashvector_client.delete(collection_name)

# create a collection with embedding size of 1536
rsp = dashvector_client.create(collection_name, 1536)
collection = dashvector_client.get(collection_name)


In [4]:
def prepare_data_from_dir(path, size):
    """Yield the text of every regular file in ``path`` in batches.

    Each yielded value is a list of up to ``size`` strings, one string per
    file (the whole file content, decoded as UTF-8), sized so a batch can be
    embedded and upserted to DashVector in one request.

    Args:
        path: Directory containing the documents.
        size: Maximum number of documents per yielded batch.

    Yields:
        list[str]: The next batch of file contents; the final batch may hold
        fewer than ``size`` items.
    """
    batch_docs = []
    # Sort the listing: os.listdir() returns entries in arbitrary order, which
    # would make the document ids assigned by the caller differ between runs.
    for file in sorted(os.listdir(path)):
        file_path = os.path.join(path, file)
        # Skip sub-directories and other non-regular entries; open() would fail on them.
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r', encoding='utf-8') as f:
            batch_docs.append(f.read())
        # Yield only after the file is closed so no handle is held while the
        # generator is suspended.
        if len(batch_docs) == size:
            yield batch_docs
            batch_docs = []

    if batch_docs:
        yield batch_docs

In [5]:
def prepare_data_from_file(path, size, chunk_size=12):
    """Split one text file into multi-line documents and yield them in batches.

    Blank (whitespace-only) lines are skipped. Every ``chunk_size`` consecutive
    non-blank lines are concatenated into one document. Lines left over at the
    end of the file form a final, shorter document instead of being dropped.

    Args:
        path: Path of the UTF-8 text file to read.
        size: Maximum number of documents per yielded batch.
        chunk_size: Number of non-blank lines per document (default 12).
            Expected to be a positive integer.

    Yields:
        list[str]: The next batch of documents; the final batch may hold fewer
        than ``size`` items.
    """
    batch_docs = []
    doc_lines = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            doc_lines.append(line)
            if len(doc_lines) == chunk_size:
                batch_docs.append(''.join(doc_lines))
                doc_lines = []
                if len(batch_docs) == size:
                    yield batch_docs
                    batch_docs = []

    # Flush the trailing partial chunk so the end of the file is not lost.
    if doc_lines:
        batch_docs.append(''.join(doc_lines))

    if batch_docs:
        yield batch_docs

In [6]:
def generate_embeddings(docs):
    """Create embeddings via DashScope's TextEmbedding model API.

    Args:
        docs: Either a single string or a list of strings to embed.

    Returns:
        A list of embedding vectors (one per input) when ``docs`` is a list,
        otherwise the single embedding vector for the given string.

    Raises:
        RuntimeError: If the DashScope request does not return HTTP 200.
    """
    # Imported locally so this cell does not depend on another cell's imports.
    from http import HTTPStatus

    rsp = TextEmbedding.call(model=TextEmbedding.Models.text_embedding_v1,
                             input=docs)
    # Fail with the service's own error details instead of an opaque TypeError
    # from subscripting an empty ``output``.
    if rsp.status_code != HTTPStatus.OK:
        raise RuntimeError(
            'DashScope embedding request failed: '
            f'status={rsp.status_code}, '
            f"code={getattr(rsp, 'code', None)}, "
            f"message={getattr(rsp, 'message', None)}"
        )
    embeddings = [record['embedding'] for record in rsp.output['embeddings']]
    return embeddings if isinstance(docs, list) else embeddings[0]

In [11]:
# create embeddings and insert them into DashVector.
# Note: this may take a while (up to 5 mins) to run.

# Running counter used to give every document a unique, sequential id.
# (Named so it does not shadow the builtin `id` for the rest of the notebook.)
next_doc_id = 0
dir_name = 'CEC-Corpus/raw corpus/allSourceText'

# indexing the raw docs with index to DashVector
collection = dashvector_client.get(collection_name)

batch_size = 20

# Consume the generator lazily: only one batch of documents is held in memory
# at a time instead of materialising the whole corpus first.
for news in prepare_data_from_dir(dir_name, batch_size):
    ids = [next_doc_id + offset for offset in range(len(news))]
    next_doc_id += len(news)
    # generate embedding from raw docs
    vectors = generate_embeddings(news)
    # upsert and index; the raw text is stored alongside the vector so it can
    # be returned at query time
    ret = collection.upsert(
        [
            Doc(id=str(doc_id), vector=vector, fields={"raw": doc})
            for doc_id, doc, vector in zip(ids, news, vectors)
        ]
    )
    print(ret)


{"code": 0, "message": "Success", "requests_id": "04376f49-e2bf-4630-ae93-9daf11e7486c"}
{"code": 0, "message": "Success", "requests_id": "58c25fb2-4efd-4cbe-8b44-26ba4cb7473e"}
{"code": 0, "message": "Success", "requests_id": "b7fed069-80d4-4e5c-a778-dd1c9b24c70b"}
{"code": 0, "message": "Success", "requests_id": "f0ffa55a-3f3f-4cae-8736-105a3e766444"}
{"code": 0, "message": "Success", "requests_id": "1023f054-edd8-4da0-b558-bffe51deac6d"}
{"code": 0, "message": "Success", "requests_id": "09d0c413-93c2-4b09-acf9-1108e00f4947"}
{"code": 0, "message": "Success", "requests_id": "cf494fac-a60a-467d-86cf-b6ca549ddbe0"}
{"code": 0, "message": "Success", "requests_id": "4fd92e67-7037-42da-8469-ec48c8371ba9"}
{"code": 0, "message": "Success", "requests_id": "8e54cd88-0aa8-464b-9d8d-2e042446df49"}
{"code": 0, "message": "Success", "requests_id": "8a9ba99a-db03-474c-9f4a-ecbc16f714d6"}
{"code": 0, "message": "Success", "requests_id": "5fdc22a8-84e1-4acc-93a7-f4dfc89c7318"}
{"code": 0, "message"

In [12]:
# Check the collection status: re-fetch the collection handle by name and print
# the response of its stats() call (the captured output below shows the total
# document count and index completeness after the upsert loop).
collection = dashvector_client.get(collection_name)
rsp = collection.stats()
print(rsp)

{"code": 0, "message": "Success", "requests_id": "c83917d4-1251-454e-9311-1077e12e1e6e", "output": {"total_doc_count": 332, "index_completeness": 1.0, "partitions": {"default": {"total_doc_count": 332}}}}


In [13]:
def search_relevant_context(question, topk=1, client=dashvector_client):
    """Recall the stored raw text most similar to ``question``.

    The question is embedded with ``generate_embeddings`` and used to query the
    collection named by ``collection_name``; the ``raw`` field of each of the
    ``topk`` hits is concatenated (no separator) and returned as one string.
    """
    target_collection = client.get(collection_name)
    question_vector = generate_embeddings(question)

    # similarity search, asking only for the stored raw text of each hit
    result = target_collection.query(
        question_vector,
        output_fields=['raw'],
        topk=topk,
    )

    pieces = []
    for hit in result.output:
        pieces.append(hit.fields['raw'])
    return "".join(pieces)

In [72]:
import dashscope
import textwrap
from dashscope import Generation

# define a prompt template for the vectorDB-enhanced LLM generation
def answer_question(model_name, question, context):
    """Ask a DashScope generation model to answer ``question`` given ``context``.

    Args:
        model_name: Name of the DashScope generation model to call.
        question: The user's question; inserted into the prompt.
        context: Reference text placed between the triple-backtick fences of
            the prompt; pass an empty string to query the model without
            retrieved knowledge.

    Returns:
        The ``text`` entry of the model response output.

    Raises:
        RuntimeError: If the DashScope request does not return HTTP 200.
    """
    # Imported locally so this cell does not depend on another cell's imports.
    from http import HTTPStatus

    # NOTE(review): the `"` right after the first sentence of the prompt looks
    # like a typo; it is left untouched so the prompt sent to the model stays
    # exactly as before.
    prompt = f'''请基于```内的内容回答问题。"
	```
	{context}
	```
	我的问题是：{question}。
    '''
    response = Generation.call(
      model= model_name,
      prompt=prompt
    )
    # Fail with the service's own error details instead of an opaque TypeError
    # from subscripting an empty ``output``.
    if response.status_code != HTTPStatus.OK:
        raise RuntimeError(
            'DashScope generation request failed: '
            f'status={response.status_code}, '
            f"code={getattr(response, 'code', None)}, "
            f"message={getattr(response, 'message', None)}"
        )
    return response.output['text']

In [78]:
# Baseline: ask the plain LLM with an empty context, i.e. without any
# knowledge retrieved from the vector database.
model_name = 'qwen-7b-chat-v1'
question = '海南安定追尾事故，发生在哪里？原因是什么？人员伤亡情况如何？'
answer = answer_question(model_name, question, '')
# wrap the answer at 50 characters per line for readability
wrapped_answer = textwrap.fill(answer, width=50)
print(f'问题: {question}\n回答: {wrapped_answer}')

问题: 海南安定追尾事故，发生在哪里？原因是什么？人员伤亡情况如何？
回答: 很抱歉，我无法提供关于该事故的最新信息。请您查阅可靠的新闻来源以获取最新信息。


In [79]:
# Knowledge-enhanced run: recall the two most similar documents from
# DashVector and hand them to the LLM as context for the same question.
context = search_relevant_context(question, topk=2)
answer = answer_question(model_name, question, context)
# wrap the answer at 50 characters per line for readability
wrapped_answer = textwrap.fill(answer, width=50)
print(f'问题: {question}\n回答: {wrapped_answer}')

问题: 海南安定追尾事故，发生在哪里？原因是什么？人员伤亡情况如何？
回答: 海南安定追尾事故发生在海南省定安县境内，环岛东线高速公路海口往三亚方向53公里处。原因是琼AB711
9小轿车驾驶人追尾所致。该事故造成小轿车人员5人当场死亡，其中一人为未成年人。
