In [1]:
import sys
sys.path.append('/root/autodl-tmp/One_Month_Paper')
import pandas as pd
import time
from connector.embedding.embed import bgeEmbeddings
import chromadb
from tqdm import tqdm

In [2]:
search_latency_fmt = "search latency = {:.4f}s"

In [3]:
# 加载 embedding 模型     
start_time = time.time()
embed_model = bgeEmbeddings(
    '/root/autodl-tmp/One_Month_Paper/model/bge-large-en-v1.5', 
    batch_size=64,
    max_len=512,
    device='cuda:0'
)
end_time = time.time()
print("load done embedding model")
print(search_latency_fmt.format(end_time - start_time))


successful load embedding model
load done embedding model
search latency = 1.6819s


In [4]:
# 加载 chroma 数据库 
client = chromadb.Client()
collection = client.create_collection(name="test_collection")


In [5]:
# 读取数据
print("开始读取数据......")
# csv_df = pd.read_csv("datas/train.csv")  # 读取训练数据
# csv_data = csv_df.loc[:15]
# 加载 数据集 
csv_data = pd.read_csv('/root/autodl-tmp/One_Month_Paper/connector/vectorstore/data/merged_data.csv')
print(csv_data.shape)  # (476066, 3)

num_entities = len(csv_data)
print(num_entities)


开始读取数据......
(8069, 6)
8069


In [6]:
print("开始使用向量embedding模型嵌入向量数据......")
result = []
start_time = time.time()
for index,row in tqdm(csv_data.iterrows(), total=num_entities, desc="Processing DataFrame"):
    result.append(embed_model.embed_documents([row['Abstraction']])[0])
end_time = time.time()
print(search_latency_fmt.format(end_time - start_time))


开始使用向量embedding模型嵌入向量数据......


Processing DataFrame: 100%|██████████| 8069/8069 [02:37<00:00, 51.16it/s]

search latency = 157.7388s





In [7]:
print("开始插入向量数据......")
start_time = time.time()
for index, row in tqdm(csv_data.iterrows(), total=num_entities, desc="Processing DataFrame"):
    # result = embed_model.embed_documents([row['Abstraction']])
    collection.add(
        documents=[row['Abstraction']],
        embeddings=[result[index]],
        metadatas=[{"title": row['title'], "author": row['author'], "subject": row['subject']}],
        ids=[str(row['id'])]
    ) 
end_time = time.time()
print(search_latency_fmt.format(end_time - start_time))

开始插入向量数据......


Processing DataFrame: 100%|██████████| 8069/8069 [00:30<00:00, 266.57it/s]

search latency = 30.2776s





In [9]:
print("开始查询")
while True:
    search_sen = input("请输入查询语句：")
    start_time = time.time()
    search_vec = embed_model.embed_query(search_sen)
    end_time = time.time()
    print("just embedding")
    print(search_latency_fmt.format(end_time - start_time))
    start_time = time.time()
    search_res = collection.query(
        query_embeddings=[search_vec],
        n_results=7,
    )
    end_time = time.time()
    result = [{"ids": search_res["ids"][0][i], "meta": search_res["metadatas"][0][i], "doc": search_res["documents"][0][i]} for i in range(len(search_res["ids"][0]))]
    # for it in result:
    #     print(it["ids"] + "   " + it["meta"][:4]+"...    "+it["doc"][:10]+"...")
    # print(result)
    print(search_latency_fmt.format(end_time - start_time))


开始查询
just embedding
search latency = 0.0280s
search latency = 0.0025s
just embedding
search latency = 0.0326s
search latency = 0.0034s


KeyboardInterrupt: Interrupted by user