In [1]:
import chromadb

In [2]:
chroma_clinet = chromadb.HttpClient(host = "localhost", port=8000)

ValueError: Could not connect to tenant default_tenant. Are you sure it exists?

In [8]:
from chromadb.utils import embedding_functions
model_path = "../model/gte-large-zh"
em_fn = embedding_functions.SentenceTransformerEmbeddingFunction(model_name = model_path)

In [9]:
collection = chroma_clinet.create_collection(name='rag_db', embedding_function=em_fn, metadata={"hnsw:space": "cosine"})

In [10]:
documents=["检索增强生成（Retrieval-augmented Generation），简称RAG，是当下热门的大模型前沿技术之一。",
           "检索增强生成模型结合了语言模型和信息检索技术。具体来说，当模型需要生成文本或者回答问题时，",
           "它会先从一个庞大的文档集合中检索出相关的信息，然后利用这些检索到的信息来指导文本的生成，从而提高预测的质量和准确性 。"]

In [11]:
collection.add(documents = documents,
              ids = ["id1", "id2", "id3"],
              metadatas=[{"chapter":3, "verse":16},
                        {"chapter":4, "verse":5},
                        {"chapter":12, "verse":5}])

In [12]:
collection.count()

3

In [34]:
#collection.peek(limit=1)

In [15]:
get_collection = chroma_clinet.get_collection(name="rag_db", embedding_function=em_fn)

In [19]:
id_result = get_collection.get(ids=['id2'], include=["documents", "embeddings","metadatas"])

In [21]:
print(id_result["documents"])

['检索增强生成模型结合了语言模型和信息检索技术。具体来说，当模型需要生成文本或者回答问题时，']


In [23]:
import numpy as np
np.array(id_result["embeddings"]).shape

(1, 1024)

In [24]:
query = "检索增强技术简称是什么？"

In [26]:
get_collection.query(query_texts=query,
                    n_results=2,
                    include=["documents","metadatas"])

{'ids': [['id2', 'id1']],
 'distances': None,
 'embeddings': None,
 'metadatas': [[{'chapter': 4, 'verse': 5}, {'chapter': 3, 'verse': 16}]],
 'documents': [['检索增强生成模型结合了语言模型和信息检索技术。具体来说，当模型需要生成文本或者回答问题时，',
   '检索增强生成（Retrieval-augmented Generation），简称RAG，是当下热门的大模型前沿技术之一。']],
 'uris': None,
 'data': None,
 'included': ['documents', 'metadatas']}

In [27]:
get_collection.query(query_texts=query,
                    n_results=2,
                    include=["documents","metadatas"],
                    where={"verse":5})

{'ids': [['id2', 'id3']],
 'distances': None,
 'embeddings': None,
 'metadatas': [[{'chapter': 4, 'verse': 5}, {'chapter': 12, 'verse': 5}]],
 'documents': [['检索增强生成模型结合了语言模型和信息检索技术。具体来说，当模型需要生成文本或者回答问题时，',
   '它会先从一个庞大的文档集合中检索出相关的信息，然后利用这些检索到的信息来指导文本的生成，从而提高预测的质量和准确性 。']],
 'uris': None,
 'data': None,
 'included': ['documents', 'metadatas']}

In [28]:
# $eq: equal to
# $ne not equal to
# $gt greater than
# $gte greater than or equal to
# $lt -less than
# $lte less than or equal to
get_collection.query(query_texts=query,
                    n_results=2,
                    include=["documents","metadatas"],
                    where={"chapter":{"$lt":10}})

{'ids': [['id2', 'id1']],
 'distances': None,
 'embeddings': None,
 'metadatas': [[{'chapter': 4, 'verse': 5}, {'chapter': 3, 'verse': 16}]],
 'documents': [['检索增强生成模型结合了语言模型和信息检索技术。具体来说，当模型需要生成文本或者回答问题时，',
   '检索增强生成（Retrieval-augmented Generation），简称RAG，是当下热门的大模型前沿技术之一。']],
 'uris': None,
 'data': None,
 'included': ['documents', 'metadatas']}

In [30]:
get_collection.query(query_texts=query,
                    n_results=2,
                    include=["documents","metadatas"],
                    where={"$and":[{"chapter":{"$lt":10}},
                                  {"verse":{"$eq":5}}
                                  ]}
                    )

{'ids': [['id2']],
 'distances': None,
 'embeddings': None,
 'metadatas': [[{'chapter': 4, 'verse': 5}]],
 'documents': [['检索增强生成模型结合了语言模型和信息检索技术。具体来说，当模型需要生成文本或者回答问题时，']],
 'uris': None,
 'data': None,
 'included': ['documents', 'metadatas']}

In [32]:
get_collection.query(query_texts=query,
                    n_results=2,
                    include=["documents","metadatas"],
                    where_document={"$contains":"检索"}
                    )

{'ids': [['id2', 'id1']],
 'distances': None,
 'embeddings': None,
 'metadatas': [[{'chapter': 4, 'verse': 5}, {'chapter': 3, 'verse': 16}]],
 'documents': [['检索增强生成模型结合了语言模型和信息检索技术。具体来说，当模型需要生成文本或者回答问题时，',
   '检索增强生成（Retrieval-augmented Generation），简称RAG，是当下热门的大模型前沿技术之一。']],
 'uris': None,
 'data': None,
 'included': ['documents', 'metadatas']}

In [35]:
# use langchain

from langchain.embeddings.huggingface import HuggingFaceEmbeddings
model_path = "../model/gte-large-zh"
model  =HuggingFaceEmbeddings(model_name = model_path,
                             model_kwargs={'device':"cpu"})
embeddings = model.embed_documents(documents)

  model  =HuggingFaceEmbeddings(model_name = model_path,


In [36]:
print(embeddings)

[[0.0022275112569332123, 0.010900265537202358, -0.06066451221704483, -0.002290590200573206, 0.002745521254837513, -0.04352393001317978, 0.0008226751815527678, -0.03207527473568916, -0.043596163392066956, -0.0686747133731842, 0.011284084059298038, -0.020719366148114204, -0.0024540547747164965, 0.022484121844172478, 0.003132150275632739, 0.0173509381711483, 0.010060092434287071, -0.017020296305418015, -0.012759963050484657, -0.020756825804710388, -0.028273703530430794, -0.014233429916203022, 0.013630947098135948, 0.03554732725024223, 0.025466103106737137, -0.013927259482443333, -0.03082040138542652, 0.015462035313248634, 0.034657541662454605, -0.01897253841161728, 0.047046076506376266, -0.0033104163594543934, -0.0547090545296669, 0.049779120832681656, -0.025789398699998856, -0.016000304371118546, -0.011776601895689964, -0.02912682294845581, 0.023769598454236984, 0.10458575934171677, 0.005735477432608604, 0.07761510461568832, 0.008250909857451916, 0.03875977173447609, -0.02995406836271286