In [1]:
from pymilvus import MilvusClient, model

Create a new collection, specifying its name and the dimension of its vector field.

In [2]:
# Create a client backed by the local file "milvus_demo.db".
client = MilvusClient("milvus_demo.db")

# Drop any collection left over from a previous run so this cell can be re-run.
if client.has_collection(collection_name="demo_collection"):
    client.drop_collection(collection_name="demo_collection")

# 768 has to equal the dimension of the embeddings generated below
# (the embedding cell prints "Dim: 768").
client.create_collection(collection_name="demo_collection", dimension=768)

Generate vector embeddings with the default model.

In [3]:
# Three short example sentences to embed.
docs = [
    "Artificial intelligence was founded as an academic discipline in 1956.",
    "Alan Turing was the first person to conduct substantial research in AI.",
    "Born in Maida Vale, London, Turing was raised in southern England.",
]

# The default embedding function will download a small embedding model,
# "paraphrase-albert-small-v2" (~50MB).
embedding_fn = model.DefaultEmbeddingFunction()
vectors = embedding_fn.encode_documents(docs)

# Print the model's embedding dimension next to the shape of the first vector.
print("Dim:", embedding_fn.dim, vectors[0].shape)

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


Dim: 768 (768,)


In [12]:
# Build one entity per embedded document: "id" is the document's position,
# "text" is the matching sentence from `docs`, and every entity shares the
# same "subject" value.
data = [
    {"id": i, "vector": vec, "text": docs[i], "subject": "history"}
    for i, vec in enumerate(vectors)
]

# Quick look at the payload before inserting it.
print("Data has", len(data), "entities, each with fields: ", data[0].keys())
print("Vector dim:", len(data[0]["vector"]))

Data has 3 entities, each with fields:  dict_keys(['id', 'vector', 'text', 'subject'])
Vector dim: 768


Now insert the data into `demo_collection`.

In [13]:
# Insert the prepared entities into "demo_collection".
# The summary gets its own name instead of the generic `res`: the search cell
# also assigns `res`, and sharing the name would make a later `res[1]` depend
# on which cell happened to run last.
insert_res = client.insert(collection_name="demo_collection", data=data)
print("Insertion result:", insert_res)

Insertion result: {'insert_count': 3, 'ids': [0, 1, 2]}


Perform a semantic search by encoding the queries and searching for the most similar vectors.

Finally, get the similar vectors and their corresponding scores.

In [14]:
# Embed both questions so they can be compared against the stored vectors.
queries = ["Who is Alan Turing?", "When was AI founded?"]
query_vectors = embedding_fn.encode_queries(queries)

# Ask for up to 2 hits per query and return the "text" and "subject" fields
# alongside each hit.
res = client.search(
    collection_name="demo_collection",
    data=query_vectors,
    limit=2,
    output_fields=["text", "subject"],
)

print(res)

data: ["[{'id': 2, 'distance': 0.5859943628311157, 'entity': {'text': 'Born in Maida Vale, London, Turing was raised in southern England.', 'subject': 'history'}}, {'id': 1, 'distance': 0.511825442314148, 'entity': {'text': 'Alan Turing was the first person to conduct substantial research in AI.', 'subject': 'history'}}]", "[{'id': 1, 'distance': 0.5054441690444946, 'entity': {'text': 'Alan Turing was the first person to conduct substantial research in AI.', 'subject': 'history'}}, {'id': 0, 'distance': 0.3748028576374054, 'entity': {'text': 'Artificial intelligence was founded as an academic discipline in 1956.', 'subject': 'history'}}]"] 


In [20]:
# Show only the hits for the second query ("When was AI founded?").
res[1]

[{'id': 1,
  'distance': 0.5054441690444946,
  'entity': {'text': 'Alan Turing was the first person to conduct substantial research in AI.',
   'subject': 'history'}},
 {'id': 0,
  'distance': 0.3748028576374054,
  'entity': {'text': 'Artificial intelligence was founded as an academic discipline in 1956.',
   'subject': 'history'}}]