## This note book contains the info about inserting, searching data in weaviate

In [3]:
from weaviate.classes.config import Configure, VectorDistances
from weaviate.classes.query import Filter
import weaviate
import weaviate.classes as wvc
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
embed_model = FastEmbedEmbeddings(model_name="BAAI/bge-base-en-v1.5")

# Connect to Weaviate
client = weaviate.connect_to_local()  # Change this if connecting to a remote instance



metadata = {
    "file_id": "12345",
    "user_id": "user_001",
    "file_type": "pdf",
    "created_at": "2025-03-03T12:00:00Z",
    "modified_at": "2025-03-03T14:30:00Z"
}

chunks = [
    "This is the first chunk of the document.",
    "Second chunk contains some additional data.",
    "Third chunk has more text.",
    "Fourth chunk is here.",
    "Fifth chunk concludes the document."
]
embeddings = embed_model.embed_documents(chunks)
filename = "sample_document.pdf"
# Print some information about the embeddings
print(f"Number of embeddings: {len(embeddings)}")
# Generate the payloads
payloads = []

if metadata["file_id"] is None:
    metadata["file_id"] = "None"

for i, chunk in enumerate(chunks):
    payload = {
        "text": chunk,
        "filename": filename,
        "file_id": metadata["file_id"],
        "user_id": metadata["user_id"],
        "file_type": metadata["file_type"],
        "created_at": metadata["created_at"],
        "modified_at": metadata["modified_at"]
    }
    payloads.append(payload)

print(payloads)  # This will print the payloads before inserting into the database.

client.collections.create(
    "data",
    vectorizer_config=wvc.config.Configure.Vectorizer.none()
)

collection = client.collections.get("data")
i=0

with collection.batch.dynamic() as batch:
    for data_row in payloads:
        batch.add_object(
            properties=data_row,
            vector=embeddings[i]
        ),
        
        i+=1
        if batch.number_errors > 10:
            print("Batch import stopped due to excessive errors.")
            break

failed_objects = collection.batch.failed_objects
if failed_objects:
    print(f"Number of failed imports: {len(failed_objects)}")
    print(f"First failed object: {failed_objects[0]}")

Number of embeddings: 5
[{'text': 'This is the first chunk of the document.', 'filename': 'sample_document.pdf', 'file_id': '12345', 'user_id': 'user_001', 'file_type': 'pdf', 'created_at': '2025-03-03T12:00:00Z', 'modified_at': '2025-03-03T14:30:00Z'}, {'text': 'Second chunk contains some additional data.', 'filename': 'sample_document.pdf', 'file_id': '12345', 'user_id': 'user_001', 'file_type': 'pdf', 'created_at': '2025-03-03T12:00:00Z', 'modified_at': '2025-03-03T14:30:00Z'}, {'text': 'Third chunk has more text.', 'filename': 'sample_document.pdf', 'file_id': '12345', 'user_id': 'user_001', 'file_type': 'pdf', 'created_at': '2025-03-03T12:00:00Z', 'modified_at': '2025-03-03T14:30:00Z'}, {'text': 'Fourth chunk is here.', 'filename': 'sample_document.pdf', 'file_id': '12345', 'user_id': 'user_001', 'file_type': 'pdf', 'created_at': '2025-03-03T12:00:00Z', 'modified_at': '2025-03-03T14:30:00Z'}, {'text': 'Fifth chunk concludes the document.', 'filename': 'sample_document.pdf', 'file_

In [4]:
from weaviate.classes.query import MetadataQuery

single_text = "What is present in first and second chunks"
query_vector= embed_model.embed_query(single_text)

jeopardy = client.collections.get("data")
response = jeopardy.query.near_vector(
    near_vector=query_vector, # your query vector goes here
    limit=4,
    return_metadata=MetadataQuery(distance=True)
)
print(response)
for o in response.objects:
    print(o.properties)
    print(o.metadata.distance)

QueryReturn(objects=[Object(uuid=_WeaviateUUIDInt('63a3ae4e-63c7-4b2c-b190-1884889b23dd'), metadata=MetadataReturn(creation_time=None, last_update_time=None, distance=0.17967021465301514, certainty=None, score=None, explain_score=None, is_consistent=None, rerank_score=None), properties={'file_id': '12345', 'created_at': datetime.datetime(2025, 3, 3, 12, 0, tzinfo=datetime.timezone.utc), 'file_type': 'pdf', 'text': 'Second chunk contains some additional data.', 'user_id': 'user_001', 'filename': 'sample_document.pdf', 'modified_at': datetime.datetime(2025, 3, 3, 14, 30, tzinfo=datetime.timezone.utc)}, references=None, vector={}, collection='Data'), Object(uuid=_WeaviateUUIDInt('14a43c0c-7b93-4cd7-8796-59d05dce74a8'), metadata=MetadataReturn(creation_time=None, last_update_time=None, distance=0.28102248907089233, certainty=None, score=None, explain_score=None, is_consistent=None, rerank_score=None), properties={'file_id': '12345', 'created_at': datetime.datetime(2025, 3, 3, 12, 0, tzinfo

In [None]:
client.collections.delete("data")

## By default weaviate says it calculates cosine similairty but in fact it gives us cosine distance

In [5]:
from weaviate.classes.config import Configure, VectorDistances

client.collections.create(
    "Article",
    vector_index_config=Configure.VectorIndex.hnsw(
        distance_metric=VectorDistances.COSINE
    ),
)

<weaviate.collections.collection.sync.Collection at 0x1b93c64fb60>

In [6]:
from weaviate.classes.config import Configure, VectorDistances
from weaviate.classes.query import Filter
import weaviate
import weaviate.classes as wvc
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
embed_model = FastEmbedEmbeddings(model_name="BAAI/bge-base-en-v1.5")

# Connect to Weaviate
client = weaviate.connect_to_local()  # Change this if connecting to a remote instance



metadata = {
    "file_id": "12345",
    "user_id": "user_001",
    "file_type": "pdf",
    "created_at": "2025-03-03T12:00:00Z",
    "modified_at": "2025-03-03T14:30:00Z"
}

chunks = [
    "This is the first chunk of the document.",
    "Second chunk contains some additional data.",
    "Third chunk has more text.",
    "Fourth chunk is here.",
    "Fifth chunk concludes the document."
]
embeddings = embed_model.embed_documents(chunks)
filename = "sample_document.pdf"
# Print some information about the embeddings
print(f"Number of embeddings: {len(embeddings)}")
# Generate the payloads
payloads = []

if metadata["file_id"] is None:
    metadata["file_id"] = "None"

for i, chunk in enumerate(chunks):
    payload = {
        "text": chunk,
        "filename": filename,
        "file_id": metadata["file_id"],
        "user_id": metadata["user_id"],
        "file_type": metadata["file_type"],
        "created_at": metadata["created_at"],
        "modified_at": metadata["modified_at"]
    }
    payloads.append(payload)

print(payloads)  # This will print the payloads before inserting into the database.


collection = client.collections.get("Article")
i=0

with collection.batch.dynamic() as batch:
    for data_row in payloads:
        batch.add_object(
            properties=data_row,
            vector=embeddings[i]
        ),
        
        i+=1
        if batch.number_errors > 10:
            print("Batch import stopped due to excessive errors.")
            break

failed_objects = collection.batch.failed_objects
if failed_objects:
    print(f"Number of failed imports: {len(failed_objects)}")
    print(f"First failed object: {failed_objects[0]}")

Number of embeddings: 5
[{'text': 'This is the first chunk of the document.', 'filename': 'sample_document.pdf', 'file_id': '12345', 'user_id': 'user_001', 'file_type': 'pdf', 'created_at': '2025-03-03T12:00:00Z', 'modified_at': '2025-03-03T14:30:00Z'}, {'text': 'Second chunk contains some additional data.', 'filename': 'sample_document.pdf', 'file_id': '12345', 'user_id': 'user_001', 'file_type': 'pdf', 'created_at': '2025-03-03T12:00:00Z', 'modified_at': '2025-03-03T14:30:00Z'}, {'text': 'Third chunk has more text.', 'filename': 'sample_document.pdf', 'file_id': '12345', 'user_id': 'user_001', 'file_type': 'pdf', 'created_at': '2025-03-03T12:00:00Z', 'modified_at': '2025-03-03T14:30:00Z'}, {'text': 'Fourth chunk is here.', 'filename': 'sample_document.pdf', 'file_id': '12345', 'user_id': 'user_001', 'file_type': 'pdf', 'created_at': '2025-03-03T12:00:00Z', 'modified_at': '2025-03-03T14:30:00Z'}, {'text': 'Fifth chunk concludes the document.', 'filename': 'sample_document.pdf', 'file_

In [8]:
from weaviate.classes.query import MetadataQuery

single_text = "This is the first chunk of the document"
query_vector= embed_model.embed_query(single_text)

jeopardy = client.collections.get("Article")
response = jeopardy.query.near_vector(
    near_vector=query_vector, # your query vector goes here
    limit=4,
    return_metadata=MetadataQuery(distance=True)
)
print(response)
for o in response.objects:
    print(o.properties)
    print(o.metadata.distance)

QueryReturn(objects=[Object(uuid=_WeaviateUUIDInt('76fdd887-c7df-48a2-85cd-d689fe76f56c'), metadata=MetadataReturn(creation_time=None, last_update_time=None, distance=0.014779925346374512, certainty=None, score=None, explain_score=None, is_consistent=None, rerank_score=None), properties={'text': 'This is the first chunk of the document.', 'file_id': '12345', 'file_type': 'pdf', 'created_at': datetime.datetime(2025, 3, 3, 12, 0, tzinfo=datetime.timezone.utc), 'user_id': 'user_001', 'filename': 'sample_document.pdf', 'modified_at': datetime.datetime(2025, 3, 3, 14, 30, tzinfo=datetime.timezone.utc)}, references=None, vector={}, collection='Article'), Object(uuid=_WeaviateUUIDInt('c326e7d1-cdf6-40df-88f7-c5fc07fea9d4'), metadata=MetadataReturn(creation_time=None, last_update_time=None, distance=0.27406060695648193, certainty=None, score=None, explain_score=None, is_consistent=None, rerank_score=None), properties={'text': 'Third chunk has more text.', 'file_id': '12345', 'file_type': 'pdf'

New

In [19]:
from weaviate.classes.config import Configure, VectorDistances
from weaviate.classes.query import Filter
import weaviate
import weaviate.classes as wvc
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
embed_model = FastEmbedEmbeddings(model_name="BAAI/bge-base-en-v1.5")

# Connect to Weaviate
client = weaviate.connect_to_local()  # Change this if connecting to a remote instance



metadata = {
    "file_id": "12345",
    "user_id": "user_001",
    "file_type": "pdf",
    "created_at": "2025-03-03T12:00:00Z",
    "modified_at": "2025-03-03T14:30:00Z"
}

chunks = [
    "This is the first chunk of the document.",
    "Second chunk contains some additional data.",
    "Third chunk has more text.",
    "Fourth chunk is here.",
    "Fifth chunk concludes the document."
]
embeddings = embed_model.embed_documents(chunks)
filename = "sample_document.pdf"
# Print some information about the embeddings
print(f"Number of embeddings: {len(embeddings)}")
# Generate the payloads
payloads = []

if metadata["file_id"] is None:
    metadata["file_id"] = "None"

for i, chunk in enumerate(chunks):
    payload = {
        "text": chunk,
        "filename": filename,
        "file_id": metadata["file_id"],
        "user_id": metadata["user_id"],
        "file_type": metadata["file_type"],
        "created_at": metadata["created_at"],
        "modified_at": metadata["modified_at"]
    }
    payloads.append(payload)

print(payloads)  # This will print the payloads before inserting into the database.


collection = client.collections.get("Article")


with collection.batch.dynamic() as batch:
     for data_row, embedding in zip(payloads, embeddings):
        batch.add_object(properties=data_row, vector=embedding)
                    
        if batch.number_errors > 10:
            print("Batch import stopped due to excessive errors.")
            break

failed_objects = collection.batch.failed_objects
if failed_objects:
    print(f"Number of failed imports: {len(failed_objects)}")
    print(f"First failed object: {failed_objects[0]}")

Number of embeddings: 5
[{'text': 'This is the first chunk of the document.', 'filename': 'sample_document.pdf', 'file_id': '12345', 'user_id': 'user_001', 'file_type': 'pdf', 'created_at': '2025-03-03T12:00:00Z', 'modified_at': '2025-03-03T14:30:00Z'}, {'text': 'Second chunk contains some additional data.', 'filename': 'sample_document.pdf', 'file_id': '12345', 'user_id': 'user_001', 'file_type': 'pdf', 'created_at': '2025-03-03T12:00:00Z', 'modified_at': '2025-03-03T14:30:00Z'}, {'text': 'Third chunk has more text.', 'filename': 'sample_document.pdf', 'file_id': '12345', 'user_id': 'user_001', 'file_type': 'pdf', 'created_at': '2025-03-03T12:00:00Z', 'modified_at': '2025-03-03T14:30:00Z'}, {'text': 'Fourth chunk is here.', 'filename': 'sample_document.pdf', 'file_id': '12345', 'user_id': 'user_001', 'file_type': 'pdf', 'created_at': '2025-03-03T12:00:00Z', 'modified_at': '2025-03-03T14:30:00Z'}, {'text': 'Fifth chunk concludes the document.', 'filename': 'sample_document.pdf', 'file_

In [22]:
from weaviate.classes.query import MetadataQuery

single_text = "This is the first chunk of the document"
query_vector= embed_model.embed_query(single_text)

jeopardy = client.collections.get("Article")
jeopardy.query.near_vector?
response = jeopardy.query.near_vector(
    near_vector=query_vector, # your query vector goes here
    limit=4,
    return_metadata=MetadataQuery(distance=True, score=True),
    return_properties=["text"],
    include_vector=True
)

for o in response.objects:
    print(o.properties)
    print(o.metadata.distance)
  
    print(o.vector)

{'text': 'This is the first chunk of the document.'}
0.014779925346374512
{'default': [-0.007620947435498238, -0.0004105519619770348, -0.0035287272185087204, -0.0010984173277392983, 0.06992042064666748, -0.020979277789592743, -0.00196237419731915, 0.028891824185848236, -0.0017456462373957038, -0.03858548030257225, -0.031066985800862312, -0.00927596166729927, -0.0648135170340538, -0.0013752381782978773, 0.0022263885475695133, 0.06260683387517929, 0.05819346010684967, -0.02181466668844223, 0.018079062923789024, -0.043030377477407455, 0.015706874430179596, 0.0552932433784008, 0.03956272825598717, 0.02871844172477722, 0.03710384666919708, -0.018867164850234985, 0.02408440038561821, -0.013665691018104553, -0.07162272185087204, 0.039972539991140366, 0.01683386042714119, -0.03325790911912918, -0.013980931602418423, 0.02748900279402733, 0.035496119409799576, -0.03398296236991882, -0.008038641884922981, 0.0062890551052987576, 0.015178847126662731, -0.019308501854538918, -0.04013016074895859, 0.

[1;31mSignature:[0m
[0mjeopardy[0m[1;33m.[0m[0mquery[0m[1;33m.[0m[0mnear_vector[0m[1;33m([0m[1;33m
[0m    [0mnear_vector[0m[1;33m:[0m [0mUnion[0m[1;33m[[0m[0mSequence[0m[1;33m[[0m[0mUnion[0m[1;33m[[0m[0mint[0m[1;33m,[0m [0mfloat[0m[1;33m][0m[1;33m][0m[1;33m,[0m [0mSequence[0m[1;33m[[0m[0mSequence[0m[1;33m[[0m[0mUnion[0m[1;33m[[0m[0mint[0m[1;33m,[0m [0mfloat[0m[1;33m][0m[1;33m][0m[1;33m][0m[1;33m,[0m [0mMapping[0m[1;33m[[0m[0mstr[0m[1;33m,[0m [0mUnion[0m[1;33m[[0m[0mSequence[0m[1;33m[[0m[0mUnion[0m[1;33m[[0m[0mint[0m[1;33m,[0m [0mfloat[0m[1;33m][0m[1;33m][0m[1;33m,[0m [0mSequence[0m[1;33m[[0m[0mSequence[0m[1;33m[[0m[0mUnion[0m[1;33m[[0m[0mint[0m[1;33m,[0m [0mfloat[0m[1;33m][0m[1;33m][0m[1;33m][0m[1;33m,[0m [0mweaviate[0m[1;33m.[0m[0mcollections[0m[1;33m.[0m[0mclasses[0m[1;33m.[0m[0mgrpc[0m[1;33m.[0m[0m_ListOfVectorsQuery[0m[1;33m[[0m[0mSequ

In [23]:
import numpy as np

for o in response.objects:
    print("Properties:", o.properties)
    print("Distance:", o.metadata.distance)
    
    # Format vector with NumPy for cleaner output
    vector = np.array(o.vector)
    print("Vector (formatted):\n", np.array2string(vector, precision=3, threshold=10, edgeitems=5, separator=", "))
    print("-" * 80)  # Separator for readability


Properties: {'text': 'This is the first chunk of the document.'}
Distance: 0.014779925346374512
Vector (formatted):
 {'default': [-0.007620947435498238, -0.0004105519619770348, -0.0035287272185087204, -0.0010984173277392983, 0.06992042064666748, -0.020979277789592743, -0.00196237419731915, 0.028891824185848236, -0.0017456462373957038, -0.03858548030257225, -0.031066985800862312, -0.00927596166729927, -0.0648135170340538, -0.0013752381782978773, 0.0022263885475695133, 0.06260683387517929, 0.05819346010684967, -0.02181466668844223, 0.018079062923789024, -0.043030377477407455, 0.015706874430179596, 0.0552932433784008, 0.03956272825598717, 0.02871844172477722, 0.03710384666919708, -0.018867164850234985, 0.02408440038561821, -0.013665691018104553, -0.07162272185087204, 0.039972539991140366, 0.01683386042714119, -0.03325790911912918, -0.013980931602418423, 0.02748900279402733, 0.035496119409799576, -0.03398296236991882, -0.008038641884922981, 0.0062890551052987576, 0.015178847126662731, -0.0

In [24]:
for i in range(5):
    print(embeddings[i])

[-0.007620947435498238, -0.0004105519619770348, -0.0035287272185087204, -0.0010984173277392983, 0.06992042064666748, -0.020979277789592743, -0.00196237419731915, 0.028891824185848236, -0.0017456462373957038, -0.03858548030257225, -0.031066985800862312, -0.00927596166729927, -0.0648135170340538, -0.0013752381782978773, 0.0022263885475695133, 0.06260683387517929, 0.05819346010684967, -0.02181466668844223, 0.018079062923789024, -0.043030377477407455, 0.015706874430179596, 0.0552932433784008, 0.03956272825598717, 0.02871844172477722, 0.03710384666919708, -0.018867164850234985, 0.02408440038561821, -0.013665691018104553, -0.07162272185087204, 0.039972539991140366, 0.01683386042714119, -0.03325790911912918, -0.013980931602418423, 0.02748900279402733, 0.035496119409799576, -0.03398296236991882, -0.008038641884922981, 0.0062890551052987576, 0.015178847126662731, -0.019308501854538918, -0.04013016074895859, 0.028151007369160652, -0.030735982581973076, -0.002250031568109989, -0.01058421190828085

In [17]:
client.collections.delete("Article")

In [28]:
if not client.collections.exists("1"):
    print("do not exist")
elif client.collections.exists("1"):
    print("exists")

do not exist


In [32]:
client.collections?

[1;31mType:[0m        _Collections
[1;31mString form:[0m <weaviate.collections.collections.sync._Collections object at 0x000002AF5DCE5640>
[1;31mFile:[0m        f:\mini-conda\envs\my_env_name\lib\site-packages\weaviate\collections\collections\sync.py
[1;31mDocstring:[0m   <no docstring>

In [26]:
print(client.collections.delete("Article"))

None


In [29]:
import weaviate
print(weaviate.__version__)


4.11.1
