## This notebook shows how to insert data into Weaviate and search it

In [3]:
from weaviate.classes.config import Configure, VectorDistances
from weaviate.classes.query import Filter
import weaviate
import weaviate.classes as wvc
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
embed_model = FastEmbedEmbeddings(model_name="BAAI/bge-base-en-v1.5")

# Connect to Weaviate
client = weaviate.connect_to_local()  # Change this if connecting to a remote instance

# File-level metadata; the same values are copied onto every chunk's payload.
metadata = {
    "file_id": "12345",
    "user_id": "user_001",
    "file_type": "pdf",
    "created_at": "2025-03-03T12:00:00Z",
    "modified_at": "2025-03-03T14:30:00Z",
}

chunks = [
    "This is the first chunk of the document.",
    "Second chunk contains some additional data.",
    "Third chunk has more text.",
    "Fourth chunk is here.",
    "Fifth chunk concludes the document.",
]
# The insert step pairs these embeddings with the payloads by position,
# so both must be derived from `chunks` in the same order.
embeddings = embed_model.embed_documents(chunks)
filename = "sample_document.pdf"
# Print some information about the embeddings
print(f"Number of embeddings: {len(embeddings)}")

# A missing file_id is stored as the literal string "None".
if metadata["file_id"] is None:
    metadata["file_id"] = "None"

# Generate the payloads: one property dict per chunk, carrying the chunk
# text, the source filename and the shared file metadata.
payloads = [
    {
        "text": chunk,
        "filename": filename,
        "file_id": metadata["file_id"],
        "user_id": metadata["user_id"],
        "file_type": metadata["file_type"],
        "created_at": metadata["created_at"],
        "modified_at": metadata["modified_at"],
    }
    for chunk in chunks
]

print(payloads)  # This will print the payloads before inserting into the database.

# Create the "data" collection with no server-side vectorizer: every object
# is inserted together with a vector computed on the client.
client.collections.create(
    "data",
    vectorizer_config=wvc.config.Configure.Vectorizer.none()
)

collection = client.collections.get("data")

# Import each payload with its embedding. `payloads` and `embeddings` are
# both built from `chunks`, so pairing them positionally with zip is safe.
with collection.batch.dynamic() as batch:
    for data_row, vector in zip(payloads, embeddings):
        batch.add_object(
            properties=data_row,
            vector=vector
        )
        # Give up early instead of continuing to send a failing import.
        if batch.number_errors > 10:
            print("Batch import stopped due to excessive errors.")
            break

# Report any objects the batch could not import.
failed_objects = collection.batch.failed_objects
if failed_objects:
    print(f"Number of failed imports: {len(failed_objects)}")
    print(f"First failed object: {failed_objects[0]}")

Number of embeddings: 5
[{'text': 'This is the first chunk of the document.', 'filename': 'sample_document.pdf', 'file_id': '12345', 'user_id': 'user_001', 'file_type': 'pdf', 'created_at': '2025-03-03T12:00:00Z', 'modified_at': '2025-03-03T14:30:00Z'}, {'text': 'Second chunk contains some additional data.', 'filename': 'sample_document.pdf', 'file_id': '12345', 'user_id': 'user_001', 'file_type': 'pdf', 'created_at': '2025-03-03T12:00:00Z', 'modified_at': '2025-03-03T14:30:00Z'}, {'text': 'Third chunk has more text.', 'filename': 'sample_document.pdf', 'file_id': '12345', 'user_id': 'user_001', 'file_type': 'pdf', 'created_at': '2025-03-03T12:00:00Z', 'modified_at': '2025-03-03T14:30:00Z'}, {'text': 'Fourth chunk is here.', 'filename': 'sample_document.pdf', 'file_id': '12345', 'user_id': 'user_001', 'file_type': 'pdf', 'created_at': '2025-03-03T12:00:00Z', 'modified_at': '2025-03-03T14:30:00Z'}, {'text': 'Fifth chunk concludes the document.', 'filename': 'sample_document.pdf', 'file_

In [4]:
from weaviate.classes.query import MetadataQuery

# Embed the question with the same model object used for the stored chunks.
single_text = "What is present in first and second chunks"
query_vector = embed_model.embed_query(single_text)

# Fetch the four nearest objects from "data" and request the distance of
# each match alongside its properties.
distance_only = MetadataQuery(distance=True)
response = client.collections.get("data").query.near_vector(
    near_vector=query_vector,
    limit=4,
    return_metadata=distance_only,
)

print(response)
for match in response.objects:
    print(match.properties)
    print(match.metadata.distance)

QueryReturn(objects=[Object(uuid=_WeaviateUUIDInt('63a3ae4e-63c7-4b2c-b190-1884889b23dd'), metadata=MetadataReturn(creation_time=None, last_update_time=None, distance=0.17967021465301514, certainty=None, score=None, explain_score=None, is_consistent=None, rerank_score=None), properties={'file_id': '12345', 'created_at': datetime.datetime(2025, 3, 3, 12, 0, tzinfo=datetime.timezone.utc), 'file_type': 'pdf', 'text': 'Second chunk contains some additional data.', 'user_id': 'user_001', 'filename': 'sample_document.pdf', 'modified_at': datetime.datetime(2025, 3, 3, 14, 30, tzinfo=datetime.timezone.utc)}, references=None, vector={}, collection='Data'), Object(uuid=_WeaviateUUIDInt('14a43c0c-7b93-4cd7-8796-59d05dce74a8'), metadata=MetadataReturn(creation_time=None, last_update_time=None, distance=0.28102248907089233, certainty=None, score=None, explain_score=None, is_consistent=None, rerank_score=None), properties={'file_id': '12345', 'created_at': datetime.datetime(2025, 3, 3, 12, 0, tzinfo

In [None]:
# Remove the "data" collection (created earlier in this notebook) from the
# connected Weaviate instance.
client.collections.delete("data")

## By default Weaviate says it calculates cosine similarity, but the value it actually returns is cosine distance (1 − cosine similarity), so smaller values mean closer matches

In [5]:
from weaviate.classes.config import Configure, VectorDistances

# HNSW vector index with the distance metric set explicitly to cosine.
cosine_hnsw = Configure.VectorIndex.hnsw(distance_metric=VectorDistances.COSINE)

# Create the "Article" collection using that index configuration.
client.collections.create("Article", vector_index_config=cosine_hnsw)

<weaviate.collections.collection.sync.Collection at 0x1b93c64fb60>

In [6]:
from weaviate.classes.config import Configure, VectorDistances
from weaviate.classes.query import Filter
import weaviate
import weaviate.classes as wvc
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
embed_model = FastEmbedEmbeddings(model_name="BAAI/bge-base-en-v1.5")

# Connect to Weaviate
# NOTE(review): `client` is rebound here; if an earlier cell already opened a
# connection, it is not closed first — confirm whether a close() is wanted.
client = weaviate.connect_to_local()  # Change this if connecting to a remote instance

# File-level metadata; the same values are copied onto every chunk's payload.
metadata = {
    "file_id": "12345",
    "user_id": "user_001",
    "file_type": "pdf",
    "created_at": "2025-03-03T12:00:00Z",
    "modified_at": "2025-03-03T14:30:00Z",
}

chunks = [
    "This is the first chunk of the document.",
    "Second chunk contains some additional data.",
    "Third chunk has more text.",
    "Fourth chunk is here.",
    "Fifth chunk concludes the document.",
]
# The insert step pairs these embeddings with the payloads by position,
# so both must be derived from `chunks` in the same order.
embeddings = embed_model.embed_documents(chunks)
filename = "sample_document.pdf"
# Print some information about the embeddings
print(f"Number of embeddings: {len(embeddings)}")

# A missing file_id is stored as the literal string "None".
if metadata["file_id"] is None:
    metadata["file_id"] = "None"

# Generate the payloads: one property dict per chunk, carrying the chunk
# text, the source filename and the shared file metadata.
payloads = [
    {
        "text": chunk,
        "filename": filename,
        "file_id": metadata["file_id"],
        "user_id": metadata["user_id"],
        "file_type": metadata["file_type"],
        "created_at": metadata["created_at"],
        "modified_at": metadata["modified_at"],
    }
    for chunk in chunks
]

print(payloads)  # This will print the payloads before inserting into the database.


collection = client.collections.get("Article")

# Import each payload with its embedding. `payloads` and `embeddings` are
# both built from `chunks`, so pairing them positionally with zip is safe.
with collection.batch.dynamic() as batch:
    for data_row, vector in zip(payloads, embeddings):
        batch.add_object(
            properties=data_row,
            vector=vector
        )
        # Give up early instead of continuing to send a failing import.
        if batch.number_errors > 10:
            print("Batch import stopped due to excessive errors.")
            break

# Report any objects the batch could not import.
failed_objects = collection.batch.failed_objects
if failed_objects:
    print(f"Number of failed imports: {len(failed_objects)}")
    print(f"First failed object: {failed_objects[0]}")

Number of embeddings: 5
[{'text': 'This is the first chunk of the document.', 'filename': 'sample_document.pdf', 'file_id': '12345', 'user_id': 'user_001', 'file_type': 'pdf', 'created_at': '2025-03-03T12:00:00Z', 'modified_at': '2025-03-03T14:30:00Z'}, {'text': 'Second chunk contains some additional data.', 'filename': 'sample_document.pdf', 'file_id': '12345', 'user_id': 'user_001', 'file_type': 'pdf', 'created_at': '2025-03-03T12:00:00Z', 'modified_at': '2025-03-03T14:30:00Z'}, {'text': 'Third chunk has more text.', 'filename': 'sample_document.pdf', 'file_id': '12345', 'user_id': 'user_001', 'file_type': 'pdf', 'created_at': '2025-03-03T12:00:00Z', 'modified_at': '2025-03-03T14:30:00Z'}, {'text': 'Fourth chunk is here.', 'filename': 'sample_document.pdf', 'file_id': '12345', 'user_id': 'user_001', 'file_type': 'pdf', 'created_at': '2025-03-03T12:00:00Z', 'modified_at': '2025-03-03T14:30:00Z'}, {'text': 'Fifth chunk concludes the document.', 'filename': 'sample_document.pdf', 'file_

In [8]:
from weaviate.classes.query import MetadataQuery

# Embed the query text with the same model object used for the stored chunks.
single_text = "This is the first chunk of the document"
query_vector = embed_model.embed_query(single_text)

# Fetch the four nearest objects from "Article" and request the distance of
# each match alongside its properties.
distance_only = MetadataQuery(distance=True)
response = client.collections.get("Article").query.near_vector(
    near_vector=query_vector,
    limit=4,
    return_metadata=distance_only,
)

print(response)
for match in response.objects:
    print(match.properties)
    print(match.metadata.distance)

QueryReturn(objects=[Object(uuid=_WeaviateUUIDInt('76fdd887-c7df-48a2-85cd-d689fe76f56c'), metadata=MetadataReturn(creation_time=None, last_update_time=None, distance=0.014779925346374512, certainty=None, score=None, explain_score=None, is_consistent=None, rerank_score=None), properties={'text': 'This is the first chunk of the document.', 'file_id': '12345', 'file_type': 'pdf', 'created_at': datetime.datetime(2025, 3, 3, 12, 0, tzinfo=datetime.timezone.utc), 'user_id': 'user_001', 'filename': 'sample_document.pdf', 'modified_at': datetime.datetime(2025, 3, 3, 14, 30, tzinfo=datetime.timezone.utc)}, references=None, vector={}, collection='Article'), Object(uuid=_WeaviateUUIDInt('c326e7d1-cdf6-40df-88f7-c5fc07fea9d4'), metadata=MetadataReturn(creation_time=None, last_update_time=None, distance=0.27406060695648193, certainty=None, score=None, explain_score=None, is_consistent=None, rerank_score=None), properties={'text': 'Third chunk has more text.', 'file_id': '12345', 'file_type': 'pdf'