In [58]:
import os
import weaviate
from dotenv import load_dotenv

load_dotenv()

auth_config = weaviate.auth.AuthApiKey(
    api_key="G3YW1TL4j6geVGc0H091LO77ysqTob2z5EbU"
)

In [59]:
import cohere

# login https://dashboard.cohere.com/api-keys to create COHERE_API_KEY

client = weaviate.Client(
    url="https://3fmi0udzstkt7afklk464g.c0.us-east1.gcp.weaviate.cloud",
    auth_client_secret=auth_config,
    additional_headers={
        "X-Cohere-Api-Key": os.getenv("COHERE_API_KEY"),
    }
)

co = cohere.Client(os.getenv("COHERE_API_KEY"))

In [3]:
import logging

# Define schema
schema = {
    "classes": [
        {
            "class": "Test",
            "properties": [
                {"name": "title", "dataType": ["text"]},
                {"name": "content", "dataType": ["text"]},
                {"name": "vector", "dataType": ["number[]"]}
            ]
        }
    ]
}

# Check if class 'Test' already exists
existing_classes = client.schema.get()["classes"]
if not any(cls["class"] == "Test" for cls in existing_classes):
    # Create schema in Weaviate
    client.schema.create(schema)
else:
    print("Class 'Test' already exists in Weaviate schema.")

# Add documents to Weaviate
documents = [
    {"title": "Nature 1", "content": "The forest is full of trees."},
    {"title": "Nature 2", "content": "The ocean is vast and blue."},
    {"title": "Nature 3", "content": "Mountains are high and majestic."},
    {"title": "Nature 4", "content": "Rivers flow through the valleys and forest"},
    {"title": "Nature 5", "content": "Deserts are dry and hot."},
    {"title": "Nature 6", "content": "Rainforests are dense and humid."},
    {"title": "Nature 7", "content": "The tundra is cold and barren."},
    {"title": "Nature 8", "content": "Savannas are grassy and open."},
    {"title": "Nature 9", "content": "Lakes are calm and serene."},
    {"title": "Nature 10", "content": "Wetlands are rich in biodiversity."},
    {"title": "Nature 11", "content": "Lot of forest"}
]

# Extract contents for batch embedding
contents = [doc["content"] for doc in documents]

# Embed all documents' contents at once
embeddings = co.embed(texts=contents).embeddings
print(embeddings)

# Add embeddings to documents
for i, doc in enumerate(documents):
    doc["vector"] = embeddings[i]

# Configure batch processing
client.batch.configure(batch_size=10, dynamic=True)

# Add documents to batch
try:
    with client.batch as batch:
        for doc in documents:
            batch.add_data_object(
                data_object=doc,
                class_name="Test"
            )
    logging.info("Documents inserted successfully")
except Exception as e:
    logging.error(f"Error occurred during batch insertion: {e}")

print("Documents inserted successfully")


[[1.7167969, -1.2138672, -0.99902344, -0.50439453, 0.0881958, -0.7270508, 0.6665039, 1.0771484, -2.1503906, 0.58203125, 0.23022461, -0.453125, -1.8837891, 1.1416016, 2.3398438, -0.0803833, -1.5751953, 1.703125, 0.16149902, -2.9101562, -1.6494141, 1.4013672, 0.62353516, 1.1425781, 1.6699219, -1.8857422, 0.39892578, -2.3339844, -1.0761719, -0.44580078, 1.3105469, 0.39135742, -0.4020996, 2.0214844, -2.0898438, -1.5136719, 0.12200928, -0.3479004, 0.011192322, -0.20373535, -0.3203125, -1.0410156, 0.72998047, -1.40625, 2.0976562, -0.5966797, -0.79785156, 1.7294922, 0.8276367, 1.1660156, 0.8071289, -0.057556152, 0.7895508, -2.1347656, 0.5810547, 1.40625, -4.3320312, -0.12249756, 0.6796875, -0.25048828, -0.39794922, -0.67089844, 1.1230469, 0.8486328, -1.4277344, 0.6010742, -2.8886719, 2.765625, -1.0683594, 1.34375, 2.4472656, -1.1552734, -0.123046875, 0.33862305, -1.1425781, -2.0, 2.0351562, 1.1845703, 0.49780273, -0.2980957, -2.0761719, -1.2158203, -1.7998047, -1.5966797, 1.3925781, -0.577148

In [60]:
query = """
{
  Get {
    Test {
      title
      vector
      _additional {
        id
      }
    }
  }
}
"""

# Execute the query
result = client.query.raw(query)

# Filter documents with non-null vectors and get their titles
titles_with_vectors = [
    doc['title'] for doc in result['data']['Get']['Test'] if doc['vector'] is not None
]

print(f"Titles of documents with vectors: {titles_with_vectors}")

Titles of documents with vectors: ['Nature 2', 'Nature 3', 'Nature 7', 'Nature 10', 'Nature 5', 'Nature 1', 'Nature 9', 'Nature 8', 'Nature 6', 'Nature 11', 'Nature 4']


In [61]:
# Sample query
query = "forest"

In [62]:
# Function to perform keyword search
def keyword_search(query):
    response = client.query.get("Test", ["title", "content"]).with_where({
        "operator": "Like",
        "path": ["content"],
        "valueText": f"*{query}*"
    }).do()
    return [result["title"] for result in response["data"]["Get"]["Test"]]



# Function to perform vector search
def dense_retrieval(query,
                    client,
                    properties=["title","content"],
                    num_results=5):
    nearVector = {"concepts": [query]}

    # To filter by language
    
    response = (
        client.query
        .get("Test", properties)
        .with_near_vector(nearVector)
        .with_limit(num_results)
        .do()
    )

    return response['data']['Get']['Test']




In [64]:
# Perform keyword search
keyword_results = keyword_search(query)
print(f"Keyword search results: {keyword_results}")

# Perform vector search
vector_results = dense_retrieval(query,client)
print(f"Vector search results: {vector_results}")




Keyword search results: ['Nature 1', 'Nature 4', 'Nature 6', 'Nature 11']


KeyError: "No 'vector' key in `content` argument."