# Sparse, Dense, and Hybrid Search

In [1]:
import requests
import json

# Download the small Jeopardy sample that the rest of the notebook imports.
# The timeout stops the cell from hanging forever on a stalled connection,
# and raise_for_status() surfaces an HTTP error here instead of letting an
# error page fail later as a confusing JSON decode error.
resp = requests.get(
    'https://raw.githubusercontent.com/weaviate-tutorials/quickstart/main/data/jeopardy_tiny.json',
    timeout=30,
)
resp.raise_for_status()
data = json.loads(resp.text)

# Quick sanity check of what was loaded (container type and record count).
print(type(data), len(data))

def json_print(data):
    """Pretty-print ``data`` to stdout as JSON, indented by two spaces.

    ``data`` must be serialisable by ``json.dumps``.
    """
    formatted = json.dumps(data, indent=2)
    print(formatted)

<class 'list'> 10


In [2]:
import weaviate, os
from weaviate.embedded import EmbeddedOptions
from dotenv import load_dotenv
import openai

# Pull settings such as OPENAI_API_KEY from a local .env file, if present.
load_dotenv()

openai.api_key = os.getenv('OPENAI_API_KEY')

# Fail fast: without a key the OpenAI header below would be sent empty and
# the problem would only show up later, when objects are vectorized.
if not openai.api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it or add it to your .env file."
    )

# Start an embedded Weaviate instance and forward the OpenAI key with every
# request so the server-side vectorizer module can call the OpenAI API.
client = weaviate.Client(
    embedded_options=EmbeddedOptions(),
    additional_headers={
        "X-OpenAI-Api-Key": openai.api_key
    }
)

print(f"Client created? {client.is_ready()}")

            your code to use Python client v4 `weaviate.WeaviateClient` connections and methods.

            For Python Client v4 usage, see: https://weaviate.io/developers/weaviate/client-libraries/python
            For code migration, see: https://weaviate.io/developers/weaviate/client-libraries/python/v3_v4_migration
            
{"action":"startup","default_vectorizer_module":"none","level":"info","msg":"the default vectorizer modules is set to \"none\", as a result all new schema classes without an explicit vectorizer setting, will use this vectorizer","time":"2024-08-06T00:07:10-05:00"}
{"action":"startup","auto_schema_enabled":true,"level":"info","msg":"auto schema enabled setting is set to \"true\"","time":"2024-08-06T00:07:10-05:00"}
{"level":"info","msg":"No resource limits set, weaviate will use all available memory and CPU. To limit resources, set LIMIT_RESOURCES=true","time":"2024-08-06T00:07:10-05:00"}
{"level":"info","msg":"module offload-s3 is enabled","time":"2024-08

Client created? True


{"action":"hnsw_prefill_cache_async","level":"info","msg":"not waiting for vector cache prefill, running in background","time":"2024-08-06T00:07:14-05:00","wait_for_cache_prefill":false}
{"level":"info","msg":"Completed loading shard question_479Z9KLU0yZr in 1.430122ms","time":"2024-08-06T00:07:14-05:00"}
{"action":"hnsw_vector_cache_prefill","count":3000,"index_id":"main","level":"info","limit":1000000000000,"msg":"prefilled vector cache","time":"2024-08-06T00:07:14-05:00","took":74126}


In [3]:
# Schema for the collection: only the class name and the vectorizer module
# are specified here.
class_obj = {"class": "Question", "vectorizer": "text2vec-openai"}

# Drop any "Question" class left over from a previous run so this cell can
# be re-executed without a "class already exists" error.
if client.schema.exists("Question"):
    client.schema.delete_class("Question")

client.schema.create_class(class_obj)

{"action":"hnsw_prefill_cache_async","level":"info","msg":"not waiting for vector cache prefill, running in background","time":"2024-08-06T00:07:19-05:00","wait_for_cache_prefill":false}
{"level":"info","msg":"Created shard question_E5xXWIvSAYAO in 2.581115ms","time":"2024-08-06T00:07:19-05:00"}
{"action":"hnsw_vector_cache_prefill","count":1000,"index_id":"main","level":"info","limit":1000000000000,"msg":"prefilled vector cache","time":"2024-08-06T00:07:19-05:00","took":220472}


In [4]:
# Import every record through the client's batch interface, configured with
# a batch size of 5; the `with` block scopes the batch to this import.
with client.batch.configure(batch_size=5) as batch:
    for i, record in enumerate(data):
        print(f"importing question: {i+1}")

        # The dataset uses capitalised keys; the stored properties use
        # lower-case names.
        batch.add_data_object(
            data_object={
                "answer": record["Answer"],
                "question": record["Question"],
                "category": record["Category"],
            },
            class_name="Question",
        )

importing question: 1
importing question: 2
importing question: 3
importing question: 4
importing question: 5
importing question: 6
importing question: 7
importing question: 8
importing question: 9
importing question: 10


## Dense Search

In [5]:
# Dense (vector) search: rank questions by semantic closeness to the
# concept "animal" and keep the top 3.
dense_query = client.query.get("Question", ["question", "answer"])
dense_query = dense_query.with_near_text({"concepts":["animal"]})
dense_query = dense_query.with_limit(3)
response = dense_query.do()

json_print(response)

{
  "data": {
    "Get": {
      "Question": [
        {
          "answer": "Elephant",
          "question": "It's the only living mammal in the order Proboseidea"
        },
        {
          "answer": "the nose or snout",
          "question": "The gavial looks very much like a crocodile except for this bodily feature"
        },
        {
          "answer": "Antelope",
          "question": "Weighing around a ton, the eland is the largest species of this animal in Africa"
        }
      ]
    }
  }
}


## Sparse Search - BM25

In [6]:
# Sparse (keyword) search: BM25 scoring for the term "animal", top 3.
bm25_query = client.query.get("Question", ["question", "answer"])
bm25_query = bm25_query.with_bm25(query="animal")
bm25_query = bm25_query.with_limit(3)
response = bm25_query.do()

json_print(response)

{
  "data": {
    "Get": {
      "Question": [
        {
          "answer": "Antelope",
          "question": "Weighing around a ton, the eland is the largest species of this animal in Africa"
        }
      ]
    }
  }
}


## Hybrid Search

In [7]:
# Hybrid search with alpha=0.5: the keyword and vector rankings are blended
# with equal weight.
hybrid_query = client.query.get("Question", ["question", "answer"])
hybrid_query = hybrid_query.with_hybrid(query="animal", alpha=0.5)
hybrid_query = hybrid_query.with_limit(3)
response = hybrid_query.do()

json_print(response)

{
  "data": {
    "Get": {
      "Question": [
        {
          "answer": "Antelope",
          "question": "Weighing around a ton, the eland is the largest species of this animal in Africa"
        },
        {
          "answer": "Elephant",
          "question": "It's the only living mammal in the order Proboseidea"
        },
        {
          "answer": "the nose or snout",
          "question": "The gavial looks very much like a crocodile except for this bodily feature"
        }
      ]
    }
  }
}


In [8]:
# Hybrid search with alpha=0: only the keyword (BM25) side contributes, so
# the result matches the BM25 query.
keyword_only_query = client.query.get("Question", ["question", "answer"])
keyword_only_query = keyword_only_query.with_hybrid(query="animal", alpha=0)
keyword_only_query = keyword_only_query.with_limit(3)
response = keyword_only_query.do()

json_print(response)

{
  "data": {
    "Get": {
      "Question": [
        {
          "answer": "Antelope",
          "question": "Weighing around a ton, the eland is the largest species of this animal in Africa"
        }
      ]
    }
  }
}


In [9]:
# Hybrid search with alpha=1: only the vector side contributes, so the
# result matches the dense near-text query.
vector_only_query = client.query.get("Question", ["question", "answer"])
vector_only_query = vector_only_query.with_hybrid(query="animal", alpha=1)
vector_only_query = vector_only_query.with_limit(3)
response = vector_only_query.do()

json_print(response)

{
  "data": {
    "Get": {
      "Question": [
        {
          "answer": "Elephant",
          "question": "It's the only living mammal in the order Proboseidea"
        },
        {
          "answer": "the nose or snout",
          "question": "The gavial looks very much like a crocodile except for this bodily feature"
        },
        {
          "answer": "Antelope",
          "question": "Weighing around a ton, the eland is the largest species of this animal in Africa"
        }
      ]
    }
  }
}
