# BM 25 (Best Matching 25)

- Qdrant does NOT support BM 25.
- It supports the following:
  - Semantic search using embedding models that produce dense vectors.
  - Splade whcih uses sparse vectors (similar to BM 25).
  - [BM-42 ](#bm-42)
    - 'using-hybrid-search'


## Note

- install

```sh
pip install -U "huggingface_hub[cli]"
```

- Scan cached huggingface models

```sh
huggingface-cli scan-cache
```

- Delete cached huggingface models

```sh
huggingface-cli delete-cache
```

<br>

### Check Fast Embed Cache Dir

```py
import os
import tempfile

# Set the default cache directory
default_cache_dir = os.path.join(tempfile.gettempdir(), "fastembed_cache")

# Print the default cache directory
print(f"The default cache directory is: {default_cache_dir}")
```

In [1]:
%load_ext watermark
%watermark -v -p numpy,pandas,polars,torch,lightning --conda

Python implementation: CPython
Python version       : 3.10.14
IPython version      : 8.26.0

numpy    : 1.26.4
pandas   : 2.2.2
polars   : 1.1.0
torch    : 2.2.2
lightning: not installed

conda environment: ai_search



In [2]:
# Built-in library
from pathlib import Path
import re
import json
from typing import Any, Generator, Optional, Union
import logging
import warnings

# Standard imports
import numpy as np
import numpy.typing as npt
from pprint import pprint
import pandas as pd
import polars as pl
from rich.console import Console
from rich.theme import Theme

custom_theme = Theme(
    {
        "info": "#76FF7B",
        "warning": "#FBDDFE",
        "error": "#FF0000",
    }
)
console = Console(theme=custom_theme)

# Visualization
import matplotlib.pyplot as plt


# Pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

warnings.filterwarnings("ignore")


# Black code formatter (Optional)
%load_ext lab_black

# auto reload imports
%load_ext autoreload
%autoreload 2

In [3]:
import os
import tempfile


def print_cache_dir() -> str:
    """
    Prints and returns the default cache directory for fastembed.

    Returns:
        str: The path to the default cache directory.
    """
    default_cache_dir: str = os.path.join(tempfile.gettempdir(), "fastembed_cache")
    print(f"The default cache directory is: {default_cache_dir}")
    return default_cache_dir


def list_all_contents(cache_dir: str) -> None:
    # List contents of the default cache directory
    try:
        contents = os.listdir(cache_dir)
        for item in contents:
            print(item)
    except FileNotFoundError:
        print(f"Directory '{cache_dir}' not found.")


default_cache_dir: str = print_cache_dir()
default_cache_dir

The default cache directory is: /var/folders/ny/dl75sc_x2tb54lsymt5bh5p00000gn/T/fastembed_cache


'/var/folders/ny/dl75sc_x2tb54lsymt5bh5p00000gn/T/fastembed_cache'

In [4]:
list_all_contents(cache_dir=default_cache_dir)

models--Qdrant--bm25
models--qdrant--bge-small-en-v1.5-onnx-q
models--Qdrant--all_miniLM_L6_v2_with_attentions
.locks
fast-bge-small-en
tmp


<br><hr>

## Semantic Search Using fast Embed And Sentence Transformer

In [5]:
import torch

from qdrant_client import models, QdrantClient
from sentence_transformers import SentenceTransformer

# Using fast embed
from fastembed import TextEmbedding

In [6]:
# Data
docs: list[str] = [
    {
        "id": "doc1",
        "title": "Worst",
        "tag": "none",
        "description": "The interesting thing is that the person in the wrong made the right decision in the end.",
    },
    {
        "id": "doc2",
        "title": "Best",
        "tag": "educational",
        "description": "My favorite book is the cat in the hat, which is about a crazy cat who breaks into a house and creates a crazy afternoon for two kids.",
    },
    {
        "id": "doc3",
        "title": "Okay",
        "tag": "educational",
        "description": "My neighbors let the stray cat stay in their garage, which resulted in my favorite hat that I let them borrow being ruined.",
    },
    {
        "id": "doc4",
        "title": "Best ^2",
        "tag": "educational",
        "description": (
            "A small, mischievous cat wears an oversized, striped hat. The contrast between the cat's size and its large hat creates "
            "a comical image. This clever, hat-wearing cat becomes locally famous for its playful tricks and distinctive appearance."
        ),
    },
]

ids: list[str] = [x.get("id") for x in docs]
description: list[str] = [x.get("description") for x in docs]

In [7]:
# This will trigger the model download and initialization
model = TextEmbedding()
print("[INFO] The model is ready to use.")

embeddings_list = list(model.embed(description))
embeddings_dim: int = len(embeddings_list[0])  # Vector of 384 dimensions
embeddings_dim

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

[INFO] The model is ready to use.


384

In [8]:
# Fast embed uses the method `.embed` to encode/embed
list(model.embed(["Hi there!"]))[0].shape

(384,)

In [9]:
# Define storage

# client = QdrantClient(":memory:")
URL: str = "http://localhost:6333"
qdrant_client = QdrantClient(URL)
INDEX_NAME_1: str = "test_collection"

vectors_config = models.VectorParams(
    size=embeddings_dim,  # Vector size is defined by used model
    distance=models.Distance.COSINE,
)

# Create collection Use recreate_collection for repeated experiments. It attempts to
# remove any existing collection with the same name before creating a new one.
qdrant_client.recreate_collection(
    collection_name=INDEX_NAME_1, vectors_config=vectors_config
)

True

In [10]:
vectors: list[np.ndarray] = list(model.embed(description))

# Note: Qdrant does not support NumPy arrays!
vectors: list[list[float]] = [list(v) for v in vectors]
len(vectors)

4

## Putting It All Together

In [11]:
class TextEmbeddingModel:
    def __init__(self, model: TextEmbedding, batch_size: int = 256) -> None:
        self.model = model
        self.batch_size = batch_size
        self.emb_dim = self.get_dim()

    def __repr__(self) -> str:
        return (
            f"{self.__class__.__name__}"
            f"(model={self.model.__class__.__name__}, emb_dim={self.emb_dim})"
        )

    def encode(self, text: str | list[str]) -> list[list[float]]:
        return self._encode(text)

    def _encode(self, text: str | list[str]) -> list[list[float]]:
        """
        Encode the input text into a list of float vectors.
        NOTE: Qdrant does not support NumPy arrays!
        """
        raw_vec: list[list[np.ndarray]] = list(
            self.model.embed(text, batch_size=self.batch_size)
        )
        vectors: list[list[float]] = [list(v) for v in raw_vec]
        return vectors

    def get_dim(self) -> int:
        return len(self._encode(text="Hi")[0])

In [12]:
emb_model: TextEmbeddingModel = TextEmbeddingModel(model=model)
emb_model

TextEmbeddingModel(model=TextEmbedding, emb_dim=384)

In [13]:
# Note: Qdrant does not support NumPy arrays!
vectors: list[list[float]] = emb_model.encode(description)
len(vectors)

4

In [14]:
# Upload data to collection
qdrant_client.upload_collection(
    collection_name=INDEX_NAME_1, vectors=list(vectors), payload=docs
)

In [15]:
# Get collection details
collection_info: Any = qdrant_client.get_collection(collection_name=INDEX_NAME_1)
console.print(collection_info)

In [16]:
# Send a query to the collection.
query: str = "the cat in the hat"

hits = qdrant_client.search(
    collection_name=INDEX_NAME_1,
    query_vector=emb_model.encode(query)[0],
    limit=3,
)
for hit in hits:
    console.print(hit.payload, "score:", hit.score)

In [17]:
query

'the cat in the hat'

In [18]:
# Add filter
condition_1: Any = models.FieldCondition(
    key="tag", match=models.MatchValue(value="educational")
)
condition_2: Any = models.FieldCondition(
    key="tag", match=models.MatchValue(value="none")
)

query_filter_1 = models.Filter(should=[condition_1, condition_2])

# OR
query_filter_2: dict[str, Any] = {
    "should": [
        {"key": "tag", "match": {"value": "educational"}},
        {"key": "tag", "match": {"value": "none"}},
    ]
}

In [19]:
hits = qdrant_client.search(
    collection_name=INDEX_NAME_1,
    query_vector=emb_model.encode(query)[0],
    query_filter=query_filter_2,
    limit=3,
)
for hit in hits:
    console.print(hit.payload, "score:", hit.score)

### Using Sentence Transformer model to generate sentence embeddings

In [20]:
# Define storage
# client = QdrantClient(":memory:")
URL: str = "http://localhost:6333"
qdrant_client = QdrantClient(URL)
INDEX_NAME_2: str = "test_collection 2"

vectors_config = models.VectorParams(
    size=embeddings_dim,  # Vector size is defined by used model
    distance=models.Distance.COSINE,
)

# Create collection Use recreate_collection for repeated experiments. It attempts to
# remove any existing collection with the same name before creating a new one.
qdrant_client.recreate_collection(
    collection_name=INDEX_NAME_2, vectors_config=vectors_config
)

True

In [21]:
checkpoint: str = "sentence-transformers/all-MiniLM-L6-v2"
device: str | torch.device = "cuda" if torch.cuda.is_available() else "cpu"
# Load the model from HuggingFace Hub
encoder = SentenceTransformer(checkpoint, device=device)

encoder

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [22]:
seq_lenth, emb_dim = (
    encoder.get_max_seq_length(),
    encoder.get_sentence_embedding_dimension(),
)
seq_lenth, emb_dim

(256, 384)

In [23]:
INDEX_NAME_2: str = "test_collection 2"

vectors_config = models.VectorParams(
    size=emb_dim,  # Vector size is defined by used model
    distance=models.Distance.COSINE,
)

# Create collection Use recreate_collection for repeated experiments. It attempts to
# remove any existing collection with the same name before creating a new one.
qdrant_client.recreate_collection(
    collection_name=INDEX_NAME_2, vectors_config=vectors_config
)

True

In [24]:
vectors_2: list[list[float]] = encoder.encode(description).tolist()
len(vectors_2)

4

In [25]:
# Upload data to collection
qdrant_client.upload_collection(
    collection_name=INDEX_NAME_2, vectors=(vectors_2), payload=docs
)

In [26]:
# Send a query to the collection.
query: str = "the cat in the hat"

hits = qdrant_client.search(
    collection_name=INDEX_NAME_2,
    query_vector=list(encoder.encode(query)),
    limit=3,
)
for hit in hits:
    console.print(hit.payload, "score:", hit.score)

### Using Hybrid Search (BM 42) <a id='bm-42'></a>

- Dense Vectors
- Sparse Vectors (Splade)

In [27]:
from fastembed import SparseEmbedding, SparseTextEmbedding, TextEmbedding

In [28]:
from qdrant_client.models import SparseVector

In [29]:
console.print(SparseTextEmbedding.list_supported_models())

In [30]:
console.print(TextEmbedding.list_supported_models())

In [31]:
# Create Sparse Embeddings
sparse_checkpoint: str = "Qdrant/bm42-all-minilm-l6-v2-attentions"
dense_checkpoint: str = "BAAI/bge-small-en"  # 130 MB
batch_size: int = 32

# This triggers the model download
sparse_text_model = SparseTextEmbedding(
    model_name=sparse_checkpoint, batch_size=batch_size
)
dense_text_model = TextEmbedding(model_name=dense_checkpoint, batch_size=batch_size)

Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

In [32]:
text: str = "Hi there! My name is Mike and I'm a software engineer."
sparse_embedding: list[Any] = list(sparse_text_model.embed(text))
pprint(sparse_embedding)

[SparseEmbedding(values=array([0.18474873, 0.15723869, 0.38667383, 0.29155952, 0.23703232]),
                 indices=array([ 948991206,  609270800, 1581444980, 1464642605,  856081220]))]


In [33]:
from transformers import AutoTokenizer


def get_tokens_and_weights(
    sparse_embedding: SparseTextEmbedding, model_name: str
) -> dict[str, float]:
    """
    Retrieves tokens and their corresponding weights from a sparse embedding for a given model.

    === COPIED! ===

    Args:
        sparse_embedding (SparseTextEmbedding): The sparse text embedding object.
        model_name (str): The name of the model to use for tokenization.

    Returns:
        dict[str, float]: A dictionary of tokens and their weights, sorted by weight in
        descending order.

    Raises:
        ValueError: If the specified model is not found in the supported models.
    """
    # Find the tokenizer for the model
    tokenizer_source: str | None = None
    for model_info in SparseTextEmbedding.list_supported_models():
        if model_info["model"].lower() == model_name.lower():
            tokenizer_source = model_info["sources"]["hf"]
            break
    if tokenizer_source is None:
        raise ValueError(f"Model {model_name} not found in the supported models.")

    tokenizer: AutoTokenizer = AutoTokenizer.from_pretrained(tokenizer_source)
    token_weight_dict: dict[str, float] = {}
    for i in range(len(sparse_embedding.indices)):
        token: str = tokenizer.decode([sparse_embedding.indices[i]])
        weight: float = sparse_embedding.values[i]
        token_weight_dict[token] = weight

    # Sort the dictionary by weights
    token_weight_dict = dict(
        sorted(token_weight_dict.items(), key=lambda item: item[1], reverse=True)
    )
    return token_weight_dict

In [34]:
# Test the function with the first SparseEmbedding
print(
    json.dumps(get_tokens_and_weights(sparse_embedding[0], sparse_checkpoint), indent=4)
)

{
    "": 0.2370323172532484
}


In [35]:
class SparseEmbeddingModel:
    def __init__(self, model: SparseEmbedding, batch_size: int = 256) -> None:
        self.model = model
        self.batch_size = batch_size

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}" f"(model={self.model.__class__.__name__})"

    def encode(self, text: str | list[str]) -> list[list[float]]:
        return self._encode(text)

    def _encode(self, text: str | list[str]) -> list[Any]:
        """
        Encode the input text into a list of float vectors.
        NOTE: Qdrant does not support NumPy arrays!
        """
        raw_vec: list[Any] = list(self.model.embed(text, batch_size=self.batch_size))
        return raw_vec

In [36]:
dense_emb_model: TextEmbeddingModel = TextEmbeddingModel(model=dense_text_model)
len(dense_emb_model.encode(["Fastembed is a great library for text embeddings!"]))

1

In [37]:
sparse_emb_model: SparseEmbeddingModel = SparseEmbeddingModel(model=sparse_text_model)
sparse_emb_model.encode(["Fastembed is a great library for text embeddings!"])

[SparseEmbedding(values=array([0.43995775, 0.1121665 , 0.19993707, 0.26299376, 0.3384564 ]), indices=array([ 682147660,  198527388, 1096988414,  970674652,  959644756]))]

In [38]:
collection_name: str = "esci"

qdrant_client.recreate_collection(
    collection_name,
    vectors_config={
        "text-dense": models.VectorParams(
            size=dense_emb_model.emb_dim,
            distance=models.Distance.COSINE,
        )
    },
    sparse_vectors_config={
        "text-sparse": models.SparseVectorParams(
            index=models.SparseIndexParams(
                on_disk=False,
            )
        )
    },
)

True

In [39]:
docs_sparse_embeddings = sparse_emb_model.encode(description)
docs_dense_embeddings = dense_emb_model.encode(description)
docs_sparse_embeddings

[SparseEmbedding(values=array([0.14969568, 0.09335342, 0.15121029, 0.23425646, 0.1376583 ,
        0.16643596, 0.21417318, 0.17146346]), indices=array([ 570245443, 1218230531,    8834632, 1755673004,  842018159,
        1786548735,  590062121,  939215365])),
 SparseEmbedding(values=array([0.15901294, 0.16990311, 0.2502022 , 0.2534479 , 0.21932603,
        0.1823688 , 0.19029973, 0.1122133 , 0.24716893, 0.12398718,
        0.15895014]), indices=array([1941289186,  962346254, 1751422759, 1485265044,  278976114,
        1271411175, 1888595001,  958659146,  643730452,  358389376,
         542041464])),
 SparseEmbedding(values=array([0.27084355, 0.12897235, 0.33127651, 0.26735239, 0.14319685,
        0.32128052, 0.06343308, 0.18268168, 0.2496442 , 0.17412035,
        0.20139362]), indices=array([1196854555,  219850568, 2036852695, 1751422759, 1881538586,
         484699926,  442064690, 1941289186, 1485265044,   36234414,
         356747255])),
 SparseEmbedding(values=array([0.14740428, 0.25

In [40]:
sparse_vector = models.SparseVector(
    indices=[x for row in docs_sparse_embeddings for x in row.indices],
    values=[x for row in docs_sparse_embeddings for x in row.values],
)

dense_vector: list[list[float]] = docs_sparse_embeddings.copy()

In [41]:
ids: list[int] = list(range(1, len(docs) + 1))

In [42]:
type(sparse_embedding)

list

In [43]:
# Not supported by Polars
# df: pl.DataFrame = pl.DataFrame(docs)
# df = df.with_columns(
#     id=pl.int_range(0, len(df)),
#     sparse_vector=sparse_emb_model.encode(description),
#     dense_vector=dense_emb_model.encode(description),
# )
# df

In [44]:
df: pd.DataFrame = pd.DataFrame(docs)
df = df.assign(
    # id=range(0, len(df)),
    sparse_vector=sparse_emb_model.encode(description),
    dense_vector=dense_emb_model.encode(description),
)
df.head()

Unnamed: 0,id,title,tag,description,sparse_vector,dense_vector
0,doc1,Worst,none,The interesting thing is that the person in the wrong made the right decision in the end.,"SparseEmbedding(values=array([0.14969568, 0.09335342, 0.15121029, 0.23425646, 0.1376583 ,\n 0.16643596, 0.21417318, 0.17146346]), indices=array([ 570245443, 1218230531, 8834632, 1755673004, 842018159,\n 1786548735, 590062121, 939215365]))","[-0.048078645, 0.01467914, -0.012452637, -0.03642915, -0.004653536, -0.0053089582, 0.10573982, 0.029634453, -0.0016554621, 0.0287229, -0.012545165, -0.008471039, 0.0006166894, 0.03190773, -0.0125725, 0.029556325, -0.019452937, 0.033809673, -0.052446038, 0.038998958, 0.011848187, -0.044565942, -0.00855681, -0.056914102, -0.0006278614, 0.031495422, -0.0010878078, -0.031419262, -0.038800262, -0.15459208, 0.0081177605, -0.013756268, 0.0017535368, -0.016341988, -0.03165736, 0.0034405112, -0.032791294, 0.020706574, -0.040468074, 0.0072153993, 0.039510034, 0.03777395, -0.022093473, -0.030331263, ..."
1,doc2,Best,educational,"My favorite book is the cat in the hat, which is about a crazy cat who breaks into a house and creates a crazy afternoon for two kids.","SparseEmbedding(values=array([0.15901294, 0.16990311, 0.2502022 , 0.2534479 , 0.21932603,\n 0.1823688 , 0.19029973, 0.1122133 , 0.24716893, 0.12398718,\n 0.15895014]), indices=array([1941289186, 962346254, 1751422759, 1485265044, 278976114,\n 1271411175, 1888595001, 958659146, 643730452, 358389376,\n 542041464]))","[-0.015868006, -0.02084041, 0.0019805457, 0.018129913, 0.015042529, -0.008550446, 0.04986082, 0.026835369, 0.012720196, 0.011698756, -0.0024818666, -0.014757552, 0.0058412747, 0.043222662, 0.046533756, -0.012222821, -0.008858704, -0.0038479283, -0.07243998, 0.042973693, 0.0029487032, 0.0036375923, -0.009932427, -0.035494726, -0.013168423, 0.045112487, -0.024490234, -0.0016997731, -0.029682389, -0.13501835, -0.012910893, 0.001408267, -0.00010038294, -0.034467276, -0.015208953, 0.0036533433, 0.0016916429, -0.0056047607, 0.027560642, 0.07450399, 0.02335611, 0.024201691, -7.665404e-05, 0.02584..."
2,doc3,Okay,educational,"My neighbors let the stray cat stay in their garage, which resulted in my favorite hat that I let them borrow being ruined.","SparseEmbedding(values=array([0.27084355, 0.12897235, 0.33127651, 0.26735239, 0.14319685,\n 0.32128052, 0.06343308, 0.18268168, 0.2496442 , 0.17412035,\n 0.20139362]), indices=array([1196854555, 219850568, 2036852695, 1751422759, 1881538586,\n 484699926, 442064690, 1941289186, 1485265044, 36234414,\n 356747255]))","[-0.0025230697, 0.007829302, 0.02123328, 0.022239283, 0.043842122, -0.04409191, 0.12562664, 0.033505928, -0.012769881, 0.025701609, 0.0066717677, -0.025987275, 0.06264837, 0.022630764, 0.014546923, 0.0022876204, -0.048622757, 0.06319119, -0.07457686, 0.014157831, -0.044407044, -0.046941556, -0.013395932, -0.0552021, -0.040591516, 0.08581222, -0.027931131, 0.0025866476, -0.030401496, -0.12299114, 0.01711748, -0.0019950594, 0.023423158, -0.0026003113, 0.008074183, 0.0012621706, -0.02644635, -0.01352253, -0.019588778, 0.0779239, 0.043555494, 0.04477806, -0.0029049793, -0.007841673, -0.0001372..."
3,doc4,Best ^2,educational,"A small, mischievous cat wears an oversized, striped hat. The contrast between the cat's size and its large hat creates a comical image. This clever, hat-wearing cat becomes locally famous for its playful tricks and distinctive appearance.","SparseEmbedding(values=array([0.14740428, 0.2562321 , 0.25853975, 0.11805804, 0.21566859,\n 0.279406 , 0.18560165, 0.10002381, 0.09175764, 0.11964324,\n 0.06184279, 0.11864131, 0.12177058, 0.092355 , 0.06036505,\n 0.13904627, 0.13938367, 0.14070773, 0.1241206 , 0.07236957,\n 0.08092909]), indices=array([1736980362, 1525845572, 1751422759, 1667382587, 191347167,\n 242130862, 1485265044, 1569927329, 309782534, 2117458213,\n 958659146, 1712984554, 301030427, 1233057667, 1473125286,\n 30662561, 298181609, 1732471882, 1594097914, 236893549,\n...","[-0.021247633, -0.02380462, 0.034073215, 0.026637726, 0.033867214, -0.005683298, 0.12084282, 0.0514646, -0.011850636, 0.026090428, -0.020847997, -0.041520994, 0.027258094, 0.031181991, 0.025421673, -0.009230498, -0.015425141, -0.0010277177, -0.03354282, 0.034722015, 0.0018352456, 0.021946926, -0.013787556, -0.03705059, -0.03725064, 0.017640788, -0.06539482, 0.0038632446, -0.023894457, -0.16818714, 0.007752478, 0.00081574894, 0.010525733, -0.022281442, -0.020858316, -0.01878057, 0.006054431, -0.00017780517, -0.0044783503, 0.06598948, 0.0035978362, 0.017081393, -0.017300868, 0.013280726, -0...."


In [45]:
console.print(df.head(2).to_dict(orient="records"))

In [47]:
def make_points(df: pd.DataFrame) -> list[models.PointStruct]:
    """
    Create a list of PointStruct objects from a pandas DataFrame.

    Args:
        df (pd.DataFrame): Input DataFrame containing 'sparse_vector', 'description',
        'dense_vector', 'title', and 'tag' columns.

    Returns:
        list[models.PointStruct]: A list of PointStruct objects representing the data points.
    """
    sparse_vectors: list[list[float]] = df["sparse_vector"].tolist()
    description: list[str] = df["description"].tolist()
    dense_vectors: list[Any] = df["dense_vector"].tolist()
    rows: list[dict[str, Any]] = df.to_dict(orient="records")
    points: list[models.PointStruct] = []

    for idx, (text, sparse_vector, dense_vector) in enumerate(
        zip(description, sparse_vectors, dense_vectors)
    ):
        sparse_vector: SparseVector = SparseVector(
            indices=sparse_vector.indices.tolist(), values=sparse_vector.values.tolist()
        )
        point: models.PointStruct = models.PointStruct(
            id=idx,
            payload={
                "text": text,
                "title": rows[idx]["title"],
                "tag": rows[idx]["tag"],
            },  # Add any additional payload if necessary
            vector={
                "text-sparse": sparse_vector,
                "text-dense": dense_vector,
            },
        )
        points.append(point)
    return points

In [48]:
points: list[models.PointStruct] = make_points(df)

In [49]:
# Upload points to Qdrant
qdrant_client.upsert(collection_name, points)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [50]:
def search(
    query_text: str,
    sparse_model: SparseEmbeddingModel = sparse_emb_model,
    dense_model: TextEmbeddingModel = dense_emb_model,
    limit: int = 10,
) -> list[list[models.ScoredPoint]]:
    """
    Perform a search using both sparse and dense embeddings.

    Args:
        query_text (str): The text to search for.
        sparse_model (SparseTextEmbedding): The model for generating sparse embeddings.
        dense_model (TextEmbeddingModel): The model for generating dense embeddings.
        limit (int): The maximum number of results to return.

    Returns:
        list[list[models.ScoredPoint]]: A list of search responses from Qdrant.
    """
    query_sparse_vectors: list[Any] = sparse_model.encode([query_text])
    query_dense_vector: list[list[float]] = dense_model.encode([query_text])

    search_results: list[list[models.ScoredPoint]] = qdrant_client.search_batch(
        collection_name=collection_name,
        requests=[
            models.SearchRequest(
                vector=models.NamedVector(
                    name="text-dense",
                    vector=query_dense_vector[0],
                ),
                limit=limit,
                with_payload=True,
            ),
            models.SearchRequest(
                vector=models.NamedSparseVector(
                    name="text-sparse",
                    vector=SparseVector(
                        indices=query_sparse_vectors[0].indices.tolist(),
                        values=query_sparse_vectors[0].values.tolist(),
                    ),
                ),
                limit=limit,
                with_payload=True,
            ),
        ],
    )

    return search_results

In [51]:
query: str = "the cat in the hat"
search_results = search(query)
console.print(search_results)

In [52]:
# text-dense
console.print(search_results[0])

In [53]:
# text-sparse
console.print(search_results[1])

In [54]:
# Ranking
# We'll combine the results from the two models using Reciprocal Rank Fusion (RRF).
# You can read more about RRF here. We select RRF for this task because: 1. It is a simple
# and effective method for combining search results. 2. It is robust to the differences in
# the ranking scores of the two or more ranking lists. 3. It is easy to implement and requires
# minimal tuning (only one parameter: alpha, which we don't tune here).
def rrf(
    rank_lists: list[tuple[str, int]], alpha: int = 60, default_rank: int = 1000
) -> list[tuple[str, float]]:
    """
    Optimized Reciprocal Rank Fusion (RRF) using NumPy for large rank lists.

    Args:
        rank_lists (list[tuple[str, int]]): A list of rank lists, where each rank list is a
        list of tuples (item, rank).
        alpha (int, optional): The constant used in RRF calculation. Defaults to 60.
        default_rank (int, optional): The default rank for items not present in a list. Defaults to 1000.

    Returns:
        list[tuple[str, float]]: A list of tuples (item, score) sorted by descending RRF score.
    """
    # Consolidate all unique items from all rank lists
    all_items: set[str] = set(item for rank_list in rank_lists for item, _ in rank_list)

    # Create a mapping of items to indices
    item_to_index: dict[str, int] = {item: idx for idx, item in enumerate(all_items)}

    # Initialize a matrix to hold the ranks, filled with the default rank
    rank_matrix: np.ndarray = np.full((len(all_items), len(rank_lists)), default_rank)

    # Fill in the actual ranks from the rank lists
    for list_idx, rank_list in enumerate(rank_lists):
        for item, rank in rank_list:
            rank_matrix[item_to_index[item], list_idx] = rank

    # Calculate RRF scores using NumPy operations
    rrf_scores: np.ndarray = np.sum(1.0 / (alpha + rank_matrix), axis=1)

    # Sort items based on RRF scores
    sorted_indices: np.ndarray = np.argsort(
        -rrf_scores
    )  # Negative for descending order

    # Retrieve sorted items
    sorted_items: list[tuple[str, float]] = [
        (list(item_to_index.keys())[idx], rrf_scores[idx]) for idx in sorted_indices
    ]

    return sorted_items

In [55]:
# Example usage
rank_list1: list[tuple[str, int]] = [("A", 1), ("B", 2), ("C", 3)]
rank_list2: list[tuple[str, int]] = [("B", 1), ("C", 2), ("D", 3)]
rank_list3: list[tuple[str, int]] = [("A", 2), ("D", 1), ("E", 3)]

# Combine the rank lists
sorted_items: list[tuple[str, float]] = rrf([rank_list1, rank_list2, rank_list3])
sorted_items

[('A', 0.033465871107430434),
 ('B', 0.033465871107430434),
 ('D', 0.03320985472238179),
 ('C', 0.03294544435749548),
 ('E', 0.01775980832584606)]

In [56]:
def rank_list(search_result: list[list[models.ScoredPoint]]) -> list[tuple[int, int]]:
    """
    Rank the search results by assigning a rank to each point.

    Args:
        search_result (list[list[models.ScoredPoint]]): A list of ScoredPoint objects representing
        search results.

    Returns:
        list[tuple[int, int]]: A list of tuples containing the point ID and its rank
        (starting from 1).
    """
    return [(point.id, rank + 1) for rank, point in enumerate(search_result)]


def find_point_by_id(
    client: QdrantClient, collection_name: str, rrf_rank_list: list[tuple[int, float]]
) -> list[models.Record]:
    """
    Retrieve points from the Qdrant collection based on their IDs.

    Args:
        client (QdrantClient): The Qdrant client instance.
        collection_name (str): The name of the collection to retrieve points from.
        rrf_rank_list (list[tuple[int, float]]): A list of tuples containing point IDs and
        their RRF scores.

    Returns:
        list[models.Record]: A list of Record objects representing the retrieved points.
    """
    return client.retrieve(
        collection_name=collection_name, ids=[item[0] for item in rrf_rank_list]
    )

In [58]:
docs

[{'id': 'doc1',
  'title': 'Worst',
  'tag': 'none',
  'description': 'The interesting thing is that the person in the wrong made the right decision in the end.'},
 {'id': 'doc2',
  'title': 'Best',
  'tag': 'educational',
  'description': 'My favorite book is the cat in the hat, which is about a crazy cat who breaks into a house and creates a crazy afternoon for two kids.'},
 {'id': 'doc3',
  'title': 'Okay',
  'tag': 'educational',
  'description': 'My neighbors let the stray cat stay in their garage, which resulted in my favorite hat that I let them borrow being ruined.'},
 {'id': 'doc4',
  'title': 'Best ^2',
  'tag': 'educational',
  'description': "A small, mischievous cat wears an oversized, striped hat. The contrast between the cat's size and its large hat creates a comical image. This clever, hat-wearing cat becomes locally famous for its playful tricks and distinctive appearance."}]

In [57]:
dense_rank_list, sparse_rank_list = rank_list(search_results[0]), rank_list(
    search_results[1]
)
# [(id, rank), ...]
console.print(f"{dense_rank_list = }")
console.print(f"{sparse_rank_list = }")

In [59]:
rrf_rank_list = rrf([dense_rank_list, sparse_rank_list])
rrf_rank_list

[(2, 0.032266458495966696),
 (3, 0.032266458495966696),
 (1, 0.03225806451612903),
 (0, 0.016568396226415094)]

In [60]:
result: list[Any] = find_point_by_id(qdrant_client, collection_name, rrf_rank_list)
console.print(result)