In [None]:
!pip install -U transformers datasets==3.2.0 openai lancedb lance FlagEmbedding "tantivy>=0.20.1" -qq

# NOTE: If there is an import error, restart and run the notebook again

In [None]:
import os
import lancedb
import re
import pandas as pd
import random

from datasets import load_dataset

import torch
import gc

import lance


import os

import lancedb
import openai
from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector


# Never hard-code credentials in a notebook. Reuse the key that is already in
# the environment (the old assignment overwrote it with a placeholder) and only
# prompt for it, without echoing, when it is missing.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass

    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")


# OpenAI embedding function taken from LanceDB's embedding registry; it is used
# below to declare the vector/source columns of the table schema.
embeddings = get_registry().get("openai").create()

# Load `Chunks` of data from [BeIR Dataset](https://huggingface.co/datasets/BeIR/scidocs)

Note: This dataset was built specifically for retrieval tasks, so it is a good way to check how well your search is working.

In [None]:
import os
from datasets import config


# Queries and corpus of the BeIR/scidocs retrieval benchmark as DataFrames.
queries = load_dataset("BeIR/scidocs", "queries")["queries"].to_pandas()
full_docs = (
    load_dataset("BeIR/scidocs", "corpus")["corpus"]
    .to_pandas()
    .dropna(subset=["text"])  # rows without text cannot be embedded or indexed
)

# Keep only the first 64 documents (not a random sample) so the embedding demo
# stays fast. `.assign` returns a new frame, so the derived column is never
# written into a slice of `full_docs` (avoids pandas' SettingWithCopyWarning).
docs = full_docs.head(64).assign(
    # Word count: extra metadata used later to filter the hybrid-search results.
    num_words=lambda frame: frame["text"].str.split().str.len()
)
docs.sample(5)

# Build New Table

In [None]:
# Start from an empty LanceDB directory so the table can be rebuilt from scratch.
# shutil.rmtree is the portable equivalent of `rm -rf`; ignore_errors=True makes
# a missing directory a no-op, just like the `-f` flag.
import shutil

shutil.rmtree("./db", ignore_errors=True)

In [None]:
class Documents(LanceModel):
    """Row schema for the `documents` table: text, its embedding and metadata."""
    # Embedding column; its width is whatever the configured embedding function reports.
    vector: Vector(embeddings.ndims()) = embeddings.VectorField()
    # Declared as the embedding function's source field -- presumably the column
    # that is vectorized automatically on ingest (confirm in the LanceDB docs).
    text: str = embeddings.SourceField()
    # Plain metadata columns copied from the `docs` DataFrame.
    title: str
    num_words: int


# One dict per document, restricted to the non-vector fields of `Documents`.
# `to_dict("records")` replaces the slower row-by-row `apply`.
data = docs[["title", "text", "num_words"]].to_dict("records")

db = lancedb.connect("./db")
# mode="overwrite" keeps this cell re-runnable: a plain create raises when the
# "documents" table is still present from a previous run.
table = db.create_table("documents", schema=Documents, mode="overwrite")

table.add(data)  # ingest docs with auto-vectorization
table.create_fts_index("text")  # Create a fts index before the hybrid search

In [None]:
# Keyword-only (full-text) lookup: show the 5 best matches.
fts_query_text = "To confuse the AI and DNN embedding, let's put random terms from other sentences- automation training test memory?"
fts_hits = table.search(fts_query_text, query_type="fts").limit(5)
fts_hits.to_pandas()

In [None]:
# Embedding-only (vector) lookup for the same query: show the 10 best matches.
vector_query_text = "To confuse the AI and DNN embedding, let's put random terms from other sentences- automation training test memory?"
vector_hits = table.search(vector_query_text, query_type="vector").limit(10)
vector_hits.to_pandas()

## Perform inbuilt Hybrid Search
LanceDB ships with some off-the-shelf rerankers and also lets you implement your own re-ranking and filtering logic; see [Implement Custom Rerankers](https://lancedb.github.io/lancedb/hybrid_search/#building-custom-rerankers).

In [None]:
from lancedb.rerankers import LinearCombinationReranker

# `weight` is the share given to the semantic (vector) score:
# 0 means pure text search (BM-25), 1 means pure semantic (vector) search.
reranker = LinearCombinationReranker(weight=0.7)

# Hybrid lookup: vector and full-text hits are merged by the reranker above.
hybrid_query_text = "To confuse the AI and DNN embedding, let's put random terms from other sentences- automation training test memory?"
hybrid_hits = table.search(hybrid_query_text, query_type="hybrid").rerank(
    reranker=reranker
)
hybrid_hits.limit(5).to_pandas()

## Build custom Filtering Function

Using `pandas.query`-style filter expressions inside a custom reranker, we will do the following 2 things:

1. Remove all the rows which contain a specific term, in our case, `"dual-band"`
2. Keep only the rows which have `num_words > 150`

In [None]:
from typing import List, Union
import pandas as pd
import pyarrow as pa


class MofidifiedLinearReranker(LinearCombinationReranker):
    """Linear-combination reranker that also filters the merged results.

    After the parent class has combined and scored the vector and full-text
    hits, a row is dropped when its ``text`` matches any entry of ``filters``
    or when its ``num_words`` is not greater than ``min_words``.

    Args:
        filters: A single term or a list of terms. Each one is passed to
            ``Series.str.contains``, which interprets it as a regular
            expression by default.
        min_words: Rows are kept only if ``num_words`` is strictly greater
            than this value. Defaults to 150, the previously hard-coded
            threshold.
        **kwargs: Forwarded unchanged to ``LinearCombinationReranker``
            (for example ``weight``).
    """

    def __init__(
        self,
        filters: Union[str, List[str]],
        *,
        min_words: int = 150,
        **kwargs,
    ):
        super().__init__(**kwargs)
        # Normalise to a list so the filtering code has a single code path.
        self.filters = filters if isinstance(filters, list) else [filters]
        self.min_words = min_words

    def rerank_hybrid(
        self, query: str, vector_results: pa.Table, fts_results: pa.Table
    ) -> pa.Table:
        """Rerank with the parent implementation, then apply the row filters.

        Args:
            query: The search query, passed through to the parent reranker.
            vector_results: Hits from the vector search.
            fts_results: Hits from the full-text search.

        Returns:
            The parent's combined table restricted to the rows that pass the
            word-count and term filters. The result must expose ``text`` and
            ``num_words`` columns.
        """
        combined_result = super().rerank_hybrid(query, vector_results, fts_results)
        df = combined_result.to_pandas()

        # `df.query` resolves `@name` against local variables, hence the copy.
        min_words = self.min_words
        # The word-count rule does not depend on the terms, so apply it once
        # (and also when no term filter was supplied).
        df = df.query("num_words > @min_words")
        # THIS is where you implement your filters. You can hard code or pass
        # dynamically too. `term` (not `filter`) avoids shadowing the builtin.
        for term in self.filters:
            df = df.query("not text.str.contains(@term)")

        # Filtering leaves a non-contiguous index; without preserve_index=False
        # pyarrow would export it as an extra `__index_level_0__` column.
        return pa.Table.from_pandas(df, preserve_index=False)


# Custom reranker that drops every hit whose text mentions "dual-band".
modified_reranker = MofidifiedLinearReranker(filters=["dual-band"])

# Same hybrid query as before, now post-filtered by the custom reranker.
filtered_query_text = "To confuse the AI and DNN embedding, let's put random terms from other sentences- automation training test memory?"
filtered_hits = (
    table.search(filtered_query_text, query_type="hybrid")
    .rerank(reranker=modified_reranker)
    .limit(7)
)
filtered_hits.to_pandas()