In [1]:
import pandas as pd
from tqdm import tqdm
from fastembed.sparse.bm25 import Bm25
from qdrant_client import QdrantClient
from sentence_transformers import SentenceTransformer
from fastembed.late_interaction import LateInteractionTextEmbedding

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
config = {
    "server_url": "http://localhost:6333",
    "collection_name": "tmp_collection_v3",
    "embeddig_models": {
        "dense": ("dunzhang/stella_en_400M_v5", 1024),
        "interaction": ("colbert-ir/colbertv2.0", 431),
        "bm25": ("Qdrant/bm25", None),
    }
}

In [3]:
path = "/home/anindya/workspace/opensource/company-ai/data/articles.csv"

df = pd.read_csv(path)

In [4]:
# replace all the nan values with empty string
df = df.fillna("")

In [5]:
dataset = []

for _, row in tqdm(df.iterrows(), total=len(df)):
    content = {
        "id": row["article_id"],
        "meta": {
            "color": row["colour_group_name"],
            "type": row["index_name"],
            "section_name": row["section_name"]
        }   
    } 
    product_details = (
        f"{row['index_name']} {row['section_name']} {row['colour_group_name']} {row['detail_desc']}"
    )
    content["product_details"] = product_details
    dataset.append(content)

100%|██████████| 105542/105542 [00:04<00:00, 22334.38it/s]


In [7]:
from qdrant_client import models 

client = QdrantClient(config["server_url"])

dense_model = config["embeddig_models"]["dense"]
interaction_model = config["embeddig_models"]["interaction"]   
bm25 = config["embeddig_models"]["bm25"]

client.create_collection(
    config["collection_name"],
    vectors_config={
        "stella_en_400M_v5": models.VectorParams(
            size=1024,
            distance=models.Distance.COSINE
        ),
        "colbertv2.0": models.VectorParams(
            size=431,
            distance=models.Distance.COSINE,
            multivector_config=models.MultiVectorConfig(
                comparator=models.MultiVectorComparator.MAX_SIM,
            )
        ),
    },
    sparse_vectors_config={
        "bm25": models.SparseVectorParams(
            modifier=models.Modifier.IDF,
        )
    }
)

True

In [6]:
embedding_model = SentenceTransformer(
    "dunzhang/stella_en_400M_v5", 
    trust_remote_code=True
)

late_interaction_embedding_model = LateInteractionTextEmbedding(
    "colbert-ir/colbertv2.0"
)

bm25_embedding_model = Bm25("Qdrant/bm25")

  @torch.library.impl_abstract("xformers_flash::flash_fwd")
  @torch.library.impl_abstract("xformers_flash::flash_bwd")
Some weights of the model checkpoint at dunzhang/stella_en_400M_v5 were not used when initializing NewModel: ['new.pooler.dense.bias', 'new.pooler.dense.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 17742.40it/s]
Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 11125.47it/s]


In [8]:
batch_size =  4

def make_mini_batches(lst, batch_size):
    return [lst[i:i + batch_size] for i in range(0, len(lst), batch_size)]

In [19]:
document_batches = make_mini_batches(dataset, batch_size)  

for batch in tqdm(document_batches, total=len(document_batches)):
    doc_to_embed = [content["product_details"] for content in batch]
    
    dense_embeddings = embedding_model.encode(
        doc_to_embed,
        prompt_name="s2p_query"
    ).tolist()

    bm25_embeddings = list(bm25_embedding_model.passage_embed(
        doc_to_embed
    ))

    late_interaction_embeddings = list(late_interaction_embedding_model.passage_embed(
        doc_to_embed
    ))

    client.upload_points(
        config["collection_name"],
        points=[
            models.PointStruct(
                id=int(batch[i]["id"]),
                vector={
                    "stella_en_400M_v5": dense_embeddings[i],
                    "colbertv2.0": late_interaction_embeddings[i],
                    "bm25": bm25_embeddings[i].as_object()
                },
                payload={
                    "id": int(batch[i]["id"]),
                    "product_details": batch[i]["product_details"],
                    "meta": batch[i]["meta"]
                }
            )
            for i in range(len(batch)) 
        ],
        batch_size=batch_size
    )

100%|██████████| 26386/26386 [2:00:08<00:00,  3.66it/s]  


#### Running a query

In [20]:
run_dict = {}
query = "dotted bra with red color"

query_embedding = embedding_model.encode([query], prompt_name="s2p_query")[0].tolist()

In [27]:
sparse_vectors = list(bm25_embedding_model.query_embed([query]))[0]
late_vectors = list(late_interaction_embedding_model.query_embed([query]))[0]

In [23]:
config

{'server_url': 'http://localhost:6333',
 'collection_name': 'tmp_collection_v3',
 'embeddig_models': {'dense': ('dunzhang/stella_en_400M_v5', 1024),
  'interaction': ('colbert-ir/colbertv2.0', 431),
  'bm25': ('Qdrant/bm25', None)}}

In [31]:
prefetch = [
    models.Prefetch(
        query=query_embedding,
        using="stella_en_400M_v5",
        limit=20
    ),
    models.Prefetch(
        query=models.SparseVector(**sparse_vectors.as_object()),
        using="bm25",
        limit=20
    ),
    models.Prefetch(
        query=late_vectors,
        using="colbertv2.0",
        limit=20
    )
]

In [32]:
results = client.query_points(
    config["collection_name"],
    prefetch=prefetch,
    query=models.FusionQuery(
        fusion=models.Fusion.RRF
    ),
    with_payload=True,
    limit=10
)

UnexpectedResponse: Unexpected Response: 400 (Bad Request)
Raw response content:
b'{"status":{"error":"Wrong input: Vector dimension error: expected dim: 431, got 128"},"time":0.012817853}'