In [1]:
import os

import pandas as pd
from fastembed.late_interaction import LateInteractionTextEmbedding
from fastembed.sparse.bm25 import Bm25
from qdrant_client import QdrantClient
from qdrant_client import models
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Location of the articles CSV. The original local path stays the default so
# existing runs are unchanged; set the ARTICLES_CSV environment variable to
# point the notebook at the file on another machine.
path = os.environ.get(
    "ARTICLES_CSV",
    "/home/anindya/workspace/opensource/company-ai/data/articles.csv",
)

df = pd.read_csv(path)
# Missing values become empty strings; otherwise they would be rendered as the
# literal text "nan" when the columns are joined into the text to embed.
df = df.fillna("")

In [61]:
# One record per article row: the article id, a small metadata dict, and the
# concatenated text that is embedded later on.
dataset = []

for _, row in tqdm(df.iterrows(), total=len(df)):
    meta = {
        "color": row["colour_group_name"],
        "type": row["index_name"],
        "section_name": row["section_name"],
    }
    # Text to embed: index name, section, colour and the free-text description.
    product_details = (
        f"{row['index_name']} {row['section_name']} {row['colour_group_name']} {row['detail_desc']}"
    )
    dataset.append(
        {"id": row["article_id"], "meta": meta, "product_details": product_details}
    )

100%|██████████| 105542/105542 [00:03<00:00, 26530.55it/s]


In [5]:
# Instantiate the three encoders used in this notebook: a dense sentence
# encoder, a late-interaction (multi-vector) encoder and a sparse BM25 encoder.

# NOTE(review): trust_remote_code=True lets the checkpoint repository supply
# Python code that is executed locally - confirm the source is trusted.
embedding_model = SentenceTransformer("dunzhang/stella_en_400M_v5", trust_remote_code=True)

late_interaction_embedding_model = LateInteractionTextEmbedding("colbert-ir/colbertv2.0")

bm25_embedding_model = Bm25("Qdrant/bm25")

  @torch.library.impl_abstract("xformers_flash::flash_fwd")
  @torch.library.impl_abstract("xformers_flash::flash_bwd")
Some weights of the model checkpoint at dunzhang/stella_en_400M_v5 were not used when initializing NewModel: ['new.pooler.dense.bias', 'new.pooler.dense.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 101803.50it/s]
Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 26379.27it/s]


In [None]:
# Per-encoder settings: where each model is downloaded from, its vector size
# (None for the sparse encoder) and a short alias.
# NOTE(review): the "late_interaction" alias differs from the vector name
# "colbertv2.0" used when the collection is created - confirm which is intended.
_embedding_models = {
    "dense_embedding": {
        "download_uri": "dunzhang/stella_en_400M_v5",
        "size": 1024,
        "alias": "stella_en_400M_v5",
    },
    "sparse_embedding": {
        "download_uri": "Qdrant/bm25",
        "size": None,
        "alias": "bm25",
    },
    "late_interaction": {
        "download_uri": "colbert-ir/colbertv2.0",
        "size": 128,
        "alias": "late_interaction",
    },
}

# NOTE(review): "handm_v1" does not match the "handm_articles" collection that
# the rest of the notebook creates and queries - confirm before relying on it.
config = {
    "collection_name": "handm_v1",
    "collection_uri": "http://localhost:6333",
    "embedding_models": _embedding_models,
    # Legacy misspelled key, kept as an alias of "embedding_models" so that any
    # existing reader of the old spelling keeps working.
    "emebedding_models": _embedding_models,
}

In [62]:
client = QdrantClient("http://localhost:6333")

collection_name = "handm_articles"

# Creating a collection that already exists fails, which made this cell break
# on every re-run of the notebook. Only create it when it is missing.
if not client.collection_exists(collection_name):
    client.create_collection(
        collection_name,
        vectors_config={
            # Dense vector: one 1024-dimensional embedding per article.
            "stella_en_400M_v5": models.VectorParams(
                size=1024,
                distance=models.Distance.COSINE,
            ),
            # Multi-vector: a set of 128-dimensional vectors per article,
            # compared with the MAX_SIM comparator.
            "colbertv2.0": models.VectorParams(
                size=128,
                distance=models.Distance.COSINE,
                multivector_config=models.MultiVectorConfig(
                    comparator=models.MultiVectorComparator.MAX_SIM,
                ),
            ),
        },
        sparse_vectors_config={
            # Sparse BM25 vector with the IDF modifier enabled.
            "bm25": models.SparseVectorParams(
                modifier=models.Modifier.IDF,
            ),
        },
    )

True

In [63]:
batch_size = 8


def make_mini_batches(lst, batch_size):
    """Split ``lst`` into consecutive chunks of at most ``batch_size`` items.

    Args:
        lst: Any sliceable sequence with a length (list, tuple, str, ...).
        batch_size: Maximum number of items per chunk; must be positive.

    Returns:
        A list of slices of ``lst`` in their original order. Every chunk holds
        ``batch_size`` items except possibly the last one, which holds the
        remainder. An empty ``lst`` yields an empty list.

    Raises:
        ValueError: If ``batch_size`` is zero or negative. Without this check a
            negative size silently produced no batches at all, and zero failed
            with an unrelated-looking ``range()`` error.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
    return [lst[start:start + batch_size] for start in range(0, len(lst), batch_size)]

In [64]:
collection_name = "handm_articles"
document_batches = make_mini_batches(dataset, batch_size)

# Embed every mini-batch with all three encoders and store the points.
# Point ids are the article ids, so the upload is an idempotent overwrite.
for batch in tqdm(document_batches, total=len(document_batches)):
    point_ids = [int(content["id"]) for content in batch]

    # Resume support: a long ingestion run can be cut short (for example by a
    # dropped server connection). Skip batches whose points are all stored
    # already, so a re-run only embeds what is still missing.
    # Trade-off: articles already in the collection are not refreshed; drop
    # the collection first if the source data changed.
    stored = client.retrieve(
        collection_name,
        ids=point_ids,
        with_payload=False,
        with_vectors=False,
    )
    if {record.id for record in stored} >= set(point_ids):
        continue

    doc_to_embed = [content["product_details"] for content in batch]

    dense_embeddings = embedding_model.encode(doc_to_embed).tolist()
    bm25_embeddings = list(bm25_embedding_model.passage_embed(doc_to_embed))
    late_interaction_embeddings = list(
        late_interaction_embedding_model.passage_embed(doc_to_embed)
    )

    client.upload_points(
        collection_name,
        points=[
            models.PointStruct(
                id=point_id,
                vector={
                    "stella_en_400M_v5": dense,
                    "colbertv2.0": late.tolist(),
                    "bm25": sparse.as_object(),
                },
                payload={
                    "id": point_id,
                    "product_details": content["product_details"],
                    "meta": content["meta"],
                },
            )
            # Each encoder returns one embedding per input document, in order,
            # so the sequences line up element by element.
            for point_id, content, dense, sparse, late in zip(
                point_ids,
                batch,
                dense_embeddings,
                bm25_embeddings,
                late_interaction_embeddings,
            )
        ],
        batch_size=batch_size,
    )

 71%|███████   | 9371/13193 [1:53:03<46:06,  1.38it/s]  


ResponseHandlingException: Server disconnected without sending a response.

In [57]:
run_dict = {}
query = "womens crop top"

# Encode the single query with each of the three encoders. Every encoder is
# given a one-element batch, and the first (only) result is kept.
dense_batch = embedding_model.encode([query], prompt_name="s2p_query")
query_embedding = dense_batch[0].tolist()

sparse_batch = list(bm25_embedding_model.query_embed([query]))
sparse_vectors = sparse_batch[0]

late_batch = list(late_interaction_embedding_model.query_embed([query]))
late_vectors = late_batch[0]

In [58]:
# One prefetch stage per named vector, each returning up to 20 candidates.
# Order: dense, sparse, late-interaction.
prefetch = [
    models.Prefetch(query=vector, using=vector_name, limit=20)
    for vector_name, vector in (
        ("stella_en_400M_v5", query_embedding),
        ("bm25", models.SparseVector(**sparse_vectors.as_object())),
        ("colbertv2.0", late_vectors),
    )
]

In [59]:
# Merge the candidates of all prefetch stages with the RRF fusion and keep the
# ten best points, including their payloads.
rrf_fusion = models.FusionQuery(fusion=models.Fusion.RRF)

results = client.query_points(
    collection_name,
    prefetch=prefetch,
    query=rrf_fusion,
    with_payload=True,
    limit=10,
)

In [60]:
# Show id, fused score, embedded text and metadata for every returned point,
# followed by a blank separator line.
for point in results.points:
    print(
        f"ID: {point.id}, Score: {point.score}",
        f"Product Details: {point.payload['product_details']}",
        f"Meta: {point.payload['meta']}",
        "",
        sep="\n",
    )

ID: 108775051, Score: 0.8
Product Details: Ladieswear Womens Everyday Basics Off White Jersey top with narrow shoulder straps.
Meta: {'color': 'Off White', 'section_name': 'Womens Everyday Basics', 'type': 'Ladieswear'}

ID: 116379047, Score: 0.7909091
Product Details: Ladieswear Womens Everyday Basics Dark Blue Fitted top in soft stretch jersey with a wide neckline and long sleeves.
Meta: {'color': 'Dark Blue', 'section_name': 'Womens Everyday Basics', 'type': 'Ladieswear'}

ID: 108775044, Score: 0.7083334
Product Details: Ladieswear Womens Everyday Basics White Jersey top with narrow shoulder straps.
Meta: {'color': 'White', 'section_name': 'Womens Everyday Basics', 'type': 'Ladieswear'}

ID: 108775015, Score: 0.6944444
Product Details: Ladieswear Womens Everyday Basics Black Jersey top with narrow shoulder straps.
Meta: {'color': 'Black', 'section_name': 'Womens Everyday Basics', 'type': 'Ladieswear'}

ID: 145872051, Score: 0.5
Product Details: Sport Men H&M Sport Black Long-sleeved