In [1]:
import os 
import pandas as pd
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams
from qdrant_client.models import PointStruct

In [2]:
# Notebook-wide configuration.
# Name of the Qdrant collection this notebook creates and writes to.
COLLECTION_NAME = "tmp-collection"
# Base URL of the Qdrant server the client connects to.
QDRANT_SERVER_URL = "http://localhost:6333"

In [3]:
# Location of the articles CSV. The default is the original author's local
# path; set the ARTICLES_CSV_PATH environment variable to run the notebook on
# another machine without editing this cell.
path = os.environ.get(
    "ARTICLES_CSV_PATH",
    "/home/anindya/workspace/opensource/company-ai/data/articles.csv",
)

# Load the whole file into memory and preview the first rows.
df = pd.read_csv(path)
df.head()

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,Black,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,10,White,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."


In [4]:
# List every column of the loaded frame (the df.head() preview elides the middle ones).
df.columns

Index(['article_id', 'product_code', 'prod_name', 'product_type_no',
       'product_type_name', 'product_group_name', 'graphical_appearance_no',
       'graphical_appearance_name', 'colour_group_code', 'colour_group_name',
       'perceived_colour_value_id', 'perceived_colour_value_name',
       'perceived_colour_master_id', 'perceived_colour_master_name',
       'department_no', 'department_name', 'index_code', 'index_name',
       'index_group_no', 'index_group_name', 'section_no', 'section_name',
       'garment_group_no', 'garment_group_name', 'detail_desc'],
      dtype='object')

In [5]:
# Show the first row as a Series so each field of one article is visible.
df.iloc[0]

article_id                                                    108775015
product_code                                                     108775
prod_name                                                     Strap top
product_type_no                                                     253
product_type_name                                              Vest top
product_group_name                                   Garment Upper body
graphical_appearance_no                                         1010016
graphical_appearance_name                                         Solid
colour_group_code                                                     9
colour_group_name                                                 Black
perceived_colour_value_id                                             4
perceived_colour_value_name                                        Dark
perceived_colour_master_id                                            5
perceived_colour_master_name                                    

In [18]:
from tqdm.auto import tqdm
from datasets import Dataset

# Text columns used for the payload metadata and for the description string.
text_columns = ["index_name", "section_name", "colour_group_name", "detail_desc"]

# Replace missing text with "" on a copy of the needed columns. Without this a
# missing value (NaN) would be interpolated into the description as the literal
# word "nan" and end up in the embedded text. `df` itself is left untouched.
articles = df[["article_id", *text_columns]].fillna({col: "" for col in text_columns})

# One record per article: its id, a small metadata dict, and the free-text
# description that is later embedded.
records = []
for row in tqdm(articles.itertuples(index=False), total=len(articles)):
    parts = (row.index_name, row.section_name, row.colour_group_name, row.detail_desc)
    records.append(
        {
            "id": row.article_id,
            "meta": {
                "color": row.colour_group_name,
                "type": row.index_name,
                "section_name": row.section_name,
            },
            # Same "<index> <section> <colour> <detail>" layout as before; empty
            # parts are skipped so a missing field leaves no stray space.
            "desc": " ".join(str(part) for part in parts if part != ""),
        }
    )

# NOTE(review): Dataset.from_dict maps column names to values, so this builds a
# dataset with a single column called "train" holding the record dicts. It is
# not a "train" split. Kept as-is because later cells may rely on
# dataset["train"]; Dataset.from_list(records) would give id/meta/desc columns.
dataset = Dataset.from_dict({"train": records})

  0%|          | 0/105542 [00:00<?, ?it/s]

100%|██████████| 105542/105542 [00:04<00:00, 25448.20it/s]


In [8]:
# Dense embedding model used for the "stella_en_400M_v5" vectors.

from sentence_transformers import SentenceTransformer

# trust_remote_code=True allows the custom model code shipped in the model
# repository to be executed while loading.
embedding_model = SentenceTransformer(
    model_name_or_path="dunzhang/stella_en_400M_v5",
    trust_remote_code=True,
)

  @torch.library.impl_abstract("xformers_flash::flash_fwd")
  @torch.library.impl_abstract("xformers_flash::flash_bwd")
Some weights of the model checkpoint at dunzhang/stella_en_400M_v5 were not used when initializing NewModel: ['new.pooler.dense.bias', 'new.pooler.dense.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
### Usage

# Queries are encoded with the "s2p_query" prompt; documents are encoded as-is.
query_prompt_name = "s2p_query"
queries = [
    "What are some ways to reduce stress?",
    "What are the benefits of drinking green tea?",
]
# docs do not need any prompts
docs = [
    "There are many effective ways to reduce stress. Some common techniques include deep breathing, meditation, and physical activity. Engaging in hobbies, spending time in nature, and connecting with loved ones can also help alleviate stress. Additionally, setting boundaries, practicing self-care, and learning to say no can prevent stress from building up.",
    "Green tea has been consumed for centuries and is known for its potential health benefits. It contains antioxidants that may help protect the body against damage caused by free radicals. Regular consumption of green tea has been associated with improved heart health, enhanced cognitive function, and a reduced risk of certain types of cancer. The polyphenols in green tea may also have anti-inflammatory and weight loss properties.",
]

# The default output dimension is 1024. For another dimension, clone the model
# and edit `modules.json`, replacing `2_Dense_1024` with e.g. `2_Dense_256` or
# `2_Dense_8192`.
# Reuse the model loaded earlier as `embedding_model` instead of loading the
# same checkpoint a second time. SentenceTransformer already picks a GPU when
# one is available, so the explicit .cuda() call (which fails on CPU-only
# machines) is not needed.
model = embedding_model
query_embeddings = model.encode(queries, prompt_name=query_prompt_name)
doc_embeddings = model.encode(docs)
print(query_embeddings.shape, doc_embeddings.shape)
# (2, 1024) (2, 1024)

# Query-by-document similarity matrix; shown as the cell output.
similarities = model.similarity(query_embeddings, doc_embeddings)
similarities

In [10]:
from fastembed.sparse.bm25 import Bm25

# Sparse embedding model used for the "bm25" vectors.
bm25_embedding_model = Bm25(model_name="Qdrant/bm25")

Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00,  1.98it/s]


In [11]:
from fastembed.late_interaction import LateInteractionTextEmbedding

# Late-interaction (multi-vector) model used for the "colbertv2.0" vectors.
late_interaction_embedding_model = LateInteractionTextEmbedding(
    model_name="colbert-ir/colbertv2.0",
)

Fetching 5 files: 100%|██████████| 5/5 [02:10<00:00, 26.01s/it]


In [13]:
from qdrant_client import QdrantClient, models

# Vector sizes must match what each embedding model emits, otherwise Qdrant
# rejects the points on upsert.
STELLA_DIM = 1024        # default output dimension of stella_en_400M_v5
COLBERT_TOKEN_DIM = 128  # size of each per-token vector from colbertv2.0 (was 431)

client = QdrantClient(QDRANT_SERVER_URL)

# Only create the collection when it is missing, so the cell can be re-run
# without failing on an already-existing collection.
# NOTE: a collection created earlier with the wrong ColBERT size is NOT fixed
# by this guard; delete it first with client.delete_collection(COLLECTION_NAME).
if not client.collection_exists(COLLECTION_NAME):
    client.create_collection(
        COLLECTION_NAME,
        vectors_config={
            # Dense vector: one embedding per document.
            "stella_en_400M_v5": models.VectorParams(
                size=STELLA_DIM,
                distance=models.Distance.COSINE,
            ),
            # Multi-vector: one embedding per token, compared with MaxSim.
            "colbertv2.0": models.VectorParams(
                size=COLBERT_TOKEN_DIM,
                distance=models.Distance.COSINE,
                multivector_config=models.MultiVectorConfig(
                    comparator=models.MultiVectorComparator.MAX_SIM,
                ),
            ),
        },
        sparse_vectors_config={
            # Sparse BM25 vector with the IDF modifier enabled.
            "bm25": models.SparseVectorParams(
                modifier=models.Modifier.IDF,
            ),
        },
    )

# Confirm the collection is present (displays True).
client.collection_exists(COLLECTION_NAME)

True