In [2]:
import pandas as pd
import numpy as np

In [3]:
from elasticsearch import Elasticsearch

In [12]:
es = Elasticsearch(
    hosts=["https://127.0.0.1:9200"],
    basic_auth=("elastic", "4151491"), 
    ca_certs="C:\\Users\\LENOVO\\Downloads\\elasticsearch-8.11.3\\config\\certs\\http_ca.crt"
)

es.ping()

True

### prepare the data

In [5]:
df=pd.read_csv("myntra_products_catalog.csv").loc[:499]
df.head()

Unnamed: 0,ProductID,ProductName,ProductBrand,Gender,Price (INR),NumImages,Description,PrimaryColor
0,10017413,DKNY Unisex Black & Grey Printed Medium Trolle...,DKNY,Unisex,11745,7,"Black and grey printed medium trolley bag, sec...",Black
1,10016283,EthnoVogue Women Beige & Grey Made to Measure ...,EthnoVogue,Women,5810,7,Beige & Grey made to measure kurta with churid...,Beige
2,10009781,SPYKAR Women Pink Alexa Super Skinny Fit High-...,SPYKAR,Women,899,7,Pink coloured wash 5-pocket high-rise cropped ...,Pink
3,10015921,Raymond Men Blue Self-Design Single-Breasted B...,Raymond,Men,5599,5,Blue self-design bandhgala suitBlue self-desig...,Blue
4,10017833,Parx Men Brown & Off-White Slim Fit Printed Ca...,Parx,Men,759,5,"Brown and off-white printed casual shirt, has ...",White


In [6]:
df.isna().value_counts()

ProductID  ProductName  ProductBrand  Gender  Price (INR)  NumImages  Description  PrimaryColor
False      False        False         False   False        False      False        False           468
                                                                                   True             32
Name: count, dtype: int64

In [7]:
df.fillna("None",inplace=True)

## convert the relevant field to vector using SBERT model

In [13]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-mpnet-base-v2")

.gitattributes: 100%|██████████| 1.18k/1.18k [00:00<00:00, 1.18MB/s]
1_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 191kB/s]
README.md: 100%|██████████| 10.6k/10.6k [00:00<00:00, 10.5MB/s]
config.json: 100%|██████████| 571/571 [00:00<?, ?B/s] 
config_sentence_transformers.json: 100%|██████████| 116/116 [00:00<00:00, 58.0kB/s]
data_config.json: 100%|██████████| 39.3k/39.3k [00:00<00:00, 423kB/s]
pytorch_model.bin: 100%|██████████| 438M/438M [05:18<00:00, 1.37MB/s] 
sentence_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<?, ?B/s]
special_tokens_map.json: 100%|██████████| 239/239 [00:00<00:00, 240kB/s]
tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 1.16MB/s]
tokenizer_config.json: 100%|██████████| 363/363 [00:00<00:00, 371kB/s]
train_script.py: 100%|██████████| 13.1k/13.1k [00:00<00:00, 13.1MB/s]
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 764kB/s]
modules.json: 100%|██████████| 349/349 [00:00<?, ?B/s] 


In [15]:
df["DescriptionVector"] = df["Description"].apply(lambda x: model.encode(x))

In [17]:
df.head()

Unnamed: 0,ProductID,ProductName,ProductBrand,Gender,Price (INR),NumImages,Description,PrimaryColor,DescriptionVector
0,10017413,DKNY Unisex Black & Grey Printed Medium Trolle...,DKNY,Unisex,11745,7,"Black and grey printed medium trolley bag, sec...",Black,"[0.027645884, -0.0026341688, -0.0035883961, 0...."
1,10016283,EthnoVogue Women Beige & Grey Made to Measure ...,EthnoVogue,Women,5810,7,Beige & Grey made to measure kurta with churid...,Beige,"[-0.024660701, -0.028755333, -0.0203325, 0.034..."
2,10009781,SPYKAR Women Pink Alexa Super Skinny Fit High-...,SPYKAR,Women,899,7,Pink coloured wash 5-pocket high-rise cropped ...,Pink,"[-0.046943244, 0.08182793, 0.048335165, -0.000..."
3,10015921,Raymond Men Blue Self-Design Single-Breasted B...,Raymond,Men,5599,5,Blue self-design bandhgala suitBlue self-desig...,Blue,"[-0.015098748, -0.010285424, 0.009487302, -0.0..."
4,10017833,Parx Men Brown & Off-White Slim Fit Printed Ca...,Parx,Men,759,5,"Brown and off-white printed casual shirt, has ...",White,"[-0.0177466, 0.006209646, 0.021813976, 0.02679..."


In [18]:
es.ping()

True

## create new index in elastic search

In [23]:
from indexMapping import indexMapping
es.indices.create(index="all_products", mappings=indexMapping)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'all_products'})

## ingest the data into index

In [35]:
record_list=df.to_dict("records")

In [36]:
for record in record_list:
    try:
        es.index(index="all_products",document=record,id=record["ProductID"])
    except Exception as e:
        print(e)    

In [37]:
es.count(index="all_products")

ObjectApiResponse({'count': 500, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}})

## Search the data

In [38]:
input_keyword = "Blue Shoes"
vector_of_input_keyword = model.encode(input_keyword)

query = {
    "field": "DescriptionVector",
    "query_vector": vector_of_input_keyword,
    "k": 1,
    "num_candidates": 500,
}
res = es.knn_search(
    index="all_products", knn=query, source=["ProductName", "Description"]
)
res["hits"]["hits"]

  res=es.knn_search(index="all_products",knn=query,source=["ProductName","Description"])


[{'_index': 'all_products',
  '_id': '10018013',
  '_score': 0.61429423,
  '_source': {'ProductName': 'Puma Men Blue Sneakers',
   'Description': 'A pair of round-toe blue sneakers, has regular styling, lace-up detailTextile upperCushioned footbedTextured and patterned outsoleWarranty: 3 monthsWarranty provided by brand/manufacturer'}}]