# Vector DB Creation

### Prepare Elastic Search Connection

In [22]:
from elasticsearch import Elasticsearch

es = Elasticsearch(
    "https://localhost:9200",
    basic_auth=('elastic', 'vJgb6JQEhOnKJGSqRzg-'),
    ca_certs="/home/fikri/Documents/Apps/elasticsearch-8.12.1/config/certs/http_ca.crt"
)

es.ping()

True

### Preparing the data

In [23]:
import pandas as pd

df = pd.read_csv('../data/raw/csv/myntra_products_catalog.csv').loc[:499]
df.head()

Unnamed: 0,ProductID,ProductName,ProductBrand,Gender,Price (INR),NumImages,Description,PrimaryColor
0,10017413,DKNY Unisex Black & Grey Printed Medium Trolle...,DKNY,Unisex,11745,7,"Black and grey printed medium trolley bag, sec...",Black
1,10016283,EthnoVogue Women Beige & Grey Made to Measure ...,EthnoVogue,Women,5810,7,Beige & Grey made to measure kurta with churid...,Beige
2,10009781,SPYKAR Women Pink Alexa Super Skinny Fit High-...,SPYKAR,Women,899,7,Pink coloured wash 5-pocket high-rise cropped ...,Pink
3,10015921,Raymond Men Blue Self-Design Single-Breasted B...,Raymond,Men,5599,5,Blue self-design bandhgala suitBlue self-desig...,Blue
4,10017833,Parx Men Brown & Off-White Slim Fit Printed Ca...,Parx,Men,759,5,"Brown and off-white printed casual shirt, has ...",White


In [25]:
df.isna().value_counts()

ProductID  ProductName  ProductBrand  Gender  Price (INR)  NumImages  Description  PrimaryColor
False      False        False         False   False        False      False        False           468
                                                                                   True             32
Name: count, dtype: int64

Because there are not filled column, we will fill the column by "None"

In [26]:
df.fillna("None", inplace=True)

### Word Embedding

We will convert text to vector and we will use BERT Embeddings.

In [34]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-mpnet-base-v2')

  from .autonotebook import tqdm as notebook_tqdm
modules.json: 100%|██████████| 349/349 [00:00<00:00, 2.29MB/s]
config_sentence_transformers.json: 100%|██████████| 116/116 [00:00<00:00, 932kB/s]
README.md: 100%|██████████| 10.6k/10.6k [00:00<00:00, 42.2MB/s]
sentence_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 416kB/s]
config.json: 100%|██████████| 571/571 [00:00<00:00, 2.75MB/s]
pytorch_model.bin: 100%|██████████| 438M/438M [01:59<00:00, 3.65MB/s] 
tokenizer_config.json: 100%|██████████| 363/363 [00:00<00:00, 2.92MB/s]
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 432kB/s]
tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 937kB/s]
special_tokens_map.json: 100%|██████████| 239/239 [00:00<00:00, 2.37MB/s]
1_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 1.50MB/s]


In [35]:
from tqdm import tqdm
tqdm.pandas()

In [36]:
df["DescriptionVector"] = df["Description"].progress_apply(lambda x: model.encode(x))

100%|██████████| 500/500 [00:21<00:00, 23.40it/s]


In [37]:
df.head()

Unnamed: 0,ProductID,ProductName,ProductBrand,Gender,Price (INR),NumImages,Description,PrimaryColor,DescriptionVector
0,10017413,DKNY Unisex Black & Grey Printed Medium Trolle...,DKNY,Unisex,11745,7,"Black and grey printed medium trolley bag, sec...",Black,"[0.027645899, -0.00263416, -0.003588411, 0.051..."
1,10016283,EthnoVogue Women Beige & Grey Made to Measure ...,EthnoVogue,Women,5810,7,Beige & Grey made to measure kurta with churid...,Beige,"[-0.024660692, -0.028755333, -0.02033251, 0.03..."
2,10009781,SPYKAR Women Pink Alexa Super Skinny Fit High-...,SPYKAR,Women,899,7,Pink coloured wash 5-pocket high-rise cropped ...,Pink,"[-0.046943255, 0.081827946, 0.048335165, -0.00..."
3,10015921,Raymond Men Blue Self-Design Single-Breasted B...,Raymond,Men,5599,5,Blue self-design bandhgala suitBlue self-desig...,Blue,"[-0.015098757, -0.010285391, 0.009487298, -0.0..."
4,10017833,Parx Men Brown & Off-White Slim Fit Printed Ca...,Parx,Men,759,5,"Brown and off-white printed casual shirt, has ...",White,"[-0.017746588, 0.0062096245, 0.02181395, 0.026..."


### Create new index in ElasticSearch

In [38]:
from indexMap import indexMap
es.indices.create(index='all_myntra_product', mappings=indexMap)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'all_myntra_product'})

### Insert all the data to the index

In [44]:
records = df.to_dict('records')

In [45]:
for record in records:
    try:
        es.index(index='all_myntra_product', document=record, id=record['ProductID'])
    except Exception as e:
        print(f"Error when inserting {record['ProductID']}:", e)

In [52]:
es.count()

ObjectApiResponse({'count': 503, '_shards': {'total': 14, 'successful': 14, 'skipped': 0, 'failed': 0}})

### Search Data

In [57]:
input_keyword = "Black Shoes"
vector_embedding = model.encode(input_keyword)

query = {
    "field": "DescriptionVector",
    "query_vector": vector_embedding,
    "k": 10,
    "num_candidates": 500,
}

res = es.knn_search(index='all_myntra_product', knn=query, source=['ProductName', 'Description'])

  res = es.knn_search(index='all_myntra_product', knn=query, source=['ProductName', 'Description'])


[{'_index': 'all_myntra_product',
  '_id': '10017157',
  '_score': 0.58104646,
  '_source': {'ProductName': 'Carrera Men Black Sneakers',
   'Description': 'A pair of round-toe black sneakers, has regular styling, lace-up detailSynthetic upperCushioned footbedTextured and patterned outsoleWarranty: 1 monthWarranty provided by brand/manufacturer'}},
 {'_index': 'all_myntra_product',
  '_id': '10007713',
  '_score': 0.5622912,
  '_source': {'ProductName': 'her by invictus Women Black Textured Cushioned Flats',
   'Description': 'A pair of black pointed toe cushioned flats, has regular styling, backstrap detailSynthetic upperCushioned footbedTextured and patterned outsole'}},
 {'_index': 'all_myntra_product',
  '_id': '10007611',
  '_score': 0.56096166,
  '_source': {'ProductName': 'ether Women Black Solid Slip-Ons',
   'Description': 'A pair of round-toe black solid slip-ons, has regular styling, slip-on detailSynthetic upperCushioned footbedTextured and patterned outsoleWarranty: 45 day

### Print the result

In [61]:
for item in res["hits"]["hits"]:
    print(f"Product Name: {item['_source']['ProductName']}")
    print(f"Description: {item['_source']['Description']}")
    print(f"Score: {item['_score']}")
    print("=" * 100) 

Product Name: Carrera Men Black Sneakers
Description: A pair of round-toe black sneakers, has regular styling, lace-up detailSynthetic upperCushioned footbedTextured and patterned outsoleWarranty: 1 monthWarranty provided by brand/manufacturer
Score: 0.58104646
Product Name: her by invictus Women Black Textured Cushioned Flats
Description: A pair of black pointed toe cushioned flats, has regular styling, backstrap detailSynthetic upperCushioned footbedTextured and patterned outsole
Score: 0.5622912
Product Name: ether Women Black Solid Slip-Ons
Description: A pair of round-toe black solid slip-ons, has regular styling, slip-on detailSynthetic upperCushioned footbedTextured and patterned outsoleWarranty: 45 daysWarranty provided by brand/manufacturer
Score: 0.56096166
Product Name: Shoe Couture Women White & Black Applique Detail Sneakers
Description: A pair of round-toe white & black sneakers, has regular styling, lace-up detailSynthetic upper with applique detailCushioned footbedTextu