In [1]:
import pandas as pd
from elasticsearch import Elasticsearch
from transformers import pipeline


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Connect to the local Elasticsearch node over plain HTTP.
# The host-dict key for the protocol is 'scheme' (not 'schema'): a misspelled
# key is either silently ignored or rejected, depending on the client version.
es = Elasticsearch([{'host': 'localhost', 'port': 9200, 'scheme': 'http'}])

# Load the token-classification (NER) model used to extract entities from the
# product text before indexing.
ner_pipeline = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english")

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
# Load the product feed, map the feed's id / custom-label headers onto the
# names used in the rest of the notebook, and keep only the listed columns.
renamed_headers = {
    "id": "sku",
    "custom label 1": "category",
    "custom label 2": "subcategory",
    "custom label 3": "maincategory",
}
kept_columns = [
    "title", "sku", "price", "age group", "brand", "size", "color",
    "category", "subcategory", "maincategory", "gender",
    "image link", "link", "description",
]
data = pd.read_csv("product_feeds.csv").rename(columns=renamed_headers)[kept_columns]
data.columns

Index(['title', 'sku', 'price', 'age group', 'brand', 'size', 'color',
       'category', 'subcategory', 'maincategory', 'gender', 'image link',
       'link', 'description'],
      dtype='object')

In [7]:
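# Disabled: meant to remove rows whose price is '0.01 INR'. The original call
# passed a boolean mask to drop(), which expects index labels, so it would not
# work as written. A filter does the same job:
# data = data[data['price'] != '0.01 INR']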

In [7]:
# Index name plus the list of document fields; every field is mapped with
# type "text", so the mapping is generated from the list instead of being
# spelled out field by field.
index_name = "search_index"
text_fields = [
    "product_name", "sku", "price", "age group", "brand", "size", "color",
    "category", "subcategory", "maincategory", "gender",
    "image link", "link", "description", "entities",
]
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {field: {"type": "text"} for field in text_fields},
    },
}

# Create the index. ignore=400 asks the client not to raise on an HTTP 400
# response (e.g. when the index already exists), so the cell can be re-run.
es.indices.create(index=index_name, body=index_settings, ignore=400)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'search_index'}

In [10]:
# Build the free-text field that is fed to the NER model.
#
# Missing values are filled BEFORE concatenating: adding NaN to a string gives
# NaN, so filling afterwards would blank out the whole text of any row that
# lacks a single attribute.
data = data.fillna(' ')

# Columns that make up the text, in order; every part is separated by a single
# space (including title and gender, which were previously fused together).
text_columns = ['title', 'gender', 'brand', 'color', 'size',
                'category', 'subcategory', 'description']

# astype(str) keeps the join working when a column was parsed as numeric.
text_parts = data[text_columns].astype(str)
data['text'] = text_parts['title'].str.cat(text_parts[text_columns[1:]], sep=' ')

In [11]:
# Index product data: one Elasticsearch document per DataFrame row, using the
# row's index label as the document id.
same_name_fields = [
    "sku", "price", "age group", "brand", "size", "color",
    "category", "subcategory", "maincategory", "gender",
    "image link", "link", "description",
]

for idx, row in data.iterrows():
    # The title is stored under "product_name"; every other field keeps the
    # name of the column it is read from.
    product = {"product_name": row["title"]}
    product.update({field: row[field] for field in same_name_fields})

    # Run NER over the combined text and store the recognised tokens as a
    # single comma-separated string.
    ner_results = ner_pipeline(row["text"])
    entities = ", ".join(ent["word"] for ent in ner_results)
    product["entities"] = entities

    es.index(index=index_name, id=idx, body=product)