In [1]:
#import the libraries

# https://pypi.org/project/elasticsearch/
# !pip install elasticsearch
from elasticsearch import Elasticsearch

# https://pypi.org/project/python-greeklish/
# !pip install python-greeklish
from greeklish.converter import Converter

import pandas as pd

## -- Read the dataset with the products and their details

In [2]:
# read the file with the products and their details
# NOTE(review): unpickling can execute arbitrary code, so only load this file when it comes from a trusted source
df_products = pd.read_pickle('data/df_products_details.pkl')

In [3]:
# print 5 random products
df_products.sample(5)

Unnamed: 0,Code,Product_name,Category,SubCategory,url,Price
344,345,Nokia 5.1 Plus 32GB Dual,Mobiles,Mobile_phone,https://www.bestprice.gr/item/2155455212/nokia...,130.0
969,12947,Συμβατή μπαταρία high quality 5200mAh 10.8-11....,Laptop_pc,Laptop_battery,https://www.bestprice.gr/item/18357051/symbath...,49.99
1514,1515,Hisense F23 16GB,Mobiles,Mobile_phone,https://www.bestprice.gr/item/2155254136/hisen...,124.99
1586,6446,Zealot S27 Blue,Mobiles,Portable_speaker,https://www.bestprice.gr/item/2155686647/zealo...,85.24
323,30982,2-Power 10.8V 2200mAh Lithium-Ion (Li-Ion)(DRN...,Photograph,Photograph_battery,https://www.bestprice.gr/item/44051110/2-power...,28.05


In [4]:
# report the catalogue size; the count is computed once and reused in the message
n_products = len(df_products)
print('There are', n_products, 'products in total. So, we can imagine that we have an eshop with', n_products, 'products.')

There are 32324 products in total. So, we can imagine that that we have an eshop with 32324 products.


## -- Preprocess the dataset

Some of the following preprocessing steps may not be necessary for another dataset

The dataset contains some product names written in Greek, so we will transliterate them to Greeklish (https://en.wikipedia.org/wiki/Greeklish).

In [5]:
# create the greeklish version of every product name
# we can add more letters in the following dictionary
letters_mapping_dict = {'ω':'ο'}
# build the translation table once instead of once per row
letters_table = str.maketrans(letters_mapping_dict)

# create an object for greeklish translation
conv = Converter(max_expansions=1)

# columns of the raw dataset that are kept for the index
source_columns = ['Code', 'Product_name', 'Category', 'SubCategory', 'url', 'Price']

# drop rows with nans first: a missing Product_name has no .lower() and would make the conversion fail
df_products = df_products.dropna(subset=source_columns).copy()

# lowercase the name, apply letters_mapping_dict and keep the first greeklish form returned by the converter
df_products['Product_name_eng'] = df_products['Product_name'].map(
    lambda name: conv.convert(name.lower().translate(letters_table))[0]
)

# keep only specific columns
df_products = df_products[['Code', 'Product_name', 'Product_name_eng', 'Category', 'SubCategory', 'url', 'Price']]

# drop rows with nans if any (this also covers the new Product_name_eng column)
df_products = df_products.dropna()

In [6]:
# print 5 random products after preprocessing
df_products.sample(5)

Unnamed: 0,Code,Product_name,Product_name_eng,Category,SubCategory,url,Price
26,19355,LG 27UD58P,lg 27ud58p,Desktop_pc,Desktop_monitors,https://www.bestprice.gr/item/2154959851/lg-27...,301.75
1834,21163,iiyama S3820HSB-B1,iiyama s3820hsb-b1,Desktop_pc,Desktop_monitors,https://www.bestprice.gr/item/2155566685/iiyam...,0.0
879,5739,Platinet PMG14 Blue,platinet pmg14 blue,Mobiles,Portable_speaker,https://www.bestprice.gr/item/2155661568/plati...,23.9
898,4027,Hoco J38 Comprehensive White 10000mAh,hoco j38 comprehensive white 10000mah,Mobiles,Power_bank,https://www.bestprice.gr/item/2155676838/hoco-...,18.0
554,555,OnePlus 6 256GB Dual,oneplus 6 256gb dual,Mobiles,Mobile_phone,https://www.bestprice.gr/item/2155346731/onepl...,550.0


## -- Create ElasticSearch index

In [7]:
# create a list of dicts with the product details
# this list will be used in order to add products to the elastic search index
# (keys are the field names stored in the index, values come from the dataframe columns)
products_dicts = [
    {'title_eng': row['Product_name_eng'],
     'title_init': row['Product_name'],
     'url': row['url'],
     'Price': row['Price'],
     'Category': row['Category'],
     'SubCategory': row['SubCategory'],
     'Code': row['Code'],
    }
    for _, row in df_products.iterrows()
]

In [8]:
# build the Elasticsearch client and report whether the cluster answers a ping
def connect_elasticsearch():
    """Create a client with the library defaults and print the connection status.

    The client is returned even when the ping fails, so the caller always
    receives an Elasticsearch object.
    """
    # to target an explicit node use: Elasticsearch([{'host': 'localhost', 'port': 9200}])
    client = Elasticsearch()
    status = '- Connected' if client.ping() else '- Problem with the connection'
    print(status)
    return client

es = connect_elasticsearch()

- Connected


In [9]:
# create the index 'my_new_index'
def create_index(es_object, index_name='my_new_index'):
    """Create an index whose analysis settings define the custom search analyzer.

    The settings register 'my_new_analyzer' (standard tokenizer followed by a
    stop-word filter and a synonym filter).

    Parameters
    ----------
    es_object : client object exposing ``indices.create``
    index_name : name of the index to create

    Prints 'Created Index' only when the server did not answer with an error;
    otherwise prints that the index already exists or the reported error.
    Returns None.
    """

    # https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-synonym-tokenfilter.html
    settings = {
                "settings": {
                    "index" : {
                        "analysis" : {
                            "analyzer" : {
                                "my_new_analyzer" : {
                                    "tokenizer" : "standard",
                                    "filter" : ["my_stop", "synonym"]
                                }
                            },
                            "filter" : {
                                "my_stop": {
                                    "type" : "stop",
                                    "stopwords": ["to", "kai"] # we add more stopwords here
                                },
                                "synonym" : {
                                    "type" : "synonym",
                                    "lenient": True,
                                    "synonyms" : ["mobilephone, cellphone => smartphone"] # we add more synonyms here
                                }
                            }
                        }
                    }
                }
            }

    try:
        # ignore=400 makes the client return the error body instead of raising,
        # so the response has to be inspected before reporting success
        response = es_object.indices.create(index=index_name, ignore=400, body=settings)
    except Exception as ex:
        print('Error: ', str(ex))
        return

    error = response['error'] if 'error' in response else None
    if not error:
        print('Created Index')
    elif 'already_exists' in str(error):
        # the existing index (and its settings) is left untouched
        print('Index already exists:', index_name)
    else:
        # any other 400 (e.g. rejected settings) means nothing was created
        print('Error: ', str(error))

# create the index that the following cells fill and query
create_index(es, 'my_new_index')

Created Index


In [10]:
# check if the index exists
# the index name is passed by keyword, like every other client call in this notebook
es.indices.exists(index='my_new_index')

True

In [11]:
# delete the index if you want
#es.indices.delete(index='my_new_index', ignore=[400, 404])

In [12]:
# check the mappings if specified
# (create_index only sends analysis settings and no explicit mappings)
es.indices.get_mapping(index='my_new_index')

{'my_new_index': {'mappings': {}}}

In [13]:
# add products to the index
# the document id is the 1-based position of the product in products_dicts
for doc_id, product in enumerate(products_dicts, start=1):
    es.index(index='my_new_index',
             doc_type='product',
             id=doc_id,
             body=product,
            )

In [14]:
# delete a product from the index based on the id (id=1 means the first product)
# es.delete(index='my_new_index', doc_type='product', id=1)

In [15]:
# get a document based on its id (ids were assigned as 1-based positions when the products were indexed)
res=es.get(index='my_new_index',doc_type='product',id=2)
res

{'_id': '2',
 '_index': 'my_new_index',
 '_primary_term': 1,
 '_seq_no': 1,
 '_source': {'Category': 'Mobiles',
  'Code': 2,
  'Price': 279.0,
  'SubCategory': 'Mobile_phone',
  'title_eng': 'samsung galaxy a70 6gb 128gb dual',
  'title_init': 'Samsung Galaxy A70 6GB 128GB Dual',
  'url': 'https://www.bestprice.gr/item/2155621215/samsung-galaxy-a70-6gb-128gb-dual.html'},
 '_type': 'product',
 '_version': 1,
 'found': True}

## --  Create the function that searches the index based on a query 

In [16]:
# https://www.elastic.co/guide/en/elasticsearch/reference/current/index.html
# https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-match-query.html

In [17]:
def _search_titles(query, category=None):
    """Run the fuzzy title search on 'my_new_index' and return the list of raw hits.

    query    : already normalised (lowercase, greeklish) search text
    category : when given, the hit must also match this value in the 'Category' field
    """
    must_clauses = [
        {'match': {'title_eng': {"query": query,
                                 # https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness
                                 "fuzziness": "AUTO:3,5",
                                 "analyzer": "my_new_analyzer"}}},
    ]
    if category is not None:
        must_clauses.append({'match': {'Category': category}})

    response = es.search(index='my_new_index',
                         body={"from": 0, "size": 1000,
                               'query': {'bool': {'must': must_clauses}}})
    return response['hits']['hits']


def get_results(query, category):
    """Return the search hits for a query, optionally restricted to a category.

    Parameters
    ----------
    query : str
        Free text, or a product code made only of decimal digits.
    category : str
        A value of the 'Category' field, or 'All' to search every category.

    Returns
    -------
    list of dict
        Each item has '_score' and '_source'. A product-code match is looked up
        in df_products and returned as a single item with a fixed score of 100.
        Otherwise the hits come from the index; when a specific category yields
        nothing, the search is repeated across all categories.
    """
    # isdecimal() instead of isdigit(): characters such as '²' pass isdigit() but make int() raise
    if query.isdecimal():
        print('- You are searching a product code')
        df_product = df_products[df_products.Code==int(query)]

        if len(df_product)>0:
            print('-- The product code was found')
            product = df_product.iloc[0]
            # same '_source' fields as a document stored in the index
            return [{'_score': 100,
                     '_source': {'Category': product['Category'],
                                 'SubCategory': product['SubCategory'],
                                 'Code': product['Code'],
                                 'Price': product['Price'],
                                 'title_eng': product['Product_name_eng'],
                                 'title_init': product['Product_name'],
                                 'url': product['url']
                                }}]
        # unknown code: fall through and treat the digits as a normal text query

    # convert the query to lowercase and some letters based on letters_mapping_dict
    query = query.lower().translate(str.maketrans(letters_mapping_dict))
    # convert it to greeklish
    query = conv.convert(query)[0]
    print(query)

    # search the index based on the query
    results = _search_titles(query, None if category=='All' else category)

    # if the search query doesnt have results in the specified category, then check in all categories
    if not results and category!='All':
        print(' - Searching regardless the specified category')
        results = _search_titles(query)

    return results

In [18]:
# search 'Samsung Galaxi' products in the category 'Mobiles'
# note: get_results runs a fuzzy match on the title, so the query does not have to be an exact title word
query = 'Samsung Galaxi'

# use 'All' to search across every category
category = 'Mobiles'

print('- Searching for: "', query, '" in the category: "', category, '".')
# res holds the raw hits (a list of dicts with '_score' and '_source')
res = get_results(query, category)

print('- ', len(res), ' results were returned.')

- Searching for: " Samsung Galaxi " in the category: " Mobiles ".
samsung galaxi
-  209  results were returned.


In [19]:
# check the first result (raises IndexError when res is empty)
res[0]

{'_id': '1613',
 '_index': 'my_new_index',
 '_score': 11.05337,
 '_source': {'Category': 'Mobiles',
  'Code': 1613,
  'Price': 104.3,
  'SubCategory': 'Bluetooth',
  'title_eng': 'samsung galaxy buds',
  'title_init': 'Samsung Galaxy Buds',
  'url': 'https://www.bestprice.gr/item/2155532856/samsung-galaxy-buds.html'},
 '_type': 'product'}

In [20]:
# flatten the raw hits into one row per product
results = [
    {'Code': hit['_source']['Code'],
     'Product_name': hit['_source']['title_init'],
     'title_eng': hit['_source']['title_eng'],
     'Category': hit['_source']['Category'],
     'Price': hit['_source']['Price'],
     'score': hit['_score'],
     'Url': hit['_source']['url'],
    }
    for hit in res
]

df_results = pd.DataFrame(results)

if len(df_results) > 0:
    # optional: keep only results with relevance score higher than 4
    #df_results = df_results[df_results.score>4]

    # sort results based on relevance and Price
    df_results = df_results.sort_values(by=['score', 'Price'], ascending=[False, False])

    # search_flag defines whether the results are produced by elasticsearch (0) or based on popularity (1)
    search_flag = 0
else:
    print('- No results were returned via elasticsearch. So, return popular products.')
    if 'freq' in df_products.columns:
        df_results = df_products.sort_values(by=['freq'], ascending=[False]).head(30)
    else:
        # the preprocessed dataframe keeps no 'freq' (popularity) column, so sorting by it
        # would raise KeyError; fall back to the first 30 products instead
        df_results = df_products.head(30)

    # set the flag==1 - the results are produced based on popularity
    search_flag = 1

# print the dataframe with the results
df_results

Unnamed: 0,Category,Code,Price,Product_name,Url,score,title_eng
0,Mobiles,1613,104.30,Samsung Galaxy Buds,https://www.bestprice.gr/item/2155532856/samsu...,11.053370,samsung galaxy buds
11,Mobiles,23,610.00,Samsung Galaxy S10 512GB Dual,https://www.bestprice.gr/item/2155531586/samsu...,9.965252,samsung galaxy s10 512gb dual
22,Mobiles,48,610.00,Samsung Galaxy S10 512GB Dual,https://www.bestprice.gr/item/2155531586/samsu...,9.965252,samsung galaxy s10 512gb dual
33,Mobiles,73,610.00,Samsung Galaxy S10 512GB Dual,https://www.bestprice.gr/item/2155531586/samsu...,9.965252,samsung galaxy s10 512gb dual
44,Mobiles,98,610.00,Samsung Galaxy S10 512GB Dual,https://www.bestprice.gr/item/2155531586/samsu...,9.965252,samsung galaxy s10 512gb dual
...,...,...,...,...,...,...,...
201,Mobiles,4097,3.39,ΦΟΡΤΙΣΤΗΣ ΑΝΑΓΚΗΣ SAMSUNG D820 VOLTE-TEL LIGHT...,https://www.bestprice.gr/item/40719616/fortist...,4.905380,fortisths anagkhs samsung d820 volte-tel light...
202,Mobiles,4121,3.39,ΦΟΡΤΙΣΤΗΣ ΑΝΑΓΚΗΣ SAMSUNG D820 VOLTE-TEL LIGHT...,https://www.bestprice.gr/item/40719616/fortist...,4.905380,fortisths anagkhs samsung d820 volte-tel light...
206,Mobiles,1406,39.90,Evelatus Samson Dual,https://www.bestprice.gr/item/2155496477/evela...,4.790743,evelatus samson dual
207,Mobiles,2230,7.90,S6/S7 ΓΝΗΣΙΑ ΑΚΟΥΣΤΙΚΑ HANDS FREE ΛΕΥΚΑ ΓΙΑ SA...,https://www.bestprice.gr/item/38121745/s6s7-gn...,4.627721,s6/s7 gnhsia akoustika hands free leuka gia sa...
