In [19]:
#from datetime import datetime
#!pip install elasticsearch --user
from elasticsearch import Elasticsearch
#!pip install python-greeklish --user
from greeklish.converter import Converter
import pandas as pd

In [20]:
# create an object for greeklish translation
conv = Converter(max_expansions=1)

In [21]:
# read the file with the spitishop products and their details
df_products = pd.read_pickle('data/df_products.pkl')

In [22]:
# create a column with the title having changed some letters based on greek_letters_dict
greek_letters_dict = {'ω':'ο'}
df_products['Product_name2'] = df_products.apply(lambda x: x["Product_name"].lower().translate(str.maketrans(greek_letters_dict)), axis=1)

# create a column with the greeklish title
df_products['Product_name_eng'] = df_products.apply(lambda x: conv.convert(x["Product_name2"])[0], axis=1)

# keep only specific columns
df_products = df_products[['Code', 'Price', 'Product_name', 'Url', 'Brand', 'Product_name_eng', 'name', 'img_url', 'freq']]

# drop rows with nans
df_products = df_products.dropna()

# drop duplicates
df_products = df_products.drop_duplicates(subset=['Code'], keep='first')

In [23]:
df_products.head(2)

Unnamed: 0,Code,Price,Product_name,Url,Brand,Product_name_eng,name,img_url,freq
0,29866,59.0,Μπουρνούζι Guy Laroche Linda Lilac,https://www.spitishop.gr/μπουρνούζι-guy-laroch...,Guy Laroche,mpournouzi guy laroche linda lilac,Μπάνιο,https://www.spitishop.gr/26083/μπουρνούζι-guy-...,11.0
1,29868,59.0,Μπουρνούζι Guy Laroche Linda Red,https://www.spitishop.gr/μπουρνούζι-guy-laroch...,Guy Laroche,mpournouzi guy laroche linda red,Μπάνιο,https://www.spitishop.gr/26085/μπουρνούζι-guy-...,9.0


In [24]:
df_products.name.unique()

array(['Μπάνιο', 'Παιδικά', 'Κρεβατοκάμαρα', 'Βρεφικά', 'Σαλόνι',
       'Κουζίνα - Τραπεζαρία', 'Μόδα Γάμου', 'Θαλάσσης', 'Διακόσμηση',
       'Χριστουγεννιάτικα', 'Κουρτίνες', 'Χαλιά', 'Προώθηση', 'Κήπος'],
      dtype=object)

In [25]:
df_products.shape

(15050, 9)

In [26]:
# create a list of dicts with the product details
products = []
tmp = df_products.apply(lambda x: products.append({'title_eng':x['Product_name_eng'],
                                                         'title_init':x['Product_name'],
                                                         'url':x['Url'],
                                                         'Price':x['Price'],
                                                         'Brand':x['Brand'],
                                                         'Category':x['name'],
                                                         'img_url':x['img_url'],
                                                         'Code':x['Code'],
                                                         'Popularity':x['freq']
                                                  }), axis=1)

In [27]:
# connect to elasticsearch / create an Elasticsearch object

# create an Elasticsearch object
#es = Elasticsearch()

def connect_elasticsearch():
    _es = None
    #_es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    _es = Elasticsearch()
    if _es.ping():
        print('- Connected')
    else:
        print('- Problem with the connection')
    return _es

es = connect_elasticsearch()

- Connected


In [12]:
def create_index(es_object, index_name='dimostest1'):
    
    # index settings
#     settings = {
#           "settings": {
#             "analysis": {
#               "analyzer": {
#                 "my_analyzer": { 
#                   "type": "standard", # https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-standard-analyzer.html
#                   "stopwords": ["sentoni", "the", 'mpournouzi'],
#                   #"stopwords_path": "stopwords/my_stopwords.txt"
#                 }
#               }
#             }
#           },
#         #"mappings": {}
#     }

    # https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-synonym-tokenfilter.html
    settings = {
                "settings": {
                    "index" : {
                        "analysis" : {
                            "analyzer" : {
                                "my_analyzer" : {
                                    "tokenizer" : "standard",
                                    "filter" : ["my_stop", "synonym"]
                                }
                            },
                            "filter" : {
                                "my_stop": {
                                    "type" : "stop",
                                    "stopwords": ["to", "kai"]
                                },
                                "synonym" : {
                                    "type" : "synonym",
                                    "lenient": True,
                                    "synonyms" : ["ypnodomatio, bedroom => krebatokamara"]
                                }
                            }
                        }
                    }
                }
            }
    
    try:
        #if not es_object.indices.exists(index_name):
        # Ignore 400 means to ignore "Index Already Exist" error.
        es_object.indices.create(index=index_name, ignore=400, body=settings)
        print('Created Index')
    except Exception as ex:
        print('Error: ', str(ex))

create_index(es, 'dimostest1')

Created Index


In [13]:
# settings = {
#                 "settings": {
#                     "index" : {
#                         "analysis" : {
#                             "analyzer" : {
#                                 "synonym" : {
#                                     "tokenizer" : "standard",
#                                     "filter" : ["my_stop", "synonym"]
#                                 }
#                             },
#                             "filter" : {
#                                 "my_stop": {
#                                     "type" : "stop",
#                                     "stopwords": ["sentoni", "the", 'mpournouzi']
#                                 },
#                                 "synonym" : {
#                                     "type" : "synonym",
#                                     "lenient": True,
#                                     "synonyms" : ["foo, bar => baz"]
#                                 }
#                             }
#                         }
#                     }
#                 }
#             }


# es.indices.create(index='dimostest1', ignore=400, body=settings)

In [28]:
# check if the index exists
es.indices.exists('dimostest1')

True

In [11]:
# delete the index
#es.indices.delete(index='dimostest1', ignore=[400, 404])

In [29]:
es.indices.get_mapping(index='dimostest1')

{'dimostest1': {'mappings': {'properties': {'Category': {'type': 'text',
     'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}},
    'Code': {'type': 'long'},
    'Popularity': {'type': 'float'},
    'Price': {'type': 'float'},
    'title_eng': {'type': 'text',
     'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}},
    'title_init': {'type': 'text',
     'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}}}}}

In [17]:
# add products to the index
i = 1
for product in products[0:]:
    es.index(index='dimostest1',
             doc_type='product',
             id=i,
             body=product,
            )
    i += 1

In [18]:
#delete a product from the index
#es.delete(index='dimostest1', doc_type='product', id=1)

In [13]:
# get a document based on its id
res=es.get(index='dimostest1',doc_type='product',id=2)
res

{'_index': 'dimostest1',
 '_type': 'product',
 '_id': '2',
 '_version': 1,
 '_seq_no': 1,
 '_primary_term': 1,
 'found': True,
 '_source': {'title_eng': 'mpournouzi guy laroche linda red',
  'title_init': 'Μπουρνούζι Guy Laroche Linda Red',
  'url': 'https://www.spitishop.gr/μπουρνούζι-guy-laroche-linda-red-29868.html',
  'Price': 59.0,
  'Brand': 'Guy Laroche',
  'Category': 'Μπάνιο',
  'img_url': 'https://www.spitishop.gr/26085/μπουρνούζι-guy-laroche-linda-red.jpg',
  'Code': 29868,
  'Popularity': 9.0}}

In [20]:
# https://www.elastic.co/guide/en/elasticsearch/reference/current/index.html

# https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-match-query.html

In [21]:
# settings = {
#       "settings": {
#         "analysis": {
#           "analyzer": {
#             "my_analyzer": { 
#               "type": "standard", 
#                 "stopwords": ["sentoni", "the", 'mpournouzi']
#             }
#           }
#         }
#       },
#     #"mappings": {}
# }

In [22]:
# settings = {
#     "settings": {
#         "index" : {
#             "analysis" : {
#                 "analyzer" : {
#                     "synonym" : {
#                         "tokenizer" : "standard",
#                         "filter" : ["my_stop", "synonym"]
#                     }
#                 },
#                 "filter" : {
#                     "my_stop": {
#                         "type" : "stop",
#                         "stopwords": ["sentoni", "the", 'mpournouzi']
#                     },
#                     "synonym" : {
#                         "type" : "synonym",
#                         "lenient": True,
#                         "synonyms" : ["kouvertatatatata, keftedoperda => kouberta"]
#                     }
#                 }
#             }
#         }
#     }
# }

In [30]:
def get_results(query, category):
    
    if query.isdigit():
        print('- You are searching a product code')
        df_product = df_products[df_products.Code==int(query)]
        
        if len(df_product)>0:
            print('-- The product code was found')
            results = [{'_score': 100,
                             '_source': {'Brand': df_product.iloc[0]['Brand'],
                                          'Category': df_product.iloc[0]['name'],
                                          'Code': df_product.iloc[0]['Code'],
                                          'Price': df_product.iloc[0]['Price'],
                                          'img_url': df_product.iloc[0]['img_url'],
                                          'title_eng': df_product.iloc[0]['Product_name_eng'],
                                          'title_init': df_product.iloc[0]['Product_name'],
                                          'url': df_product.iloc[0]['Url'],
                                        'Popularity': df_product.iloc[0]['freq']}}]
            return results
            
    
    # convert the query to lowercase and some letters based on greek_letters_dict
    query = query.lower().translate(str.maketrans(greek_letters_dict))
    # convert it to greeklish
    query = conv.convert(query)[0]
    print(query)
                                    
    # search_flag defines whether the results are procuced by elasticsearch or based on popularity
    search_flag = 0 
    
    # search the index based on the query    
    if category=='All':
        # search the index based on the query
        results = es.search(index='dimostest1', body = { "from" : 0, "size" : 1000,
            'query':{
                'bool': {
                  'must': [
                                {'match':{'title_eng':{"query" : query,
                                                       "fuzziness": "AUTO:3,5", # https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness
                                                       #"fuzziness": 2,
                                                       #"fuzzy_transpositions":False,
                                                        "analyzer": "my_analyzer"

                                                      }}},
                                #{'match': {'Category': category}},
                           ],
#                   "filter": [ 
#                                 {"term":{"Category":category}},
#                             ]
                    
                }
            }
        },                  
                           )
    else:
        results = es.search(index='dimostest1', body = { "from" : 0, "size" : 1000,
            'query':{
                'bool': {
                  'must': [
                                {'match':{'title_eng':{"query" : query,
                                                       "fuzziness": "AUTO:3,5", # https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness
                                                       #"fuzziness": 2,
                                                       #"fuzzy_transpositions":False,
                                                        "analyzer": "my_analyzer"

                                                      }}},
                                {'match': {'Category': category}},
                           ],
#                   "filter": [ 
#                                 {"term":{"Category":category}},
#                             ]
                    
                }
            }
        },                  
                           )

    # list of results
    results = results['hits']['hits'][0:]
    
    #print(results)

#     results2 = []
#     results2.append(results[0:5])
#     results2.append(results[5:10])
#     results2.append(results[10:15])
#     results2.append(results[15:20])
#     results2.append(results[20:25])
#     results2.append(results[25:30])
    
    #print(results2[0][0])
    
    # if the search query doesnt have resutls in the specified category, then check in all categories
    if (len(results)==0) & (category!='All'):
        print(' - Searching regardless the specified category')
        results = es.search(index='dimostest1', body = { "from" : 0, "size" : 1000,
            'query':{
                'bool': {
                  'must': [
                                {'match':{'title_eng':{"query" : query,
                                                       "fuzziness": "AUTO:3,5", # https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness
                                                       #"fuzziness": 2,
                                                       #"fuzzy_transpositions":False,
                                                        "analyzer": "my_analyzer"

                                                      }}},
                                #{'match': {'Category': category}},
                           ],
#                   "filter": [ 
#                                 {"term":{"Category":category}},
#                             ]
                    
                }
            }
        },                  
                           )
        results = results['hits']['hits'][0:]
    
    return results

In [45]:
query = 'sdfg'
# convert it to greeklish and lowercase
#query = conv.convert(query)[0].lower()
category = 'All'

print('- Searching for: "', query, '" in the category: "', category, '".')
res = get_results(query, category)

print('- ', len(res), ' results were returned.')

- Searching for: " sdfg " in the category: " All ".
sdfg
-  0  results were returned.


In [42]:
df_products.head(2)

Unnamed: 0,Code,Price,Product_name,Url,Brand,Product_name_eng,name,img_url,freq
0,29866,59.0,Μπουρνούζι Guy Laroche Linda Lilac,https://www.spitishop.gr/μπουρνούζι-guy-laroch...,Guy Laroche,mpournouzi guy laroche linda lilac,Μπάνιο,https://www.spitishop.gr/26083/μπουρνούζι-guy-...,11.0
1,29868,59.0,Μπουρνούζι Guy Laroche Linda Red,https://www.spitishop.gr/μπουρνούζι-guy-laroch...,Guy Laroche,mpournouzi guy laroche linda red,Μπάνιο,https://www.spitishop.gr/26085/μπουρνούζι-guy-...,9.0


In [46]:
res[0]

IndexError: list index out of range

In [47]:
results = []
for i in range(len(res)):
    #print(res[i]['_source']['title_init']+' - ', res[i]['_source']['Price'])
    result = res[i]
    
    results.append({
        'Code':result['_source']['Code'],
        'Product_name':result['_source']['title_init'],
                   'title_eng':result['_source']['title_eng'],
                    'Category':result['_source']['Category'],
                    'Price':result['_source']['Price'],
                    'score':result['_score'],
                    'img_url':result['_source']['img_url'],
                    'Url':result['_source']['url'],
                    'Popularity':result['_source']['Popularity'],
                   })
    
df_results = pd.DataFrame(results)

if len(df_results) > 0:
    # convert list of dicts to pandas dataframe 
    #df_results = pd.DataFrame(results)
    
    # keep only results with relevance score higher that 4
    #df_results = df_results[df_results.score>4]
    
    # sort results based on relevance and Price/popularity
    df_results = df_results.sort_values(by=['score', 'Popularity'], ascending=[False, False])
    
    # search_flag defines whether the results are procuced by elasticsearch or based on popularity
    search_flag = 0

if len(df_results) == 0:
    print('- No results were returned via elasticsearch. So, return popular products.')
    df_results = df_products.sort_values(by=['freq'], ascending=[False]).head(30)
    #df_results = df_results[['Category', 'Price', 'title_eng', 'title_init']]
    
    # set the flag==1 - the resutls are produced based on popularity
    search_flag = 1
    
print(search_flag)

- No results were returned via elasticsearch. So, return popular products.
1


In [18]:
df_results

Unnamed: 0,Category,Code,Popularity,Price,Product_name,Url,img_url,score,title_eng
0,Κρεβατοκάμαρα,77533,17.0,34.20,Πάπλωμα Μονό Nima Balance,https://www.spitishop.gr/πάπλωμα-μονό-nima-bal...,https://www.spitishop.gr/223885/πάπλωμα-μονό-n...,2.737326,paploma mono nima balance
1,Κρεβατοκάμαρα,77534,10.0,45.00,Πάπλωμα Υπέρδιπλο Nima Balance,https://www.spitishop.gr/πάπλωμα-υπέρδιπλο-nim...,https://www.spitishop.gr/223886/πάπλωμα-υπέρδι...,2.737326,paploma yperdiplo nima balance
8,Βρεφικά,126971,3.0,35.70,Πάπλωμα Κούνιας Rythmos Hope,https://www.spitishop.gr/πάπλωμα-κούνιας-rythm...,https://www.spitishop.gr/235468/πάπλωμα-κούνια...,2.737326,paploma kounias rythmos hope
5,Βρεφικά,113958,2.0,35.70,Πάπλωμα Κούνιας Rythmos Flying,https://www.spitishop.gr/πάπλωμα-κούνιας-rythm...,https://www.spitishop.gr/190617/πάπλωμα-κούνια...,2.737326,paploma kounias rythmos flying
6,Βρεφικά,114416,1.0,35.70,Πάπλωμα Κούνιας Rythmos Malo,https://www.spitishop.gr/πάπλωμα-κούνιας-rythm...,https://www.spitishop.gr/191550/πάπλωμα-κούνια...,2.737326,paploma kounias rythmos malo
7,Βρεφικά,114554,1.0,35.70,Πάπλωμα Κούνιας Rythmos Elefante,https://www.spitishop.gr/πάπλωμα-κούνιας-rythm...,https://www.spitishop.gr/191801/πάπλωμα-κούνια...,2.737326,paploma kounias rythmos elefante
2,Βρεφικά,53120,0.0,33.15,Πάπλωμα Κούνιας Viopros Πάτρικ,https://www.spitishop.gr/πάπλωμα-κούνιας-viopr...,https://www.spitishop.gr/58615/πάπλωμα-κούνιας...,2.737326,paploma kounias viopros patrik
3,Βρεφικά,37647,0.0,28.70,Πάπλωμα Κούνιας Viopros Αντζέλ,https://www.spitishop.gr/πάπλωμα-κούνιας-viopr...,https://www.spitishop.gr/37909/πάπλωμα-κούνιας...,2.737326,paploma kounias viopros antzel
4,Βρεφικά,103024,0.0,35.70,Πάπλωμα Κούνιας Rythmos Nautic,https://www.spitishop.gr/πάπλωμα-κούνιας-rythm...,https://www.spitishop.gr/164583/πάπλωμα-κούνια...,2.737326,paploma kounias rythmos nautic
32,Κρεβατοκάμαρα,31749,21.0,35.10,Πάπλωμα Μονό Nef-Nef Hollowfiber,https://www.spitishop.gr/πάπλωμα-μονό-nef-nef-...,https://www.spitishop.gr/97842/πάπλωμα-μονό-ne...,2.552816,paploma mono nef-nef hollowfiber


In [28]:
# https://www.elastic.co/guide/en/elasticsearch/reference/current/search-as-you-type.html