In [1]:
#from datetime import datetime
#!pip install elasticsearch --user
from elasticsearch import Elasticsearch
#!pip install python-greeklish --user
from greeklish.converter import Converter
import pandas as pd

from ipywidgets import interact, Dropdown, Text # front end widgets 

In [2]:
# Load the product snapshot that is regenerated every day.
# NOTE: pickle files can execute code on load -- only read trusted files.
# Alternative source kept for reference:
#df_products = pd.read_pickle('/home/ubuntu/Spitishop/Spitishop_RE/search_products_api.pkl')
df_products = pd.read_pickle("data/df_products_new.pkl")

# Rename every column positionally to the names used by the rest of the notebook.
df_products.columns = [
    'Code',
    'Price',
    'title_init',
    'title_eng',
    'Category',
    'Popularity',
    'categories',
    'categories_text',
    'categ_eng',
    'attributes',
    'attributes_text',
    'attr_eng',
    'brand_name',
    'reference',
]

# Independent copy that the search function and the category dropdown read from.
products_search = df_products.copy()

## -- Create a new elastic search index with the new dataset

In [3]:
# One dict per product, restricted to the fields that get stored in the search index.
products = (
    df_products
    .loc[:, [
        'title_eng', 'title_init', 'Price',
        'Category', 'Code', 'Popularity',
        'categ_eng', 'attr_eng', 'reference',
    ]]
    .to_dict(orient='records')
)

In [4]:
# Build the Elasticsearch client used throughout the notebook.
def connect_elasticsearch():
    """Create an Elasticsearch client with default settings and report its ping status.

    Prints '- Connected' when the server answers the ping and
    '- Cannot connect to ElasticSearch' otherwise. The client object is
    returned in both cases.
    """
    client = Elasticsearch()
    status_message = '- Connected' if client.ping() else '- Cannot connect to ElasticSearch'
    print(status_message)
    return client

# Notebook-wide client; the cells below reference it as the global `es`.
es = connect_elasticsearch()

- Connected


In [5]:
# Drop the 'spitishop_testing' index so the cells below can rebuild it.
# HTTP 400/404 responses are ignored instead of raising.
es.indices.delete(index='spitishop_testing', ignore=[400, 404])

{'acknowledged': True}

In [6]:
# Creating an index in ES is comparable to creating a database in an RDBMS.
# With the default name the index is reachable at
# http://localhost:9200/spitishop_testing
def create_index(es_object, index_name='spitishop_testing'):
    """Create `index_name` with the custom analysis settings used for product search.

    The index defines one analyzer, `my_analyzer` (standard tokenizer followed
    by the `my_stop` stop-word filter and the `synonym` filter).

    Parameters
    ----------
    es_object : client exposing `indices.create(...)`.
    index_name : name of the index to create.

    Returns
    -------
    None. The outcome is reported on stdout:
      * 'Created Index' only when the server acknowledged the creation;
      * 'Index not created: ...' when the request was rejected with an
        ignored HTTP 400 (for example the index already exists);
      * 'Error: ...' when the client raised an exception.
    """
    # https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-synonym-tokenfilter.html
    settings = {
        "settings": {
            "index": {
                "analysis": {
                    "analyzer": {
                        "my_analyzer": {
                            "tokenizer": "standard",
                            "filter": ["my_stop", "synonym"]
                        }
                    },
                    "filter": {
                        "my_stop": {
                            "type": "stop",
                            "stopwords": ["to", "kai"]
                        },
                        "synonym": {
                            "type": "synonym",
                            "lenient": True,
                            "synonyms": ["ypnodomatio, bedroom => krebatokamara"]
                        }
                    }
                }
            }
        },
    }

    try:
        # ignore=400 turns an HTTP 400 into a normal return value carrying an
        # error body, so success has to be read from the response itself.
        response = es_object.indices.create(index=index_name, ignore=400, **settings)
        if 'acknowledged' in response and response['acknowledged']:
            print('Created Index')
        else:
            reason = response['error'] if 'error' in response else response
            print('Index not created: ', reason)
    except Exception as ex:
        print('Error: ', str(ex))

In [7]:
# Create the 'spitishop_testing' index using the shared client `es`.
create_index(es, 'spitishop_testing')

Created Index


In [8]:
%%time

# add products to the index
for i, product in enumerate(products[0:]):
    es.index(index='spitishop_testing',
             id=i,
             document=product,
            )


Wall time: 7min 1s


In [17]:
# `i` is left over from the product-indexing cell (last document id used).
# NOTE(review): this cell carries execution count 17, i.e. it was run out of
# order and depends on that leftover variable.
print(i)

29222


In [9]:
# Example of a direct REST lookup by reference code:
# http://localhost:9200/spitishop_testing/product/_search?q=5205495445618
def _fuzzy_match_clause(field, text, boost=None):
    """Return a fuzzy `match` clause on `field`, analysed with `my_analyzer`."""
    options = {
        "query": text,
        # https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness
        "fuzziness": "AUTO:3,5",
        "analyzer": "my_analyzer",
    }
    if boost is not None:
        options["boost"] = boost
    return {"match": {field: options}}


def _hits_to_records(hits):
    """Flatten Elasticsearch hits into the record dicts handed back to the caller."""
    return [
        {
            'Code': hit['_source']['Code'],
            'Product_name': hit['_source']['title_init'],
            'title_eng': hit['_source']['title_eng'],
            'score': hit['_score'],
            'Popularity': hit['_source']['Popularity'],
            'categories': hit['_source']['categ_eng'],
            'attributes': hit['_source']['attr_eng'],
        }
        for hit in hits
    ]


def get_results_spitishop(query, category, exact=False):
    """Search the 'spitishop_testing' index for products matching `query`.

    Lookup order:
      1. If `query` equals a value in `products_search.reference`, the matching
         rows are returned directly with score 1 (no Elasticsearch call).
      2. Otherwise the query is lowercased, 'ω' is mapped to 'ο', it is
         transliterated with the greeklish converter, and Elasticsearch is
         queried (at most 5 hits):
           * category 'All' and `exact`: every term must match `title_eng`;
           * category 'All': fuzzy match over `title_eng` (boost 9),
             `categ_eng` (boost 2) and `attr_eng` (boost 1);
           * any other category: fuzzy `title_eng` match that must also match
             `Category`; retried without the category when nothing is found.
      3. If there are still no hits, the 10 most popular products are
         returned with score 0.

    Parameters
    ----------
    query : str
        Free-text search term or a product reference code.
    category : str
        A value of the `Category` field, or 'All' for no category constraint.
    exact : bool, default False
        Only honoured when `category == 'All'`.

    Returns
    -------
    (records, flag)
        `records` is a list of dicts; `flag` is 'elastic search' or
        'popularity' depending on where the records came from.
    'No match'
        Bare string returned by the exact search when it has no hits.
    None
        When an exception was caught; the error is printed rather than
        silently discarded.
    """
    return_size = 5
    index_name = 'spitishop_testing'
    try:
        # 1) A query that is exactly a product reference short-circuits the search.
        #    The popularity column is named 'Popularity' in `products_search`
        #    (the old 'freq' name no longer exists after the column rename).
        reference_rows = products_search.loc[
            products_search.reference == query, ['Code', 'Popularity']
        ]
        if len(reference_rows) > 0:
            return reference_rows.assign(score=1).to_dict('records'), 'elastic search'

        # 2) Normalise: lowercase and fold the letters in greek_letters_dict.
        greek_letters_dict = {'ω': 'ο'}
        query = query.lower().translate(str.maketrans(greek_letters_dict))

        # Transliterate to greeklish. Guarded because the converter may yield
        # nothing for text it cannot transliterate (presumably input that is
        # already Latin -- TODO confirm); in that case search the text as typed.
        greeklish_forms = Converter(max_expansions=1).convert(query)
        if greeklish_forms:
            query = greeklish_forms[0]

        if category == 'All':
            if exact:
                print("Exact match")
                response = es.search(index=index_name, size=return_size, query={
                    "match": {
                        "title_eng": {
                            "query": query,
                            "operator": "AND"
                        }
                    }
                })
                # if no results are found, return nothing
                if len(response['hits']['hits']) == 0:
                    return 'No match'
            else:
                # Search title, categories and attributes with different boosts.
                # https://www.elastic.co/guide/en/elasticsearch/guide/current/multi-query-strings.html
                response = es.search(index=index_name, size=return_size, query={
                    "bool": {
                        "should": [
                            _fuzzy_match_clause("title_eng", query, boost=9),
                            _fuzzy_match_clause("categ_eng", query, boost=2),
                            _fuzzy_match_clause("attr_eng", query, boost=1),
                        ]
                    }
                })
        else:
            # Restrict the fuzzy title search to the chosen category.
            response = es.search(index=index_name, size=return_size, query={
                'bool': {
                    'must': [
                        _fuzzy_match_clause('title_eng', query),
                        {'match': {'Category': category}},
                    ]
                }
            })

        hits = response['hits']['hits']

        # Nothing in the requested category: retry across all categories.
        if len(hits) == 0 and category != 'All':
            print(' - Searching regardless the specified category')
            response = es.search(index=index_name, size=return_size, query={
                'bool': {
                    'must': [
                        _fuzzy_match_clause('title_eng', query),
                    ]
                }
            })
            hits = response['hits']['hits']

        records = _hits_to_records(hits)

        if records:
            # Order by relevance first, popularity as the tie-breaker.
            df_results = pd.DataFrame(records).sort_values(
                by=['score', 'Popularity'], ascending=[False, False]
            )
            search_flag = 'elastic search'
        else:
            # 3) No hits at all: fall back to the most popular products.
            df_results = (
                products_search
                .sort_values(by=['Popularity'], ascending=[False])
                .head(10)[['Code', 'Popularity']]
                .assign(score=0)
            )
            search_flag = 'popularity'

        return df_results.to_dict('records'), search_flag
    except Exception as ex:
        # Stay best-effort for the interactive widget, but surface the problem.
        print('Error: ', str(ex))
        return None

In [10]:
# result = es.search(
#     index="spitishop_testing",
#     **{"query": {"match_all": {}}})

In [11]:
# Pass the query as keyword arguments to the client's search() method,
# and have it return results:

# result = es.search(index="spitishop_testing",
#                    **{"query": {"bool": {"must": {"match": {
#                        "title_eng": 'mpournouzi'}}}}}, size=10)

In [12]:
# result['hits']['hits'][0:]

In [13]:
# Dropdown choices: every distinct product category plus the catch-all 'All', sorted.
category_list = sorted(products_search.Category.unique().tolist() + ['All'])

In [14]:
# Category selector for the search UI; starts on 'All' (no category constraint).
# The description label is Greek for "Category:".
chosen_category=Dropdown(
    options=category_list,
    value='All',
    description='Κατηγορία:',
)

In [15]:
# Free-text box for the search query. The hint text (Greek for "Search...") is
# supplied as a placeholder; the commented-out line shows it as an initial value instead.
# The description label is Greek for "Product:".
search_term=Text(
  # value='Αναζήτηση...',
    placeholder='Αναζήτηση...',
    description='Προϊόν:',
    disabled=False
)

In [16]:
# Bind the two widgets to the `query` and `category` arguments of
# get_results_spitishop; `exact` is not bound to an explicit widget here.
# The trailing `;` suppresses the cell's own return value.
interact(get_results_spitishop, query= search_term, category=chosen_category);

interactive(children=(Text(value='', description='Προϊόν:', placeholder='Αναζήτηση...'), Dropdown(description=…

In [18]:
# Hand-written sample document with the same fields as the indexed products;
# the cells below index, update and delete it.
fwtis = dict(
    title_eng='Fwtis extend company',
    title_init='',
    Price=100.0,
    Category='Εταιρεία',
    Code=12345,
    Popularity=10.0,
    categ_eng='IT',
    attr_eng='Extend',
    reference='H.1300616',
)

In [19]:
# Index the sample document under id 29223, one past the last product id
# (29222) printed earlier in the notebook.
resp = es.index(index='spitishop_testing',
                id=29223,
                document=fwtis,
               )

In [20]:
# Update document 29223, passing new values for `title_eng` and `attr_eng` only.
resp = es.update(index='spitishop_testing',
                id=29223,
                doc={'title_eng': 'Fwtis SLEED company', 'attr_eng': 'SLEED'},
               )

In [21]:
# Remove the sample document (id 29223) from the index again.
resp = es.delete(index='spitishop_testing',
                id=29223)