# Offline model evaluation and comparison to the current SEBE spellchecker solution

In [2]:
# One-off environment setup, intentionally left commented out; uncomment and run
# only when the kernel environment is missing dependencies.
# NOTE(review): on PyPI the maintained Beautiful Soup package is `beautifulsoup4` —
# confirm which package is intended before re-enabling the pip line.
# ! pip install BeautifulSoup
# !cd .. && conda env update --name spellchecker --file environment.yml --prune

In [18]:
import getpass
import sys
import os
sys.path.append(os.path.join('..','src'))

import pandas as pd
import numpy as np
from google.cloud import bigquery

import config
import utils
import data_utils
import evaluation
from text_cleaner import TextCleaner

# Reload edited project modules automatically before each cell is executed.
%load_ext autoreload
%autoreload 2
# Widen column display so the long per-token result lists shown below are not truncated.
pd.set_option('display.max_colwidth', 1000)

In [19]:
# BigQuery client bound to the SEBE GCP project; passed to
# data_utils.get_search_queries_at when the tracking data is extracted.
client = bigquery.Client(project=config.SEBE_GCP_PROJECT)
# Cleaner applied to the raw search keywords before the test tuples are built.
textCleaner = TextCleaner()

In [20]:
# Test data params
NB_TEST_SAMPLES = 500  # passed to data_utils.get_search_queries_at together with the day below
TRACKING_DATA_DAY = '20220321'  # presumably YYYYMMDD — confirm against data_utils

# Model arguments
# USER_NAME must be filled in before running: it is also the first component of the
# bucket prefix built in the download cell.
USER_NAME = '' # Provide the User Name who trained the model
MODEL_TRAIN_DATE = '1653655366'  # looks like a Unix epoch timestamp in seconds — confirm
ARC_VERSION = 'BILSTM'

# Product index arguments
PROD_INDEX_DATE = '1653655311'  # looks like a Unix epoch timestamp in seconds — confirm
# NOTE(review): PROD_INDEX_EXT is not referenced anywhere in the visible notebook.
PROD_INDEX_EXT = '202204'

### Download model and product index

In [None]:
# Hand the model coordinates to the evaluation module
# (presumably used there to locate the trained model — confirm in evaluation.py).
evaluation.pass_version(USER_NAME, MODEL_TRAIN_DATE, ARC_VERSION)

# Bucket-relative location of the processed product index ...
prefix = os.path.join(
    USER_NAME,
    config.PROD_INDEX_DIR,
    PROD_INDEX_DATE,
    config.PROCESSED_PROD_INDEX_FILE,
)
# ... and the matching local path below the artifacts directory.
pre_process_path = os.path.join(config.ARTIFACTS_DIR, prefix)

utils.download_from_bucket(
    bucket_name=config.DSC_BUCKET,
    prefix=prefix,
    dest_folder=config.ARTIFACTS_DIR,
    project=config.DSC_GCP_PROJECT,
)

### Load the inverted index data

In [28]:
# Load the processed product index that the download cell stored under
# `pre_process_path`. The path is already complete, so it is passed as-is
# (a single-argument os.path.join would just return it unchanged).
words = utils.load_from_json(pre_process_path)
inv_index_keys = words.keys()
# Peek at a few index keys as a sanity check.
list(inv_index_keys)[:10]

['universalrahmen',
 'aufbau',
 'tischladestation',
 'elastikreifen',
 'children',
 'laufkugel',
 'geldscheinpruefstift',
 'schallged',
 'avant',
 'schallgeber']

### Extracting tracking data for a particular day on the search result page

In [21]:
# Query the tracking data for TRACKING_DATA_DAY through the BigQuery client,
# requesting NB_TEST_SAMPLES samples.
actual_data = data_utils.get_search_queries_at(
    client, TRACKING_DATA_DAY, NB_TEST_SAMPLES)
# Show the last rows to check what was fetched.
actual_data.tail(5)

Unnamed: 0,prod_desc_ger,prod_desc_eng,search_keyword
495,LAVOR SCL Quick 36B Bodenreinigungsmasch,Lavor 8.518.0003 1 pc(s),1586744
496,SCHALTBARE STECKDOSE IP44,Sygonix SY-4673756 In-line socket with s,schaltbare steckdose
497,Apple MacBook Pro 13 M1/8C/8C/8GB/256/Sp,Apple MacBook Pro 13 MacBook 12 33.8 cm,laptop 15 6 zoll
498,"7"" Touchscreen, IPS, 1024x600, HDMI, VGA",Joy-it LCD-7-3 LCD 1 pc(s),joy-it display
499,BROTHER BAND TZE-S231 12MM WEISS/SCHWAR,Brother tape TZE-S231 12mm wit/zwart,schriftband brother


Note: Tokens of 4 characters or fewer are not corrected by the spellchecker (they appear as `(token, 0, 0, 0)` in the test tuples below).
Reasons: Such short words are very difficult to correct, and SEBE will only send us words longer than 4 characters.

In [None]:
# Sanity check of qualify_search_keyword on a single-token keyword:
# exactly one tuple must come back, and its first element must be the original token.
to_correct = 'playstation'
res = data_utils.qualify_search_keyword(to_correct, inv_index_keys)
assert len(res) == 1
assert res[0][0] == to_correct
# Disabled check; NOTE(review): `editdistance` is not imported in the visible notebook.
# assert editdistance(res[0][1], res[0][0]) == 3
res

In [12]:
# Build the evaluation set in one chain: clean each keyword, derive its per-token
# test tuples, keep only the derived columns, then drop rows with missing values
# (the literal string 'None' is treated as missing).
test_data = (
    actual_data
    .assign(
        nlp_search_keyword=lambda frame: frame['search_keyword'].apply(textCleaner.clean),
        test_tuples=lambda frame: frame['nlp_search_keyword'].apply(
            lambda keyword: data_utils.qualify_search_keyword(keyword, inv_index_keys)),
    )
    .drop(columns=['prod_desc_ger', 'prod_desc_eng', 'search_keyword'])
    .replace(to_replace='None', value=np.nan)
    .dropna()
)
test_data.tail()
# Tuple layout: (correct word, misspelled word, edit distance, flag), flag=1|0 (1: correct it)
# e.g. [(widerstaende, wuderstsene, 3, 1)]

Unnamed: 0,nlp_search_keyword,test_tuples
491,phoenix contact trio ps 2g 1ac 24dc 3 c2,"[(phoenix, hoenix, 1, 1), (contact, contat, 1, 1), (trio, 0, 0, 0), (ps, 0, 0, 0), (2g, 0, 0, 0), (1ac, 0, 0, 0), (24dc, 0, 0, 0), (3, 0, 0, 0), (c2, 0, 0, 0)]"
492,fritzbox 6590,"[(fritzbox, fritgbox, 2, 1), (6590, 0, 0, 0)]"
496,,[]
497,mini mikrofon,"[(mini, 0, 0, 0), (mikrofon, mikofokn, 2, 1)]"
498,varta micro aaa power akku,"[(varta, vagrta, 1, 1), (micro, mictro, 1, 1), (aaa, 0, 0, 0), (power, öower, 1, 1), (akku, 0, 0, 0)]"


In [13]:
# (rows, columns) remaining after rows with missing values were dropped.
test_data.shape

(354, 2)

In [14]:
# Show one sample row; a bare last expression uses the notebook's rich display
# (same style as the spellcheckers_outputs preview further down).
test_data.iloc[1]

nlp_search_keyword                                                               samsung galaxy tab s7
test_tuples           [(samsung, samsong, 1, 1), (galaxy, galax, 1, 1), (tab, 0, 0, 0), (s7, 0, 0, 0)]
Name: 1, dtype: object


# Performing the evaluation
## Note: set the API Key variable

In [15]:
# Prompt for the key instead of pasting it into the cell, so the secret is never
# stored in the saved notebook. The input is not echoed.
API_KEY = getpass.getpass('API key for the search backend endpoint: ')

In [16]:
# Fail fast with a clear message if the key is missing, blank or not a string.
assert isinstance(API_KEY, str) and API_KEY.strip() != '', \
    'API_KEY is not set - provide the API key before running the evaluation'

In [17]:
# Search endpoint queried by the backend spellchecker; the key travels as a query parameter.
API_ENDPOINT = (
    'https://conrad-nonprod-gcp-cloud.apigee.net/search/1/v3/search/de/de/b2b'
    f'?apikey={API_KEY}'
)

In [18]:
# Work on a copy so the spellchecker result columns added below do not modify test_data.
spellcheckers_outputs = test_data.copy()
# Show one sample row before any predictions are attached.
spellcheckers_outputs.iloc[1]

nlp_search_keyword                                                               samsung galaxy tab s7
test_tuples           [(samsung, samsong, 1, 1), (galaxy, galax, 1, 1), (tab, 0, 0, 0), (s7, 0, 0, 0)]
Name: 1, dtype: object

## Apply ML spellchecker corrections

In [19]:
%time
spellcheckers_outputs['ml_sp'] = spellcheckers_outputs['test_tuples'].apply(
    lambda x: evaluation.ml_spellchecker(x))
spellcheckers_outputs.head()

Wall time: 0 ns


Unnamed: 0,nlp_search_keyword,test_tuples,ml_sp
0,modellbau flugzeuge,"[(modellbau, omdellbau, 2, 1), (flugzeuge, 0, 0, 0)]","[{'actual': 'modellbau', 'noisy': 'omdellbau', 'edit_dist': 2, 'ml_predicted': 'modellbau'}]"
1,samsung galaxy tab s7,"[(samsung, samsong, 1, 1), (galaxy, galax, 1, 1), (tab, 0, 0, 0), (s7, 0, 0, 0)]","[{'actual': 'samsung', 'noisy': 'samsong', 'edit_dist': 1, 'ml_predicted': 'samsung'}, {'actual': 'galaxy', 'noisy': 'galax', 'edit_dist': 1, 'ml_predicted': 'galax'}]"
2,ethernet switch,"[(ethernet, fethekrnet, 2, 1), (switch, witch, 1, 1)]","[{'actual': 'ethernet', 'noisy': 'fethekrnet', 'edit_dist': 2, 'ml_predicted': 'wettercenter'}, {'actual': 'switch', 'noisy': 'witch', 'edit_dist': 1, 'ml_predicted': 'witch'}]"
3,modul 2 berker,"[(modul, moul, 1, 1), (2, 0, 0, 0), (berker, mberker, 1, 1)]","[{'actual': 'modul', 'noisy': 'moul', 'edit_dist': 1, 'ml_predicted': 'mul'}, {'actual': 'berker', 'noisy': 'mberker', 'edit_dist': 1, 'ml_predicted': 'berker'}]"
4,logitech webcam,"[(logitech, loigtech, 2, 1), (webcam, webcav, 1, 1)]","[{'actual': 'logitech', 'noisy': 'loigtech', 'edit_dist': 2, 'ml_predicted': 'logitech'}, {'actual': 'webcam', 'noisy': 'webcav', 'edit_dist': 1, 'ml_predicted': 'webcam'}]"


## Apply Search Backend misspelling corrections

In [20]:
%time
spellcheckers_outputs['be_sp'] = spellcheckers_outputs['test_tuples'].apply(
    lambda x: evaluation.be_spellchecker(x, API_ENDPOINT))
spellcheckers_outputs['be_sp'].head()

Wall time: 0 ns


0                                                                                    [{'actual': 'modellbau', 'noisy': 'omdellbau', 'edit_dist': 2, 'be_predicted': 'modellbau'}]
1          [{'actual': 'samsung', 'noisy': 'samsong', 'edit_dist': 1, 'be_predicted': 'samsung'}, {'actual': 'galaxy', 'noisy': 'galax', 'edit_dist': 1, 'be_predicted': 'None'}]
2         [{'actual': 'ethernet', 'noisy': 'fethekrnet', 'edit_dist': 2, 'be_predicted': 'None'}, {'actual': 'switch', 'noisy': 'witch', 'edit_dist': 1, 'be_predicted': 'None'}]
3             [{'actual': 'modul', 'noisy': 'moul', 'edit_dist': 1, 'be_predicted': 'modul'}, {'actual': 'berker', 'noisy': 'mberker', 'edit_dist': 1, 'be_predicted': 'berker'}]
4    [{'actual': 'logitech', 'noisy': 'loigtech', 'edit_dist': 2, 'be_predicted': 'logitech'}, {'actual': 'webcam', 'noisy': 'webcav', 'edit_dist': 1, 'be_predicted': 'webdav'}]
Name: be_sp, dtype: object

In [21]:
# Explode the per-query result lists so that every row holds one token's result
# dict from each spellchecker; rows whose lists were empty explode to NaN and are dropped.
per_token = (
    spellcheckers_outputs[['ml_sp', 'be_sp']]
    .apply(lambda col: col.explode())
    .dropna()
    .reset_index(drop=True)
)

# Expand the result dicts into columns, backend results first.
ml_vs_be = pd.concat(
    [per_token['be_sp'].apply(pd.Series), per_token['ml_sp'].apply(pd.Series)],
    axis=1,
)

# Both halves carry 'actual', 'noisy' and 'edit_dist'. Keep the first occurrence of
# each column *name*: de-duplicating by value (transpose + drop_duplicates) could also
# drop a prediction column that happens to equal another column, and casts every
# column to object dtype.
ml_vs_be = ml_vs_be.loc[:, ~ml_vs_be.columns.duplicated()]

### Splitting the results of both spellcheckers (ML and BE) by edit distance

In [22]:
# One frame per edit distance. `.copy()` makes each subset independent of ml_vs_be,
# so adding the label columns later does not raise pandas' SettingWithCopyWarning.
result_ed3 = ml_vs_be[ml_vs_be['edit_dist'] == 3].copy()
result_ed2 = ml_vs_be[ml_vs_be['edit_dist'] == 2].copy()
result_ed1 = ml_vs_be[ml_vs_be['edit_dist'] == 1].copy()

In [23]:
# Preview the edit-distance-3 subset before labels are added.
result_ed3.head()

Unnamed: 0,actual,noisy,edit_dist,be_predicted,ml_predicted
10,heizluefter,heizlufeter,3,heizluefter,heizluefter
21,drahtwiderstand,drahtwidersatnd,3,drahtwiderstand,drahtwiderstand
22,krippenbeleuchtung,kripenbeeuchung,3,,krippenbeleuchtung
28,neonroehren,noenroehren,3,neonroehren,neonroehren
44,ueberwachungskameras,uebfrwachurngskacmeras,3,,ueberwachungskameras


In [None]:
columns = ['actual', 'be_predicted', 'ml_predicted']


def add_labels(results):
    """Return a copy of `results` with `label_be` and `label_ml` columns added.

    Each row's `columns` values are passed to `evaluation.create_label`, which
    returns the pair (label_be, label_ml) for that row.
    Working on a copy avoids assigning onto a slice of `ml_vs_be`; an empty
    input gets empty label columns instead of failing on the unpack.
    """
    labelled = results.copy()
    if labelled.empty:
        labelled['label_be'] = []
        labelled['label_ml'] = []
        return labelled
    labels = labelled.apply(lambda row: evaluation.create_label(row[columns]), axis='columns')
    labelled['label_be'], labelled['label_ml'] = zip(*labels)
    return labelled


result_ed1 = add_labels(result_ed1)
result_ed2 = add_labels(result_ed2)
result_ed3 = add_labels(result_ed3)

In [25]:
# Preview the edit-distance-3 subset again, now with the label_be / label_ml columns.
result_ed3.head()

Unnamed: 0,actual,noisy,edit_dist,be_predicted,ml_predicted,label_be,label_ml
10,heizluefter,heizlufeter,3,heizluefter,heizluefter,1,1
21,drahtwiderstand,drahtwidersatnd,3,drahtwiderstand,drahtwiderstand,1,1
22,krippenbeleuchtung,kripenbeeuchung,3,,krippenbeleuchtung,0,1
28,neonroehren,noenroehren,3,neonroehren,neonroehren,1,1
44,ueberwachungskameras,uebfrwachurngskacmeras,3,,ueberwachungskameras,0,1


In [26]:
def get_results(results_edit_dist: pd.DataFrame, edit_dist) -> dict:
    """Compute the share of rows labelled 1 for each spellchecker.

    Parameters
    ----------
    results_edit_dist : DataFrame with the columns `label_ml` and `label_be`.
    edit_dist : value stored under the 'Edit Distance' key and used in the printed line.

    Returns
    -------
    dict with the keys 'Edit Distance', 'ML' and 'BE'. The two scores are
    fractions rounded to 2 decimals, or NaN when the frame has no rows
    (instead of raising ZeroDivisionError).
    """
    val = {'Edit Distance': edit_dist}
    total = len(results_edit_dist)
    for model_name in ['ml', 'be']:
        if total:
            hits = int((results_edit_dist[f'label_{model_name}'] == 1).sum())
            res = round(hits / total, 2)
        else:
            # Nothing to evaluate for this edit distance.
            res = float('nan')
        val[model_name.upper()] = res
        print(f'Performance of {model_name.upper()} Spellchecker over ed {edit_dist} = {res}')
    return val

In [27]:
# Collect one result dict per edit distance, in ascending order.
dict_res = []
for edit_dist, results_frame in ((1, result_ed1), (2, result_ed2), (3, result_ed3)):
    val = get_results(results_frame, edit_dist)
    dict_res.append(val)

Performance of ML Spellchecker over ed 1 = 0.77
Performance of BE Spellchecker over ed 1 = 0.54
Performance of ML Spellchecker over ed 2 = 0.87
Performance of BE Spellchecker over ed 2 = 0.44
Performance of ML Spellchecker over ed 3 = 0.91
Performance of BE Spellchecker over ed 3 = 0.5


In [28]:
# Summary table: one row per edit distance with the ML and BE scores.
pd.DataFrame(dict_res)

Unnamed: 0,Edit Distance,ML,BE
0,1,0.77,0.54
1,2,0.87,0.44
2,3,0.91,0.5
