In [1]:
import sys
import math
import time
import numpy as np
import pandas as pd
from symspellpy import SymSpell, Verbosity
from textdistance import levenshtein

In [2]:
# Helper function for timing
def debug_time(msg, init):
    """Write `msg` plus the milliseconds elapsed since `init` to stderr.

    Args:
        msg: Label printed in front of the timing.
        init: Start timestamp, as returned by an earlier time.time() call.
    """
    elapsed_ms = (time.time() - init) * 1000
    report = f"{msg} [{elapsed_ms:.3f}ms]"
    print(report, file=sys.stderr, flush=True)

In [3]:
# Some Constants

# Maximum number of characters kept from each sanitized name.
NAME_MAXLENGTH = 20

# Should strings shorter than NAME_MAXLENGTH be wrapped around (repeated) until
# they reach that length? The author's hunch is that this produces better
# results because it 'controls' for differences in string length.
CYCLE_INPUT_STRING = False

In [4]:
# Load the dataset and keep only the distinct values of its 'licensee_name' column.
# After this cell `df` is a pandas Series (not a DataFrame) of original names.
dataset = "list-of-nea-licensed-eating-establishments.csv"
df = (
    pd.read_csv(dataset)['licensee_name']
    .drop_duplicates()
)
print(df)

0         REPUBLIC HOTELS & RESORTS LIMITED
2                         M.K. RAMA PTE LTD
3             GRAND PARK PROPERTY PTE. LTD.
4                  MILLENIA PRIVATE LIMITED
6              BCH HOTEL INVESTMENT PTE LTD
                        ...                
36682     AINON BTE BADRI ( AINON BTE ALI )
36683         SYED IBRAHIM BIN PEER MOHAMED
36684                      SAITON BINTE ALI
36685                     AMINAH BTE K OMAR
36686    AISHA BEGAM BINTE MOHAMED MUSTHAPA
Name: licensee_name, Length: 22878, dtype: object


In [5]:
# Sanitization function
def sanitize_alpha(data, maxlen=NAME_MAXLENGTH):
    """Reduce a name to lowercase letters only, limited to `maxlen` characters.

    Every character for which str.isalpha() is false (digits, spaces,
    punctuation) is dropped. str.isalpha() is Unicode-aware, so non-ASCII
    letters are kept.

    When CYCLE_INPUT_STRING is true, a result shorter than `maxlen` is repeated
    until it fills `maxlen` characters; otherwise it is simply truncated.

    Args:
        data: The raw string to sanitize.
        maxlen: Maximum length of the returned string.

    Returns:
        The sanitized string. It is empty when `data` contains no letters.
    """
    orig_string = ''.join(c for c in data.lower() if c.isalpha())
    if not orig_string:
        # Nothing to truncate or cycle. Returning here also avoids the
        # ZeroDivisionError that maxlen / len(orig_string) would raise below.
        return orig_string
    if CYCLE_INPUT_STRING:
        # Repeat just enough times to cover maxlen, then cut to size.
        repeats = math.ceil(maxlen / len(orig_string))
        return (orig_string * repeats)[:maxlen]
    return orig_string[:maxlen]

In [6]:
# Normalise every name with sanitize_alpha (lowercase letters only, length-limited),
# then build a lookup from each sanitized key back to its original spelling.
clean_df = df.map(sanitize_alpha)
# NOTE: when several original names sanitize to the same key, the last one in
# `df` overwrites the earlier ones in this mapping.
dictionary = dict(zip(clean_df, df))
print(clean_df)

0        republichotelsresort
2                mkramapteltd
3        grandparkpropertypte
4        milleniaprivatelimit
6        bchhotelinvestmentpt
                 ...         
36682    ainonbtebadriainonbt
36683    syedibrahimbinpeermo
36684          saitonbinteali
36685          aminahbtekomar
36686    aishabegambintemoham
Name: licensee_name, Length: 22878, dtype: object


In [7]:
# Spell-checker index used for fuzzy name lookup. The edit distance of 5 matches the
# max_edit_distance passed to sym_spell.lookup() in query(); keep the two in sync.
# NOTE(review): symspellpy is expected to reject lookups that ask for a larger
# distance than the one configured here; confirm against the library docs.
sym_spell = SymSpell(max_dictionary_edit_distance=5)

In [8]:
# Register every sanitized name with SymSpell (passing 1 as the count argument)
# and report how long the whole build took.
build_start = time.time()
for clean_name in clean_df:
    sym_spell.create_dictionary_entry(clean_name, 1)
debug_time("Dictionary build time", build_start)

Dictionary build time [4254.164ms]


In [14]:
def query(q_string, max_edit_distance=5):
    """Print the closest known licensee names for a (possibly misspelled) query.

    The query is normalised with sanitize_alpha, looked up in `sym_spell` using
    Verbosity.CLOSEST, and every returned suggestion is printed with the
    original name (via `dictionary`) and the distance SymSpell reports.
    The lookup time is written to stderr through debug_time.

    Args:
        q_string: Free-text name to search for.
        max_edit_distance: Largest edit distance to accept. Defaults to 5, the
            value that used to be hard-coded here. Presumably it must not
            exceed the max_dictionary_edit_distance `sym_spell` was built
            with; verify against the symspellpy docs before raising it.
    """
    q_time = time.time()
    suggestions = sym_spell.lookup(
        sanitize_alpha(q_string),
        Verbosity.CLOSEST,
        max_edit_distance=max_edit_distance,
    )
    debug_time("Query time", q_time)
    if not suggestions:
        # Make "no match" explicit instead of silently printing nothing.
        print(f"No suggestions found within edit distance {max_edit_distance}")
        return
    for suggestion in suggestions:
        print(f"Suggested Fix: {dictionary[suggestion.term]} | Levenshtein Distance: {suggestion.distance}")

In [15]:
query("rpblic hotlls resot")

Query time [19.484ms]


Suggested Fix: REPUBLIC HOTELS & RESORTS LIMITED | Levenshtein Distance: 4


In [16]:
query("Chee choon bEng")

Query time [14.424ms]


Suggested Fix: LEE CHOON SENG | Levenshtein Distance: 3
Suggested Fix: CHIEW CHOON SENG | Levenshtein Distance: 3
