In [8]:

import pandas as pd
import numpy as np
import itertools

# Search Name Permutation (If Needed)

In [6]:
# Each inner list is one group of spellings treated as interchangeable.
sample_1 = ['Shaun', 'Shawn', 'Sean']
sample_2 = ['Lee', 'Li']
sample_3 = ['Lew', 'Liu', 'Liew']

phonetic_list = [sample_1, sample_2, sample_3]

In [7]:
# Display the assembled groups of interchangeable spellings.
phonetic_list

[['Shaun', 'Shawn', 'Sean'], ['Lee', 'Li'], ['Lew', 'Liu', 'Liew']]

In [112]:

def get_all_similar(name, phonetic_list):
    """Return every spelling variant of ``name`` implied by ``phonetic_list``.

    ``name`` is split on single spaces. Each word that occurs in one or more
    groups of ``phonetic_list`` may be replaced by any member of those groups;
    a word that occurs in no group is kept unchanged. The result is the
    Cartesian product of the per-word options, re-joined with single spaces.

    Args:
        name: Full name as a space-separated string.
        phonetic_list: List of groups (lists) of interchangeable spellings.
            Matching is exact and case-sensitive.

    Returns:
        List of name strings, ordered as ``itertools.product`` yields them
        (the last word varies fastest). The input spelling is included
        whenever it is a member of its own group.
    """
    per_word_options = []
    for word in name.split(" "):
        # Merge every group containing this word into ONE option list.
        # Appending each matching group separately would add an extra
        # position to the product and produce names with too many words.
        options = []
        for similar_names_list in phonetic_list:
            if word in similar_names_list:
                for candidate in similar_names_list:
                    if candidate not in options:
                        options.append(candidate)
        # Unknown words stand for themselves.
        per_word_options.append(options or [word])
    return [" ".join(combo) for combo in itertools.product(*per_word_options)]
                

In [113]:
# 'Jun' and 'Jie' are in no group of phonetic_list, so only the first two words vary.
get_all_similar("Sean Lee Jun Jie", phonetic_list)

['Shaun Lee Jun Jie',
 'Shaun Li Jun Jie',
 'Shawn Lee Jun Jie',
 'Shawn Li Jun Jie',
 'Sean Lee Jun Jie',
 'Sean Li Jun Jie']

In [115]:
# 'Li' occurs twice; each occurrence is expanded independently.
get_all_similar("Sean Tan Li Li", phonetic_list)

['Shaun Tan Lee Lee',
 'Shaun Tan Lee Li',
 'Shaun Tan Li Lee',
 'Shaun Tan Li Li',
 'Shawn Tan Lee Lee',
 'Shawn Tan Lee Li',
 'Shawn Tan Li Lee',
 'Shawn Tan Li Li',
 'Sean Tan Lee Lee',
 'Sean Tan Lee Li',
 'Sean Tan Li Lee',
 'Sean Tan Li Li']

# Name Matching

In [3]:
import jellyfish
from fuzzywuzzy import fuzz
import pandas as pd
# Load the sample name sheet. The path is relative, so it is resolved against
# the kernel's working directory.
df = pd.read_excel("NUS sample names_V2.xlsx", engine="openpyxl")



## Integer-based edit distances may be too strict and cannot handle reordered names

In [8]:
# Edit distance for close spellings, then for a full name with its words reordered.
for left, right in [
    ("Sean", "Shawn"),
    ("Li", "Lee"),
    ("Tim", "Jim"),
    ("Teo Cheng Kiat", "Cheng Kiat Teo"),
    ("Teo Cheng Kiat", "Cheng Kiat Teo"),
    ("Teo Cheng Kiat", "Cheng Kiat Teo"),
]:
    print(jellyfish.levenshtein_distance(left, right))

2
2
1
8
8
8


## Token Sort Ratio for name matching: wrong orders and phonetics/typos

In [6]:
# Same words in a different order: compare the three fuzzywuzzy scorers.
for scorer in (fuzz.ratio, fuzz.partial_ratio, fuzz.token_sort_ratio):
    print(scorer("Teo Cheng Kiat", "Cheng Kiat Teo"))

71
71
100


In [10]:
# Same word order with a one-letter spelling difference.
for scorer in (fuzz.ratio, fuzz.partial_ratio, fuzz.token_sort_ratio):
    print(scorer("Teo Cheng Kiat", "Teo Cheng Keat"))

93
93
93


In [104]:

def test_name_matching(name1, name2):
    """Print fuzzy scores and phonetic encodings for a pair of names.

    Prints a header line, the token-sort and partial token-sort ratios of the
    pair, and then the Soundex, Metaphone, NYSIIS and match-rating codex
    encodings of each name. Encodings of ``name1`` are labelled "Actual" and
    those of ``name2`` are labelled "Error".

    Args:
        name1: First name (printed under the "Actual" label).
        name2: Second name (printed under the "Error" label).

    Returns:
        None. All results are written to stdout.
    """
    print("\n\n---", name1, " AND ", name2, "---")

    pair_scorers = (
        ("Token Sort Ratio", fuzz.token_sort_ratio),
        ("Partial Token Sort Ratio", fuzz.partial_token_sort_ratio),
    )
    for label, scorer in pair_scorers:
        print("(" + label + ") Score: ", scorer(name1, name2))

    encoders = (
        ("Soundex", jellyfish.soundex),
        ("Metaphone", jellyfish.metaphone),
        ("NYSIIS", jellyfish.nysiis),
        ("Codex", jellyfish.match_rating_codex),
    )
    for label, encode in encoders:
        print("(" + label + ") Actual: ", encode(name1))
        print("(" + label + ") Error: ", encode(name2))

In [28]:
# Keep only the rows whose variation type is a spelling/phonetic change.
is_phonetic_variant = df['Type of variation (if any)'] == 'Spelling/Phonetic'
phonetics_df = df.loc[is_phonetic_variant]
phonetics_df.head(5)

Unnamed: 0,Name to be screened,Alias name,Date of birth,Gender,Nationality,Type of variation (if any),Actual name
2,Chia Tek Leng,,1960,Male,Singapore,Spelling/Phonetic,Chia Teck Leng
5,Teo Cheng Keat,,,Male,Singapore,Spelling/Phonetic,Teo Cheng Kiat
9,Chong Hock Yan,,1960,Male,Singapore,Spelling/Phonetic,Chong Hock Yen
17,Lage Vivien,,1997,Female,Singapore,Spelling/Phonetic,Lange Vivian
24,Noordin Muhammad Top,,1968-08-11 00:00:00,Male,Malaysia,Spelling/Phonetic,Noordin Mohammad Top


In [53]:
# Build [screened name, actual name] pairs for the phonetic-variant rows.
# Columns are selected by label: integer keys such as row[0] / row[6] on a
# string-labelled row are positional lookups that recent pandas deprecates.
phonetics_master_list = phonetics_df[
    ["Name to be screened", "Actual name"]
].values.tolist()

In [54]:
# Inspect the collected [screened name, actual name] pairs.
phonetics_master_list

[['Chia Tek Leng', 'Chia Teck Leng'],
 ['Teo Cheng Keat', 'Teo Cheng Kiat'],
 ['Chong Hock Yan', 'Chong Hock Yen'],
 ['Lage Vivien', 'Lange Vivian'],
 ['Noordin Muhammad Top', 'Noordin Mohammad Top'],
 ['Jan Sturzenegger', 'Jen Sturzenegger'],
 ['Rodrygo Duterte Roa', 'Rodrigo Roa Duterte'],
 ['Ameen Imam Boratong', 'Amin Imam Boratong'],
 ['Hose Calida', 'Jose Calida'],
 ['Erwin Tulpho', 'Erwin Tulfo'],
 ['Alan Burisima', 'Alan Purisima'],
 ['Park Chun il', 'Pak Chun il']]

In [55]:
# Print the comparison report for every phonetic-variant pair.
# NOTE(review): test_name_matching labels its first argument "Actual", but the
# first element of each pair here is the screened name - confirm the intended order.
for screened_name, actual_name in phonetics_master_list:
    test_name_matching(screened_name, actual_name)




--- Chia Tek Leng  AND  Chia Teck Leng ---
(Token Sort Ratio) Score:  96
(Soundex) Actual:  C324
(Soundex) Error:  C324
(Metaphone) Actual:  X TK LNK
(Metaphone) Error:  X TK LNK
(NYSIIS) Actual:  C
(NYSIIS) Error:  C
(Codex) Actual:  CHTLNG
(Codex) Error:  CHTLNG


--- Teo Cheng Keat  AND  Teo Cheng Kiat ---
(Token Sort Ratio) Score:  93
(Soundex) Actual:  T252
(Soundex) Error:  T252
(Metaphone) Actual:  T XNK KT
(Metaphone) Error:  T XNK KT
(NYSIIS) Actual:  T
(NYSIIS) Error:  T
(Codex) Actual:  TCHGKT
(Codex) Error:  TCHGKT


--- Chong Hock Yan  AND  Chong Hock Yen ---
(Token Sort Ratio) Score:  93
(Soundex) Actual:  C522
(Soundex) Error:  C522
(Metaphone) Actual:  XNK HK YN
(Metaphone) Error:  XNK HK YN
(NYSIIS) Actual:  CANG
(NYSIIS) Error:  CANG
(Codex) Actual:  CHNKYN
(Codex) Error:  CHNKYN


--- Lage Vivien  AND  Lange Vivian ---
(Token Sort Ratio) Score:  87
(Soundex) Actual:  L211
(Soundex) Error:  L521
(Metaphone) Actual:  LJ FFN
(Metaphone) Error:  LNJ FFN
(NYSIIS) Actual

In [92]:
# Two different surnames compared against the same reference name.
for candidate in ('Benedict Loh', 'Benedict Lim'):
    test_name_matching(candidate, 'Benedict Low')




--- Benedict Loh  AND  Benedict Low ---
(Token Sort Ratio) Score:  92
(Partial Token Sort Ratio) Score:  92
(Soundex) Actual:  B532
(Soundex) Error:  B532
(Metaphone) Actual:  BNTKT L
(Metaphone) Error:  BNTKT L
(NYSIIS) Actual:  BANADACT
(NYSIIS) Error:  BANADACT
(Codex) Actual:  BNDTLH
(Codex) Error:  BNDTLW


--- Benedict Lim  AND  Benedict Low ---
(Token Sort Ratio) Score:  83
(Partial Token Sort Ratio) Score:  83
(Soundex) Actual:  B532
(Soundex) Error:  B532
(Metaphone) Actual:  BNTKT LM
(Metaphone) Error:  BNTKT L
(NYSIIS) Actual:  BANADACT
(NYSIIS) Error:  BANADACT
(Codex) Actual:  BNDTLM
(Codex) Error:  BNDTLW


In [94]:
# Surname variant first, then given-name variant.
for first, second in [('Shawn Li', 'Shawn Lee'), ('Sean Lee', 'Shaun Lee')]:
    test_name_matching(first, second)



--- Shawn Li  AND  Shawn Lee ---
(Token Sort Ratio) Score:  82
(Partial Token Sort Ratio) Score:  75
(Soundex) Actual:  S540
(Soundex) Error:  S540
(Metaphone) Actual:  XN L
(Metaphone) Error:  XN L
(NYSIIS) Actual:  SAN
(NYSIIS) Error:  SAN
(Codex) Actual:  SHWNL
(Codex) Error:  SHWNL


--- Sean Lee  AND  Shaun Lee ---
(Token Sort Ratio) Score:  82
(Partial Token Sort Ratio) Score:  75
(Soundex) Actual:  S540
(Soundex) Error:  S540
(Metaphone) Actual:  SN L
(Metaphone) Error:  XN L
(NYSIIS) Actual:  SAN
(NYSIIS) Error:  SAN
(Codex) Actual:  SNL
(Codex) Error:  SHNL


In [74]:
# First 50 rows of the sheet, used as a smaller test set.
df_50 = df.head(50)

In [102]:
def name_matching(name1, name2, threshold=90):
    """Print a diagnostic report for a name pair that fails the fuzzy threshold.

    The pair is scored with ``fuzz.token_sort_ratio``. A score of at least
    ``threshold`` is let through silently. Otherwise the function prints the
    plain, partial and partial-token-sort ratios, then the Soundex, Metaphone,
    NYSIIS and match-rating codex encodings of both names (``name1`` labelled
    "Actual", ``name2`` labelled "Error").

    This definition was previously commented out although a later cell calls
    ``name_matching``, so that cell failed with NameError on a fresh kernel.

    Args:
        name1: First name, as a string.
        name2: Second name, as a string.
        threshold: Minimum token-sort score that counts as a match
            (default 90, the value used by the original code).

    Returns:
        None. The report is written to stdout.
    """
    if fuzz.token_sort_ratio(name1, name2) >= threshold:
        return  # let through

    print("FAILS: " + name1 + ' and ' + name2)
    print("(Fuzz Ratio) Score: ", fuzz.ratio(name1, name2))
    print("(Partial Ratio) Score: ", fuzz.partial_ratio(name1, name2))
    print("(Partial Token Sort Ratio) Score: ", fuzz.partial_token_sort_ratio(name1, name2))
    print("(Soundex) Actual: ", jellyfish.soundex(name1))
    print("(Soundex) Error: ", jellyfish.soundex(name2))
    print("(Metaphone) Actual: ", jellyfish.metaphone(name1))
    print("(Metaphone) Error: ", jellyfish.metaphone(name2))
    print("(NYSIIS) Actual: ", jellyfish.nysiis(name1))
    print("(NYSIIS) Error: ", jellyfish.nysiis(name2))
    print("(Codex) Actual: ", jellyfish.match_rating_codex(name1))
    print("(Codex) Error: ", jellyfish.match_rating_codex(name2))
    print("\n")

        
        
    # print("\n\n---",name1," AND ",name2, "---")
    # print("(Token Sort Ratio) Score: ", fuzz.token_sort_ratio(name1, name2))
    # print("(Soundex) Actual: ", jellyfish.soundex(name1))
    # print("(Soundex) Error: ", jellyfish.soundex(name2))
    # print("(Metaphone) Actual: ", jellyfish.metaphone(name1))
    # print("(Metaphone) Error: ", jellyfish.metaphone(name2))      
    # print("(NYSIIS) Actual: ", jellyfish.nysiis(name1))
    # print("(NYSIIS) Error: ", jellyfish.nysiis(name2))    
    # print("(Codex) Actual: ", jellyfish.match_rating_codex(name1))
    # print("(Codex) Error: ", jellyfish.match_rating_codex(name2)) 

In [103]:
# Build [screened name, actual name] pairs from the 50-row subset.
# Columns are selected by label: integer keys such as row[0] / row[6] on a
# string-labelled row are positional lookups that recent pandas deprecates.
test_list = df_50[["Name to be screened", "Actual name"]].values.tolist()
for screened_name, actual_name in test_list:
    name_matching(screened_name, actual_name)

FAILS: Lim Oon Quin and Lim Oon Kuin
(Fuzz Ratio) Score:  92
(Partial Ratio) Score:  92
(Partial Token Sort Ratio) Score:  74
(Soundex) Actual:  L552
(Soundex) Error:  L552
(Metaphone) Actual:  LM N KN
(Metaphone) Error:  LM N KN
(NYSIIS) Actual:  LAN
(NYSIIS) Error:  LAN
(Codex) Actual:  LMNQN
(Codex) Error:  LMNKN


FAILS: Lage Vivien and Lange Vivian
(Fuzz Ratio) Score:  87
(Partial Ratio) Score:  82
(Partial Token Sort Ratio) Score:  82
(Soundex) Actual:  L211
(Soundex) Error:  L521
(Metaphone) Actual:  LJ FFN
(Metaphone) Error:  LNJ FFN
(NYSIIS) Actual:  LAG
(NYSIIS) Error:  LANG
(Codex) Actual:  LGVN
(Codex) Error:  LNGVN


FAILS: Laeg Vivian and Lange Vivian
(Fuzz Ratio) Score:  87
(Partial Ratio) Score:  82
(Partial Token Sort Ratio) Score:  82
(Soundex) Actual:  L211
(Soundex) Error:  L521
(Metaphone) Actual:  LK FFN
(Metaphone) Error:  LNJ FFN
(NYSIIS) Actual:  LAG
(NYSIIS) Error:  LANG
(Codex) Actual:  LGVN
(Codex) Error:  LNGVN


FAILS: Mas Salamat Kestari and Mas Selamat K

In [112]:



# Two spellings of the given name with an identical surname.
test_name_matching("Shawn Lee" , "Sean Lee" )



--- Shawn Lee  AND  Sean Lee ---
(Token Sort Ratio) Score:  82
(Partial Token Sort Ratio) Score:  75
(Soundex) Actual:  S540
(Soundex) Error:  S540
(Metaphone) Actual:  XN L
(Metaphone) Error:  SN L
(NYSIIS) Actual:  SAN
(NYSIIS) Error:  SAN
(Codex) Actual:  SHWNL
(Codex) Error:  SNL


In [14]:

def split_name_list(name):
    """Lower-case ``name`` and split it on single space characters.

    Consecutive spaces therefore produce empty strings in the result.

    Args:
        name: Name string to tokenise.

    Returns:
        List of lower-cased tokens.
    """
    return name.lower().split(" ")

# Sample token lists: the two names differ in word order and in the spelling of one word.
split1 = split_name_list("Shawn Lee")
split2 = split_name_list("Lee Sean")
# split1 = split_name_list("Lage Vivien")
# split2 = split_name_list("Lange Vivian")

In [23]:

def phonetic_comparison(list1, list2):
    """Tell whether two token lists share the same phonetic codes.

    Every token of both lists is encoded with Metaphone and with NYSIIS. The
    codes are compared as sets, so token order and repeated tokens do not
    affect the outcome.

    Args:
        list1: Tokens of the first name.
        list2: Tokens of the second name.

    Returns:
        True when the Metaphone code sets are equal or the NYSIIS code sets
        are equal; False otherwise.
    """
    def codes(encoder, tokens):
        return {encoder(token) for token in tokens}

    same_metaphone = codes(jellyfish.metaphone, list1) == codes(jellyfish.metaphone, list2)
    same_nysiis = codes(jellyfish.nysiis, list1) == codes(jellyfish.nysiis, list2)
    return same_metaphone or same_nysiis
    # print(set(meta_list1) == set(meta_list2))
    
# Compare the two sample token lists; the comparison ignores word order.
phonetic_comparison(split1, split2)

True

In [6]:

# Standardised spelling -> variant spellings. preprocess_name replaces any word
# found in a value list with the corresponding key. All entries are lower-case.
names_dict = {
    'gan': ['kan', 'ban'],
    'shawn': ['sean', 'shon', 'shaun'],
    'huang': ['wang'],
    'michael': ['mikael'],
    'joko': ['djoko'],
    # 'budy': ['budi'],
    'jang': ['chang'],
    'song': ['sung', 'seong'],
    'jo': ['cho'],
    'jun': ['chun'],
    'yong': ['ryong'],
    'jong': ['jung'],
    'hyeok': ['hyok'],
    'mun': ['moon'],
    'zhi': ['zih'],
    'qian': ['chian'],
    'kuin': ['quin'],
}

In [19]:
def preprocess_name(names_dict, word):
    """Map ``word`` to its standardised spelling, if it has one.

    Args:
        names_dict: Mapping of standardised spelling -> collection of variants.
        word: Single word to look up.

    Returns:
        The first key (in dictionary order) whose variants contain ``word``,
        or ``word`` itself when no entry lists it.
    """
    matching_keys = (
        canonical
        for canonical, variants in names_dict.items()
        if word in variants
    )
    return next(matching_keys, word)
    
def stitch_name(list1):
    """Join the words of ``list1`` into one string separated by single spaces.

    Args:
        list1: Sequence of word strings.

    Returns:
        The joined string; an empty sequence gives ``''``.
    """
    return ' '.join(list1)

In [20]:
# Quick check: the words come back joined by a single space.
stitch_name(['lionel', 'lew'])

'lionel lew'

In [10]:



# Algo flow
# 1) Input 2 names - DONE

def test_compiled_algo(name1, name2, names_dict, threshold=89):
    """Match two names with a fuzzy score, then fall back to phonetic codes.

    Algorithm:
      1. Lower-case both names and split them on single spaces.
      2. Replace every word listed as a variant in ``names_dict`` by its key.
      3. First layer: score the re-joined names with
         ``fuzz.token_sort_ratio``; a score >= ``threshold`` passes.
      4. Second layer (only if the first fails): compare the words' Metaphone
         codes and NYSIIS codes; a match on either encoding passes.

    The outcome is printed. The token-sort score is returned in every case,
    including a second-layer pass (where it is below ``threshold``).

    Args:
        name1: First name, as a string.
        name2: Second name, as a string.
        names_dict: Mapping of standardised word -> list of variant spellings.
        threshold: Minimum token-sort score for a first-layer pass
            (default 89, the value previously hard-coded).

    Returns:
        The ``fuzz.token_sort_ratio`` score of the normalised names.
    """

    def split_name_list(name):
        # Lower-case, then split on single spaces.
        return name.lower().split(" ")

    def preprocess_name(names_dict, word):
        # First key whose variant list contains the word, else the word itself.
        for key, value in names_dict.items():
            if word in value:
                return key
        return word

    def phonetic_comparison(list1, list2):
        # NOTE(review): codes are compared as sets, so repeated words and word
        # counts are ignored - confirm that this is intended.
        meta_1 = {jellyfish.metaphone(word) for word in list1}
        meta_2 = {jellyfish.metaphone(word) for word in list2}
        nysiis_1 = {jellyfish.nysiis(word) for word in list1}
        nysiis_2 = {jellyfish.nysiis(word) for word in list2}
        return meta_1 == meta_2 or nysiis_1 == nysiis_2

    # Steps 1-2: split each name and map every word to its standardised spelling.
    split_list_1 = [preprocess_name(names_dict, word) for word in split_name_list(name1)]
    split_list_2 = [preprocess_name(names_dict, word) for word in split_name_list(name2)]

    stitched_name1 = " ".join(split_list_1)
    stitched_name2 = " ".join(split_list_2)

    # Step 3: first layer - token sort ratio against the threshold.
    score1 = fuzz.token_sort_ratio(stitched_name1, stitched_name2)
    if score1 >= threshold:
        print("Pass first layer Token Sort Ratio score of: " + str(score1) + " for names: " + name1 + ' with ' + name2)
    # Step 4: second layer - Metaphone and NYSIIS phonetic encoding.
    elif phonetic_comparison(split_list_1, split_list_2):
        # Open question from the original author: which score to report for a
        # phonetic pass (mean of first-layer passes, the threshold itself, or
        # 100 because the names sound identical). Until that is decided the
        # message carries no score and score1 is returned unchanged.
        print("Pass second layer Phonetic Matching for names: " + name1 + ' with ' + name2)
    else:
        print("Failed to match, score of " + str(score1) + ": " + name1 + ' with ' + name2)

    return score1
                
        
        
        
# Problems:
# 1) Don't think it can handle big typos
# 2) Is it assumed that every "Bern" is Bernard, every "Jess" is Jessica, etc.?

In [13]:
# df_20 = df.sample(20)
# Build [screened name, actual name] pairs for the whole sheet.
# Columns are selected by label: integer keys such as row[0] / row[6] on a
# string-labelled row are positional lookups that recent pandas deprecates.
test_list = df[["Name to be screened", "Actual name"]].values.tolist()
# print(test_list)
score_list = []
skipped_pairs = []  # pairs left unscored because a name is not text
for screened_name, actual_name in test_list:
    # The previous bare `except: pass` hid every failure. The expected one is
    # a non-string name (presumably an empty cell read as NaN), so skip those
    # explicitly and let any other error surface.
    if not (isinstance(screened_name, str) and isinstance(actual_name, str)):
        skipped_pairs.append([screened_name, actual_name])
        continue
    score_list.append(test_compiled_algo(screened_name, actual_name, names_dict))
    

Pass first layer Token Sort Ratio score of: 100 for names: Chia Teck Leng with Chia Teck Leng
Pass first layer Token Sort Ratio score of: 100 for names: Teck Leng Chia with Chia Teck Leng
Pass first layer Token Sort Ratio score of: 96 for names: Chia Tek Leng with Chia Teck Leng
Pass first layer Token Sort Ratio score of: 100 for names: Teo Cheng Kiat with Teo Cheng Kiat
Pass first layer Token Sort Ratio score of: 100 for names: Cheng Kiat Teo with Teo Cheng Kiat
Pass first layer Token Sort Ratio score of: 93 for names: Teo Cheng Keat with Teo Cheng Kiat
Pass first layer Token Sort Ratio score of: 96 for names: Teo Chen Kiat with Teo Cheng Kiat
Pass first layer Token Sort Ratio score of: 100 for names: Chong Hock Yen with Chong Hock Yen
Pass first layer Token Sort Ratio score of: 100 for names: Hock Yen Chong with Chong Hock Yen
Pass first layer Token Sort Ratio score of: 93 for names: Chong Hock Yan with Chong Hock Yen
Pass first layer Token Sort Ratio score of: 100 for names: Lim Oon

In [16]:
# Mean of the collected scores; raises ZeroDivisionError if score_list is empty.
print(sum(score_list)/len(score_list))

93.38541666666667


In [1]:
 fruit_info =('fruit': 'apple', 'count': 2,'price': 3.5 ).dict()

SyntaxError: invalid syntax (1117248818.py, line 1)