In [80]:
import numpy as np
import pandas as pd

In [81]:
import textdistance
import rapidfuzz
from rapidfuzz import process
import whoswho
from whoswho import who

In [82]:
# Load the name pairs to be scored; the CSV's first column becomes the row index.
matching = pd.read_csv(
    'ML_Proj/matching.csv',
    index_col=0,
)

### Edit-Based methods

In [83]:
#jaro similarity (this calls textdistance.jaro, not the jaro_winkler variant)
matching['Jaro'] = list(map(textdistance.jaro.similarity,
                            matching['Original Name'], matching['Matches']))

In [84]:
#hamming distance between each original name and its match (raw and normalized)
matching['Hamming Distance'] = list(map(textdistance.hamming.distance,
                                        matching['Original Name'], matching['Matches']))
matching['Hamming Distance Normalized'] = list(map(textdistance.hamming.normalized_distance,
                                                   matching['Original Name'], matching['Matches']))

In [85]:
#fuzzy string matching process score
# Each match is offered as the only choice for its original name, so the first
# (and only) result is taken; element 1 of that result is stored as the score.
matching['Fuzzy Process'] = [
    process.extract(original, [candidate])[0][1]
    for original, candidate in zip(matching['Original Name'], matching['Matches'])
]

### Sequence-Based methods

In [86]:
#longest common subsequence similarity (raw and normalized)
matching['LCSSeq'] = list(map(textdistance.lcsseq.similarity,
                              matching['Original Name'], matching['Matches']))
matching['LCSSeq Normalized'] = list(map(textdistance.lcsseq.normalized_similarity,
                                         matching['Original Name'], matching['Matches']))

In [87]:
#longest common substring similarity (raw and normalized)
matching['LCSStr'] = [
    textdistance.lcsstr.similarity(original, candidate)
    for original, candidate in zip(matching['Original Name'], matching['Matches'])
]
matching['LCSStr Normalized'] = [
    textdistance.lcsstr.normalized_similarity(original, candidate)
    for original, candidate in zip(matching['Original Name'], matching['Matches'])
]

In [88]:
#ratcliff-obershelp similarity between each original name and its match
matching['Ratcliff-Obershelp'] = list(map(textdistance.ratcliff_obershelp.similarity,
                                          matching['Original Name'], matching['Matches']))

### Sound-Based methods

In [89]:
#match rating approach (textdistance.mra) similarity, raw and normalized
matching['MRA'] = [
    textdistance.mra.similarity(original, candidate)
    for original, candidate in zip(matching['Original Name'], matching['Matches'])
]
matching['MRA Normalized'] = [
    textdistance.mra.normalized_similarity(original, candidate)
    for original, candidate in zip(matching['Original Name'], matching['Matches'])
]

### Supplemental methods

In [90]:
# whoswho: who.match result stored as 0/1, plus the who.ratio score
matching['Whos Who'] = [
    int(is_same)
    for is_same in map(who.match, matching['Original Name'], matching['Matches'])
]
matching['Whos Who Score'] = list(map(who.ratio, matching['Original Name'], matching['Matches']))

In [91]:
# details embedded in names
# A match is flagged when at least one of its space-separated tokens appears in
# the corresponding word list (exact, case-sensitive comparison).
prefixsuffix = {'deceased', 'honorable', 'esqr', 'trustees', 'his', 'inlaws', 'and', 'son', 'of'}
matching['prefixsuffix'] = [
    not prefixsuffix.isdisjoint(candidate.split(" "))
    for candidate in matching['Matches']
]
namesuffix = ['jr', 'sr', 'jr.', 'sr.', '1d', '1st', '2d', '2nd', '3d', '3rd', '4d', '4th']
matching['namesuffix'] = [
    any(token in namesuffix for token in candidate.split(" "))
    for candidate in matching['Matches']
]

In [92]:
#which letters are different
def _letters_only_in_match(og, match):
    """Return the sorted, lower-cased characters of `match` that never occur in `og`.

    Comparison is per character and case-insensitive; repeated characters are kept,
    and non-letter characters (spaces, digits, punctuation) are treated like any other.
    """
    # Build the original's character set once per pair instead of once per character.
    og_letters = {o.lower() for o in og}
    return "".join(sorted(m.lower() for m in match if m.lower() not in og_letters))

col_difs = [_letters_only_in_match(og, match)
            for og, match in zip(matching['Original Name'], matching['Matches'])]

# Assign plain lists rather than a default-indexed pd.Series: a Series is aligned on
# `matching`'s index (taken from the CSV's first column), so any index other than
# 0..n-1 would silently produce NaN / misplaced flags. Lists are written by position.
# Each flag is 1 only when the extra characters are exactly the given string.
matching['no differing letters'] = [int(l == "") for l in col_difs]
matching['differing e'] = [int(l == "e") for l in col_difs]
matching['differing s'] = [int(l == "s") for l in col_difs]
matching['differing c'] = [int(l == "c") for l in col_difs]

In [93]:
#name frequency
def findFrequency(wordlst, countlst):
    """Return the mean relative frequency of the words in `wordlst` within `countlst`.

    For every word in `wordlst`, its relative frequency is the number of entries of
    `countlst` equal to it divided by `len(countlst)`; the result is the mean of
    those per-word frequencies.

    Parameters
    ----------
    wordlst : iterable of hashable
        Words to look up (e.g. the tokens of one name).
    countlst : sized iterable of hashable
        The reference pool of words the frequencies are measured against.

    Returns
    -------
    float
        Mean relative frequency. NaN when `wordlst` is empty, 0.0 when `countlst`
        is empty (previously a ZeroDivisionError).
    """
    from collections import Counter  # local import keeps this cell self-contained

    words = list(wordlst)
    if not words:
        # Nothing to average; np.mean([]) would also give NaN but emits a RuntimeWarning.
        return np.nan

    total = len(countlst)
    if total == 0:
        # No word can occur in an empty pool.
        return 0.0

    # Count the pool once instead of rescanning it for every word.
    counts = Counter(countlst)
    pcts = [counts[word] / total for word in words]
    return np.mean(pcts)
# Pool every space-separated token from both name columns (originals first, then
# matches), then score each match by the mean frequency of its tokens in that pool.
total_name_list = [token
                   for full_name in matching['Original Name']
                   for token in full_name.split(" ")]
total_name_list += [token
                    for full_name in matching['Matches']
                    for token in full_name.split(" ")]
matching['frequency'] = [findFrequency(candidate.split(" "), total_name_list)
                         for candidate in matching['Matches']]

In [94]:
# match contains digit
# 1 when any character of the match is a digit, else 0. A plain list is assigned
# (not a default-indexed pd.Series) so values are written by position; a Series
# would be aligned on `matching`'s index and could yield NaN if that index is not 0..n-1.
matching['Contains Digit'] = [int(any(char.isdigit() for char in match))
                              for match in matching['Matches']]

In [95]:
#number of different words
# NOTE(review): despite the column names, the value computed here is the number of
# distinct space-separated words the match SHARES with the original (set
# intersection), not the number that differ -- confirm which is intended.
# The normalized variant divides by the match's token count (duplicates included).
matching['Num Dif Words'] = [
    len(set(original.split(" ")) & set(candidate.split(" ")))
    for original, candidate in zip(matching['Original Name'], matching['Matches'])
]
matching['Num Dif Words Normalized'] = [
    len(set(original.split(" ")) & set(candidate.split(" "))) / len(candidate.split(" "))
    for original, candidate in zip(matching['Original Name'], matching['Matches'])
]

In [96]:
#where the differences were located (first, last, in between)
# NOTE(review): despite the "Dif" column names, each flag is 1 when the compared
# words are IDENTICAL and 0 when they differ -- confirm this is what downstream
# code expects. Values are left as they were to avoid changing existing features.
# "Middle" compares everything between the first and last token; names with one or
# two tokens have an empty middle on both sides and therefore compare equal.
# Plain lists are assigned (not a default-indexed pd.Series) so values are written
# by position; a Series would be aligned on `matching`'s index (taken from the CSV's
# first column) and could silently yield NaN if that index is not 0..n-1.
matching['First Word Dif'] = [int(og.split(" ")[0] == match.split(" ")[0])
                              for og, match in zip(matching['Original Name'], matching['Matches'])]
matching['Middle Word Dif'] = [int(og.split(" ")[1:-1] == match.split(" ")[1:-1])
                               for og, match in zip(matching['Original Name'], matching['Matches'])]
matching['Last Word Dif'] = [int(og.split(" ")[-1] == match.split(" ")[-1])
                             for og, match in zip(matching['Original Name'], matching['Matches'])]