In [1]:
from fuzzyname import FuzzyName as Name
from fuzzywuzzy import fuzz
import itertools

In [112]:
def match_names(name1, name2):
    """
    Decide whether two name strings plausibly refer to the same entity.

    Parameters:
    name1 (str): First name string; may be a full or a partial name.
    name2 (str): Second name string, handled exactly like name1.

    Returns:
    bool: True as soon as one of the stages below reports a match, False otherwise.

    Stages, in order:
    1. The untouched inputs are compared through the Name class.
    2. Every character that is not alphanumeric, a space, or a straight apostrophe
       is removed; the result is lowercased and split on whitespace. Identical
       token lists count as a match.
    3. For each name, all ordered pairs of tokens with different values are built,
       skipping single-character tokens so that initials are not matched on their
       own. Each pair of the first name is compared with each pair of the second
       name through the Name class.
    """
    # Stage 1: compare the raw inputs.
    if Name(name1) == Name(name2):
        return True

    kept_symbols = (" ", "'")

    def tokenize(raw):
        # Strip punctuation (spaces and apostrophes survive), lowercase, split.
        cleaned = ''.join(ch for ch in raw if ch.isalnum() or ch in kept_symbols)
        return cleaned.lower().split()

    def ordered_pairs(tokens):
        # Ordered pairs of multi-character tokens whose values differ.
        usable = [tok for tok in tokens if len(tok) > 1]
        return [(first, second) for first in usable for second in usable if first != second]

    tokens1 = tokenize(name1)
    tokens2 = tokenize(name2)

    # Stage 2: exact, case-insensitive agreement of the sanitized tokens.
    if tokens1 == tokens2:
        return True

    # Stage 3: any two-token rendition of one name matching any of the other.
    pairs1 = ordered_pairs(tokens1)
    pairs2 = ordered_pairs(tokens2)
    return any(
        Name(' '.join(left)) == Name(' '.join(right))
        for left in pairs1
        for right in pairs2
    )


In [113]:
def get_name_combinations(name, length, ignore_single_characters=True):
    """
    List every combination of `length` words taken from a lowercased name.

    Parameters:
    name (str): The name to split on whitespace.
    length (int): Number of words in each combination.
    ignore_single_characters (bool): When True, words that have at most one
        character left after removing periods (e.g. "a" or "a.") are dropped
        before combining.

    Returns:
    list[tuple[str, ...]]: Combinations in the order produced by
        itertools.combinations (original word order is preserved inside each tuple).
    """
    words = name.lower().split()
    if ignore_single_characters:
        # Periods are not counted, so an initial written as "j." is discarded too.
        words = [word for word in words if len(word.replace('.', '')) > 1]
    return list(itertools.combinations(words, length))


def match_names_old(name1, name2, ratio=80):
    """
    Earlier name matcher based on fuzz.token_set_ratio over two-word combinations.

    Parameters:
    name1 (str): The first name to compare.
    name2 (str): The second name to compare.
    ratio (int): Minimum token_set_ratio score for two word pairs to count as a match.

    Returns:
    bool: True if the names are considered a match, False otherwise.

    Rules, in order:
    1. Two single-word names match only if they are equal ignoring case.
    2. If name1 (checked first) or name2 consists of exactly two words, one of
       them a single character, it matches only if its word set equals the word
       set of some two-word combination of the other name (single-character
       words are kept for this check).
    3. Otherwise every two-word combination of name1 is scored against every
       two-word combination of name2; one score of at least `ratio` is enough.
    """
    tokens1 = name1.lower().split()
    tokens2 = name2.lower().split()

    # Rule 1: nothing to combine, so fall back to plain equality.
    if len(tokens1) == 1 and len(tokens2) == 1:
        return tokens1[0] == tokens2[0]

    # Rule 2: "initial + word" names require an exact two-word counterpart.
    for tokens, counterpart in ((tokens1, name2), (tokens2, name1)):
        if len(tokens) == 2 and any(len(tok) == 1 for tok in tokens):
            wanted = set(tokens)
            return any(
                wanted == set(pair)
                for pair in get_name_combinations(counterpart, 2, ignore_single_characters=False)
            )

    # Rule 3: fuzzy-score every pair of two-word combinations.
    pairs1 = get_name_combinations(name1, 2)
    pairs2 = get_name_combinations(name2, 2)
    return any(
        fuzz.token_set_ratio(' '.join(left), ' '.join(right)) >= ratio
        for left in pairs1
        for right in pairs2
    )

In [114]:
import re
from unidecode import unidecode
# This is just an experiment: the names in my data are actually all structured first-name-first (shareholders, officers, and founders alike).
class CustomName(Name):
    """
    Experimental Name subclass whose inequality test retries with reversed word order.

    `__ne__` first asks the parent class whether the two names differ. Only when
    the parent reports a difference is the other name's `normalized_name` reversed
    word by word and compared against this name once more.
    """

    def __init__(self, name, exact=False, use_unidecode=True):
        # Forward the arguments unchanged to the parent (Name) constructor.
        super().__init__(name, exact, use_unidecode)

    def _reorder_name(self, name):
        """
        Return `name` with its whitespace-separated parts in reverse order,
        e.g. "lastname middlename firstname" -> "firstname middlename lastname".

        An empty or whitespace-only name yields "" and a single-word name is
        returned without any padding.
        """
        # This is a simple heuristic and may need to be more sophisticated.
        return ' '.join(reversed(name.split()))

    def __ne_new__(self, name1, name2):
        """
        Apply the parent class's `__ne__` to this instance and `name2`.

        `name1` is accepted only to keep the existing call signature; the
        comparison is always made between `self` and `name2`.
        """
        return super().__ne__(name2)

    def __ne__(self, other):
        """
        Return the parent's verdict for (self, reversed other) when the names
        differ as given, and False when the parent does not report a difference.
        """
        # First, try comparing the names as they are (evaluated once).
        differs_as_given = self.__ne_new__(self, other)

        # `== True` is kept on purpose: any non-True result from the parent
        # (including a non-boolean one) is treated as "not different".
        if not differs_as_given == True:  # noqa: E712
            return False

        # The names don't match as written, so retry with the word order reversed.
        reordered_self = self._reorder_name(self.normalized_name)
        reordered_other = self._reorder_name(other.normalized_name)
        reordered_other_name = CustomName(reordered_other, other.exact, use_unidecode=False)
        print('additional_check', reordered_self, reordered_other)

        # Only the other name is reversed for the comparison; `self` stays in its
        # original order, which is what __ne_new__ has always compared against.
        return self.__ne_new__(self, reordered_other_name)


In [115]:
# Demo: the second name repeats three words of the first one in reverse order.
(
    CustomName('Herbert Middle John Aspen Herry')
    == CustomName('Herry Middle Herbert')
)

additional_check herry aspen john middle herbert herbert middle herry


True

In [116]:

# Hand-written (first, second) name pairs used to eyeball the matcher's output.
# They cover reordered parts, initials, titles and suffixes, nicknames, accents,
# spacing/hyphen/apostrophe variants and near-miss spellings.
name_pairs = [
    ('christopher john gregory wade', 'christopher mairs'),
    ('christopher john gregory wade', 'christopher wade'),
    ('christopher john wade', 'wade john christopher'),
    ('J. Smith', 'John Smith'),
    ('John A. Smith', 'John Smith'),
    ('Dr. John Smith', 'John Smith Jr.'),
    ('Di Angelo', 'DiAngelo'),
    ('Liz Smith', 'Elizabeth Smith'),
    ('Anne Smith-Jones', 'Anne Smith Jones'),
    ("D'Angelo", 'Dangelo'),
    ('Smith John', 'John Smith'),
    ('José', 'Jose'),
    ('Marybeth', 'Mary Beth'),
    ('Steven', 'Stephen'),
    ('John A.', 'John A'),
    ('De Marco', 'DeMarco'),
    ('Jonathan Smith IV', 'Jon Smith'),
    ('Geoff', 'Jeff'),
    ('Mikhail', 'Michael'),
    ('Rob Wickham', 'Robert Wickham'),
    ('Rob Wickham', 'Robert Wickham Junior'),
    ('Catherine Smith', 'Kathryn Smith'),
    ('Peggy', 'Margaret'),
    ('Sanchez-Cortez', 'Sanchez Cortez'),
    ('Garcia Marquez', 'Marquez Garcia'),
    # Possible false positives
    ('Anna Smith', 'Ann Smith'),
    ('Anna Smith', 'Anna Nicole Smith'),
    ('Bob Johnson', 'Rob Johnson'),
    ('Katy Brown', 'Katie Brown'),
    ('Alexandra Stevens', 'Alex Stevens'),
    ('Chris P. Bacon', 'Christopher Bacon'),
    ('Patricia de Lisle', 'Patrice Delisle'),
    ('Tony Shalhoub', 'Anthony Shalhoub'),
    ('Bill Roberts', 'Will Robertson'),
    ('Ted Brown', 'Edward Brown'),
    ('Maggie O’Neil', 'Margaret O’Neal'),
    ('Samuel Peters', 'Samantha Peterson'),
    ('Don Matthews', 'Daniel Matthes'),
    ('Harry T.', 'Harriet'),
    ('Sue Y.', 'Susie'),
    ('Jimmy Neutron', 'James Neutron'),
    ('Vickie', 'Victoria'),
    ('Nate', 'Nathan'),
    ('George DeGiacomo', 'DeGiacomo George'),
    ('Dave', 'David'),
    ('Laurie', 'Laurence'),
    ('Christina', 'Kristina'),
    ('Jackie Chen', 'Jack Chen'),
    ('Anna Lee', 'Annalie'),
    ('Nick Johnson', 'Nico Johnsson'),
    ('Isabel Marant', 'Isabella Martin'),
    ('Eddy Edwards', 'Eddie Edwardo'),
    ('Mick P. Mickelson', 'Michael Mickelson'),
    ('Sara Connor', 'Sarah O’Connor'),
    ('Tomás Hernández', 'Tom Hernandes'),
    ('Allan Fisher', 'Allen Fischer'),
    ('Donnie Brasco', 'Danny Brasco'),
    ('Lindsey Lohan', 'Lindsay Loan'),
    ('Rachel Green', 'Rachelle Greene'),
    ('Kristopher Kris', 'Christopher Criss'),
    ('Eva Mendes', 'Ava Mendez'),
    ('Ian O’Shaughnessy', 'Ean O’Shaughnessey'),
    ('Clare Sinclair', 'Claire St. Clair'),
    ('Greggory Peck', 'Gregory Pek'),
    ('Mitchell Mitch', 'Michael Mitch'),
    ('Ricky Martin', 'Rico Martín'),
    ('Mathew Lewis', 'Matthew Louis'),
    ('Helen Hunt', 'Helene Hunter'),
    ('Sidney Poitier', 'Sydney Potier'),
    ('Tonya Harding', 'Tonia Hardin'),
    ('Jim Beam', 'Tim Bean'),
    ('Sandra Bullok', 'Sondra Bullock'),
    ('Vince Vaughn', 'Vincent Van'),
    ('Colin Farrel', 'Collin Farrell'),
    ('Terry Crews', 'Terri Cruise'),
    ('Leo Tolstoy', 'Leon Tolstoi'),
    ('Kiera Knightly', 'Keira Knightley'),
    ('Jay Z', 'J Zee'),
    ('Steven Spielberg', 'Stephen Speelberg'),
    ('Al Pacino', 'Al Pachino'),
    ('Keanu Reaves', 'Keenu Reeves'),
    ('Sigourney Weaver', 'Sigurney Weever')
]


# Print one row per pair: both names and the match_names verdict, each
# left-aligned in a 30-character column.
for a, b in name_pairs:
    print(f"{a:<30} {b:<30} {str(match_names(a, b)):<30}")
print("finish")

christopher john gregory wade  christopher mairs              False                         
christopher john gregory wade  christopher wade               True                          
christopher john wade          wade john christopher          True                          
J. Smith                       John Smith                     True                          
John A. Smith                  John Smith                     True                          
Dr. John Smith                 John Smith Jr.                 True                          
Di Angelo                      DiAngelo                       False                         
Liz Smith                      Elizabeth Smith                True                          
Anne Smith-Jones               Anne Smith Jones               True                          
D'Angelo                       Dangelo                        True                          
Smith John                     John Smith                     True    

In [117]:
# Spot check: the same two words in swapped order (the recorded output for this cell is True).
match_names("Herbert Herry", "Herry Herbert")

True