# Step 1: Data Cleaning

In [1]:
import pandas as pd
left_df = pd.read_csv('../data/left_dataset.csv')
right_df = pd.read_csv('../data/right_dataset.csv')

In [2]:
print("Missing values in left_df:")
print(left_df.isnull().sum())

print("Missing values in right_df:")
print(right_df.isnull().sum())

Missing values in left_df:
entity_id         0
name              0
address        2798
city              0
state             0
postal_code      37
categories       62
dtype: int64
Missing values in right_df:
business_id    0
name           0
address        0
city           0
state          0
zip_code       0
size           0
dtype: int64


In [3]:
#pip install skimpy

In [3]:
# Clean and Standardize Postal Codes

# Clean postal_code in left_df by converting to integer
left_df['postal_code'] = left_df['postal_code'].fillna(0).astype(int).astype(str)

# Clean postal_code in right_df by slicing to keep only the first 5 digits
right_df['zip_code'] = right_df['zip_code'].str.slice(0, 5)

In [4]:
# Convert all columns to string
left_df = left_df.astype(str)
right_df = right_df.astype(str)

In [5]:
def clean_dataframe(df):
    def clean_text_column(column):
        column = column.str.lower()  # Convert to lower case
        column = column.str.replace(r'[^\w\s]', '', regex=True)  # Remove punctuation
        column = column.str.strip()  # Strip whitespaces
        return column
    # Apply the cleaning function to each column in the DataFrame that is of object type
    for column in df.columns:
        if df[column].dtype == 'object':
            df[column] = clean_text_column(df[column])
    return df

In [6]:
left_df = clean_dataframe(left_df)
right_df = clean_dataframe(right_df)

In [7]:
from skimpy import skim
skim(left_df)
skim(right_df)

In [8]:
left_df.head()

Unnamed: 0,entity_id,name,address,city,state,postal_code,categories
0,1,the ups store,87 grasso plaza shopping center,affton,mo,63123,shipping centers local services notaries mailb...
1,2,st honore pastries,935 race st,philadelphia,pa,19107,restaurants food bubble tea coffee tea bakeries
2,3,perkiomen valley brewery,101 walnut st,green lane,pa,18054,brewpubs breweries food
3,4,sonic drivein,615 s main st,ashland city,tn,37015,burgers fast food sandwiches food ice cream f...
4,5,famous footwear,8522 eager road dierbergs brentwood point,brentwood,mo,63144,sporting goods fashion shoe stores shopping sp...


In [9]:
right_df.head()

Unnamed: 0,business_id,name,address,city,state,zip_code,size
0,1,sourini painting inc,12800 44th st n,clearwater,fl,33762,110
1,2,wolff dolla bill llc,1905 e 19th ave,tampa,fl,33605,80
2,3,comprehensive surgery center llc,1988 gulf to bay blvd ste 1,clearwater,fl,33765,80
3,4,frank adam apparel llc,13640 wright cir,tampa,fl,33626,120
4,5,moreno plus transport inc,8608 huron court unite 58,tampa,fl,33614,80


# Step2: Blocking

## Create Combined Blocking Key

In [10]:
# Combine state, first letter of address, first letter of name, 
# first 3 letters of postal_code/zip_code to create a blocking key

left_df['block_key'] = (left_df['state'] + " " +
                        left_df['address'].str[0] + " " +  # First letter of address
                        left_df['name'].str[0] + " " +  # First letter of name
                        left_df['postal_code'].str[:3])  # First 3 letters of postal_code

right_df['block_key'] = (right_df['state'] + " " +
                         right_df['address'].str[0] + " " +  # First letter of address
                         right_df['name'].str[0] + " " +  # First letter of name
                         right_df['zip_code'].str[:3])  # First 3 letters of postal_code

In [11]:
left_df.head()

Unnamed: 0,entity_id,name,address,city,state,postal_code,categories,block_key
0,1,the ups store,87 grasso plaza shopping center,affton,mo,63123,shipping centers local services notaries mailb...,mo 8 t 631
1,2,st honore pastries,935 race st,philadelphia,pa,19107,restaurants food bubble tea coffee tea bakeries,pa 9 s 191
2,3,perkiomen valley brewery,101 walnut st,green lane,pa,18054,brewpubs breweries food,pa 1 p 180
3,4,sonic drivein,615 s main st,ashland city,tn,37015,burgers fast food sandwiches food ice cream f...,tn 6 s 370
4,5,famous footwear,8522 eager road dierbergs brentwood point,brentwood,mo,63144,sporting goods fashion shoe stores shopping sp...,mo 8 f 631


In [12]:
right_df.head()

Unnamed: 0,business_id,name,address,city,state,zip_code,size,block_key
0,1,sourini painting inc,12800 44th st n,clearwater,fl,33762,110,fl 1 s 337
1,2,wolff dolla bill llc,1905 e 19th ave,tampa,fl,33605,80,fl 1 w 336
2,3,comprehensive surgery center llc,1988 gulf to bay blvd ste 1,clearwater,fl,33765,80,fl 1 c 337
3,4,frank adam apparel llc,13640 wright cir,tampa,fl,33626,120,fl 1 f 336
4,5,moreno plus transport inc,8608 huron court unite 58,tampa,fl,33614,80,fl 8 m 336


## Reduce the Dataset Size by Blocking
Using the blocking key, merge the two datasets. This will significantly reduce the number of rows to compare.

In [13]:
# Perform an inner join on the blocking key
blocked_pairs = pd.merge(left_df, right_df, on='block_key', suffixes=('_left', '_right'))

In [14]:
blocked_pairs.head()

Unnamed: 0,entity_id,name_left,address_left,city_left,state_left,postal_code,categories,block_key,business_id,name_right,address_right,city_right,state_right,zip_code,size
0,1,the ups store,87 grasso plaza shopping center,affton,mo,63123,shipping centers local services notaries mailb...,mo 8 t 631,36646,tie consulting llc,8025 bonhomme ave 807,saint louis,mo,63105,30
1,1,the ups store,87 grasso plaza shopping center,affton,mo,63123,shipping centers local services notaries mailb...,mo 8 t 631,37027,the art of entertaining inc,8796 big bend blvd,saint louis,mo,63119,250
2,1,the ups store,87 grasso plaza shopping center,affton,mo,63123,shipping centers local services notaries mailb...,mo 8 t 631,37156,the kohn partnership llp,8251 maryland ave suite 108,saint louis,mo,63105,90
3,1,the ups store,87 grasso plaza shopping center,affton,mo,63123,shipping centers local services notaries mailb...,mo 8 t 631,37159,the schechter law firm pc,8000 maryland ave ste 950,saint louis,mo,63105,90
4,1,the ups store,87 grasso plaza shopping center,affton,mo,63123,shipping centers local services notaries mailb...,mo 8 t 631,37162,the schechter law firm pc,8000 maryland suite 950,st louis,mo,63105,90


# Step 3: Apply fnmatch for Matching and textdistance for Similarity Score


In [16]:
import fnmatch
import textdistance

# Define a function that uses both fnmatch and textdistance to calculate a confidence score
def calculate_confidence(name_left, name_right):
    # Check for exact match using fnmatch
    if name_left == name_right:
        return 1.0
    # If names are similar according to fnmatch
    elif fnmatch.fnmatch(name_left, f"*{name_right}*") or fnmatch.fnmatch(name_right, f"*{name_left}*"):
        # Use Jaro-Winkler similarity
        # It's a good choice after an fnmatch wildcard match because it can handle small discrepancies well.
        similarity = textdistance.jaro_winkler(name_left, name_right)
        # Boost the confidence score by a factor, since fnmatch has found a wildcard match
        # The reason for the boost is to differentiate between cases where fnmatch has already found 
        # some level of similarity (this case) and those where we're solely relying on the similarity score from textdistance.
        # The boost of 1.2 is a just a guess
        return min(similarity * 1.2, 1.0)
    else:
        # Uses Jaccard index for a similarity score if we did not find a wildcard match
        # It's a measure of how many elements two sets have in common.
        similarity = textdistance.jaccard(name_left, name_right)
        return similarity

# Apply the function to each row of the blocked_pairs dataframe
blocked_pairs['confidence_score'] = blocked_pairs.apply(
    lambda row: calculate_confidence(row['name_left'].lower(), row['name_right'].lower()), axis=1
)

# Filter out the matches with a confidence score above the threshold
high_confidence_matches = blocked_pairs[blocked_pairs['confidence_score'] > 0.8]
high_confidence_matches

Unnamed: 0,entity_id,name_left,address_left,city_left,state_left,postal_code,categories,block_key,business_id,name_right,address_right,city_right,state_right,zip_code,size,confidence_score
251,8,sonic drivein,2312 dickerson pike,nashville,tn,37207,ice cream frozen yogurt fast food burgers res...,tn 2 s 372,81912,sonic drivein nashville melrose llc,2501 franklin road,nashville,tn,37204,240,1.000
293,8,sonic drivein,2312 dickerson pike,nashville,tn,37207,ice cream frozen yogurt fast food burgers res...,tn 2 s 372,84021,sonic drivein nashville dickerson road llc,2312 dickerson road,nashville,tn,37207,210,1.000
950,15,bap,1224 south st,philadelphia,pa,19147,korean restaurants,pa 1 b 191,57146,bible way baptist church,1323 n 52nd street,philadelphia,pa,19131,110,0.885
3561,57,james dant,5624 e washington st,indianapolis,in,46219,fashion shopping mens clothing,in 5 j 462,32737,james dant llc,5624 e washington st,indianapolis,in,46219,60,1.000
3707,60,tko djs,2650 s big bend,maplewood,mo,63143,karaoke djs event planning services party ev...,mo 2 t 631,39159,tko djs inc,2650 s big bend blvd,st louis,mo,63143,250,1.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6837734,94475,rudys sports bar,11100 66th st,largo,fl,33773,restaurants pizza nightlife sports bars bars a...,fl 1 r 337,166,rudys sports bar llc,11100 66th st,largo,fl,33773,180,1.000
6837742,94475,rudys sports bar,11100 66th st,largo,fl,33773,restaurants pizza nightlife sports bars bars a...,fl 1 r 337,2373,rudys sports bar llc,11100 66th st,largo,fl,33773,180,1.000
6842155,94539,sage mediterranean,150 bridge st,phoenixville,pa,19460,restaurants mediterranean,pa 1 s 194,79363,sage mediterranean cuisine inc,150 bridge st,phoenixville,pa,19460,40,1.000
6842431,94546,plantation pub,8321 sawyer brown rd,bellevue,tn,37221,sports bars nightlife pubs bars,tn 8 p 372,83310,plantation pub inc,8321 sawyer brown rd,nashville,tn,37221,300,1.000


# Step 4: Write the Result to a CSV

In [25]:
high_confidence_matches = high_confidence_matches.rename(columns={'entity_id': 'left_dataset', 'business_id': 'right_dataset'})

In [26]:
# Prepare the final matches output with selected columns
output_columns = ['left_dataset', 'right_dataset', 'confidence_score']

In [28]:
high_confidence_matches[output_columns]

Unnamed: 0,left_dataset,right_dataset,confidence_score
251,8,81912,1.000
293,8,84021,1.000
950,15,57146,0.885
3561,57,32737,1.000
3707,60,39159,1.000
...,...,...,...
6837734,94475,166,1.000
6837742,94475,2373,1.000
6842155,94539,79363,1.000
6842431,94546,83310,1.000


In [22]:
high_confidence_matches[output_columns].to_csv('high_confidence_matches_fnmatch_textdistance.csv', index=False)