### TheFuzz

### Preparation

In [19]:
!pip install thefuzz



In [18]:
# Prepare packages
from thefuzz import fuzz, process
from multiprocessing import Pool
import pandas as pd
import numpy as np
import re

In [19]:
# Read the two source tables that will be matched against each other
left_df, right_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/left_dataset.csv', '../data/right_dataset.csv')
)

In [20]:
# Count the missing values per column in each table before any cleaning is done
for heading, frame in (
    ("Missing values in left_df:", left_df),
    ("Missing values in right_df:", right_df),
):
    print(heading)
    print(frame.isnull().sum())

Missing values in left_df:
entity_id         0
name              0
address        2798
city              0
state             0
postal_code      37
categories       62
dtype: int64
Missing values in right_df:
business_id    0
name           0
address        0
city           0
state          0
zip_code       0
size           0
dtype: int64


#### Data Cleaning

In [21]:
# Impute missing values in left_df
left_df['address'] = left_df['address'].fillna('')
# Cast to text first, then overwrite the rows that were originally missing.
# Calling fillna *after* astype(str) does nothing: the cast has already turned NaN
# into the literal string 'nan', so there is no null left for fillna to replace.
left_df['postal_code'] = left_df['postal_code'].astype(str).mask(left_df['postal_code'].isna(), '00000')

# Verify that missing values have been addressed
left_df.isnull().sum()

entity_id       0
name            0
address         0
city            0
state           0
postal_code     0
categories     62
dtype: int64

In [22]:
# Normalise both zip columns to the same zero-padded five-character text form.
# Left: drop a trailing '.0' (codes that were read as floats), map the 'nan' text that
# astype(str) produces for missing values to the '00000' placeholder, and zero-pad so a
# code that lost its leading zero lines up with the right-hand table.
left_df['postal_code'] = (
    left_df['postal_code']
    .astype(str)
    .apply(lambda x: x.split('.')[0])
    .replace('nan', '00000')
    .str.zfill(5)
)
# Right: keep the part before any '-' (ZIP+4 style suffix) and zero-pad to five characters.
right_df['zip_code'] = right_df['zip_code'].astype(str).apply(lambda x: x.split('-')[0].zfill(5))

In [23]:
# Text normalisation for the remaining matching columns
def preprocess_data(df):
    """Normalise the ``state``, ``name``, ``address`` and ``city`` columns of ``df``.

    ``state`` is cast to ``str`` and trimmed. The other three columns are cast to
    ``str``, lower-cased, trimmed, and stripped of every character that is neither a
    word character nor whitespace.

    The columns are overwritten on ``df`` itself, and that same object is returned.
    """
    def _normalise(column):
        lowered = column.astype(str).str.lower().str.strip()
        return lowered.map(lambda text: re.sub(r'[^\w\s]', '', text))

    df['state'] = df['state'].astype(str).str.strip()
    for field in ('name', 'address', 'city'):
        df[field] = _normalise(df[field])
    return df

# Run the same text normalisation over both tables
left_df, right_df = (preprocess_data(frame) for frame in (left_df, right_df))

In [24]:
# Preview the cleaned left table
left_df.head(n=10)

Unnamed: 0,entity_id,name,address,city,state,postal_code,categories
0,1,the ups store,87 grasso plaza shopping center,affton,MO,63123,"Shipping Centers, Local Services, Notaries, Ma..."
1,2,st honore pastries,935 race st,philadelphia,PA,19107,"Restaurants, Food, Bubble Tea, Coffee & Tea, B..."
2,3,perkiomen valley brewery,101 walnut st,green lane,PA,18054,"Brewpubs, Breweries, Food"
3,4,sonic drivein,615 s main st,ashland city,TN,37015,"Burgers, Fast Food, Sandwiches, Food, Ice Crea..."
4,5,famous footwear,8522 eager road dierbergs brentwood point,brentwood,MO,63144,"Sporting Goods, Fashion, Shoe Stores, Shopping..."
5,6,temple bethel,400 pasadena ave s,st petersburg,FL,33707,"Synagogues, Religious Organizations"
6,7,tsevis pub and grill,8025 mackenzie rd,affton,MO,63123,"Pubs, Restaurants, Italian, Bars, American (Tr..."
7,8,sonic drivein,2312 dickerson pike,nashville,TN,37207,"Ice Cream & Frozen Yogurt, Fast Food, Burgers,..."
8,9,marshalls,21705 village lakes sc dr,land o lakes,FL,34639,"Department Stores, Shopping, Fashion"
9,10,vietnamese food truck,,tampa bay,FL,33602,"Vietnamese, Food, Restaurants, Food Trucks"


In [25]:
# Preview the cleaned right table
right_df.head(n=10)

Unnamed: 0,business_id,name,address,city,state,zip_code,size
0,1,sourini painting inc,12800 44th st n,clearwater,FL,33762,11.0
1,2,wolff dolla bill llc,1905 e 19th ave,tampa,FL,33605,8.0
2,3,comprehensive surgery center llc,1988 gulf to bay blvd ste 1,clearwater,FL,33765,8.0
3,4,frank adam apparel llc,13640 wright cir,tampa,FL,33626,12.0
4,5,moreno plus transport inc,8608 huron court unite 58,tampa,FL,33614,8.0
5,6,ogster llc,140 danube ave apt 4,tampa,FL,33606,16.0
6,7,american collision center inc,11440 66th st,largo,FL,33773,13.0
7,8,maggie n davis dmd llc,3840 tampa rd,palm harbor,FL,34684,18.0
8,9,hvac email marketing llc,5002 arundell ct,tampa,FL,33624,20.0
9,10,signage of tampa inc,4400 140th ave n,clearwater,FL,33759,16.0


#### Using Package to Match

In [29]:
# For each table build:
#   combined  - the text that gets fuzzy-matched: name, address, state and zip joined by spaces
#   block_key - first character of the name + state; only rows sharing a key are compared
for frame, zip_column in ((left_df, 'postal_code'), (right_df, 'zip_code')):
    frame['combined'] = (
        frame['name'] + ' ' + frame['address'] + ' ' + frame['state'] + ' ' + frame[zip_column].astype(str)
    )
    frame['block_key'] = frame['name'].str[0] + frame['state']

# Candidate strings of the right table, grouped by block key, to reduce computation
right_dict = {
    block: texts.tolist()
    for block, texts in right_df.groupby('block_key')['combined']
}

# Matching function: best right-hand candidate for every left entry, kept only when
# its score reaches the cutoff (80 by default)
def match_entries(left_entries, right_entries, score_cutoff=80):
    """Find the best fuzzy match in ``right_entries`` for each of ``left_entries``.

    Parameters
    ----------
    left_entries : iterable of str
        Strings to look up.
    right_entries : list of str
        Candidate strings searched for every left entry.
    score_cutoff : int, default 80
        Minimum ``partial_ratio`` score a candidate needs; left entries without such
        a candidate are left out of the result.

    Returns
    -------
    list of tuple
        ``(left_entry, matched_right_entry, score)`` for every left entry that found
        a match, in the order the left entries were given.
    """
    matches = []
    for left_entry in left_entries:
        best_match = process.extractOne(
            left_entry,
            right_entries,
            scorer=fuzz.partial_ratio,
            score_cutoff=score_cutoff,
        )
        # extractOne returns None when no candidate reaches the cutoff
        if best_match is not None:
            matches.append((left_entry, best_match[0], best_match[1]))
    return matches

# Perform matching within blocks.
# One groupby pass hands over each block's rows directly, instead of re-filtering the
# whole left table with a boolean mask for every key. sort=False keeps the blocks in
# order of first appearance, which is the order .unique() produced, so `results` comes
# out in the same order as before.
results = []
for key, left_block in left_df.groupby('block_key', sort=False):
    if key in right_dict:
        block_matches = match_entries(left_block['combined'], right_dict[key])
        results.extend(block_matches)

# Number of matches found and a small sample of them
print(len(results), results[:5])

14932 [('tko djs 2650 s big bend MO 63143', 'tko djs inc 2650 s big bend blvd MO 63143', 81), ('tile source 2605 s hanley rd MO 63144', 'tile source inc 2605 s hanley rd MO 63144', 89), ('tip top nails 1485 saint louis galleria MO 63117', 'texas de brazil st louis corporation 1137 saint louis galleria MO 63117', 83), ('the selfie room 1424 washington ave ste 100 MO 63103', 'the selfie room llc 1424 washington avenue MO 63103', 86), ('twisted roots 3690 forest park ave MO 63108', 'twisted roots llc 3690 forest park ave MO 63108', 91)]


In [30]:
# Attach IDs to the matched strings and export the submission file.
#
# `results` only carries (left text, right text, score), so the IDs are recovered by
# joining on the text. A plain text -> ID dict keeps just the last ID when several rows
# share the same combined string, and an `if entity_id and business_id` test would
# discard a legitimate ID of 0; the joins below avoid both problems.
results_df = pd.DataFrame(results, columns=['left_combined', 'right_combined', 'score'])

# Pair the i-th result for a given left string with the i-th left row carrying that
# string, so rows with identical text each keep their own entity_id.
results_df['occurrence'] = results_df.groupby('left_combined').cumcount()
left_ids = (
    left_df[['combined', 'entity_id']]
    .rename(columns={'combined': 'left_combined'})
    .assign(occurrence=lambda frame: frame.groupby('left_combined').cumcount())
)

# The matched right text cannot tell apart businesses with identical text, so the
# first such business is used, deterministically.
right_ids = (
    right_df[['combined', 'business_id']]
    .drop_duplicates('combined')
    .rename(columns={'combined': 'right_combined'})
)

# Inner joins keep the order of `results`; scores are rescaled from 0-100 to 0-1
matches_df = (
    results_df
    .merge(left_ids, on=['left_combined', 'occurrence'])
    .merge(right_ids, on='right_combined')
    .assign(confidence_score=lambda frame: frame['score'] / 100)
    .rename(columns={'entity_id': 'left_dataset', 'business_id': 'right_dataset'})
    [['left_dataset', 'right_dataset', 'confidence_score']]
)

# Write to CSV
matches_df.to_csv('thefuzz_submission.csv', index=False)

In [31]:
# Display the matched ID pairs together with their confidence scores
matches_df

Unnamed: 0,left_dataset,right_dataset,confidence_score
0,60,39237,0.81
1,534,42420,0.89
2,1337,39545,0.83
3,2651,49857,0.86
4,3214,49285,0.91
...,...,...,...
14927,58215,88924,0.85
14928,16504,84508,0.84
14929,38004,84508,0.91
14930,72169,91487,0.89


In [32]:
import pandas as pd
from thefuzz import process, fuzz

def _clean_text(series):
    """Return ``series`` as lower-cased, trimmed strings with punctuation removed."""
    return (
        series.astype(str)
        .str.lower()
        .str.strip()
        .str.replace(r'[^\w\s]', '', regex=True)
    )


def _five_char_zip(series, separator):
    """Return ``series`` as zero-padded five-character zip strings.

    Only the part before ``separator`` is kept ('.' drops a float's '.0', '-' drops a
    ZIP+4 suffix). Missing values, including the text 'nan' left behind by an earlier
    string cast, become '00000'.
    """
    as_text = series.astype(str)
    missing = series.isna() | as_text.str.lower().eq('nan')
    leading_part = as_text.map(lambda value: value.split(separator)[0])
    return leading_part.mask(missing, '00000').str.zfill(5)


def thefuzz_pipeline(left_df, right_df, output_csv, score_cutoff=80):
    """Fuzzy-match the rows of ``left_df`` to ``right_df`` and write the matches to CSV.

    Steps: impute missing address / zip values, normalise the text columns, build one
    combined string per row (name, address, state, zip), restrict comparisons to rows
    sharing a blocking key (first character of the name + state), and keep for every
    left row the best ``partial_ratio`` match that reaches ``score_cutoff``.

    Parameters
    ----------
    left_df : pandas.DataFrame
        Needs the columns ``entity_id``, ``name``, ``address``, ``city``, ``state``
        and ``postal_code``.
    right_df : pandas.DataFrame
        Needs the columns ``business_id``, ``name``, ``address``, ``city``, ``state``
        and ``zip_code``.
    output_csv : str or path-like
        Destination of the CSV file (written without the index).
    score_cutoff : int, default 80
        Minimum score (0-100) a candidate needs to count as a match.

    Returns
    -------
    pandas.DataFrame
        One row per matched left row, with the columns ``left_dataset`` (entity_id),
        ``right_dataset`` (business_id) and ``confidence_score`` (score / 100).

    Notes
    -----
    The input frames are left untouched; all work happens on copies.
    """
    # Work on copies with a fresh, unique row index: the caller's frames stay intact
    # and right-hand row labels can safely be used as dictionary keys below.
    left = left_df.copy().reset_index(drop=True)
    right = right_df.copy().reset_index(drop=True)

    # Deal with the postal-code problem. Missing values are detected before the string
    # cast, because astype(str) would turn NaN into 'nan' and hide it from fillna.
    left['postal_code'] = _five_char_zip(left['postal_code'], '.')
    right['zip_code'] = _five_char_zip(right['zip_code'], '-')

    for frame, zip_column in ((left, 'postal_code'), (right, 'zip_code')):
        # Fill missing addresses before the text cast for the same reason as above
        frame['address'] = frame['address'].fillna('')
        frame['state'] = frame['state'].astype(str).str.strip()
        for column in ('name', 'address', 'city'):
            frame[column] = _clean_text(frame[column])

        # Text that is fuzzy-matched, and the key limiting which rows get compared
        frame['combined'] = (
            frame['name'] + ' ' + frame['address'] + ' ' + frame['state'] + ' ' + frame[zip_column]
        )
        frame['block_key'] = frame['name'].str[0] + frame['state']

    # Per block: right-hand row label -> combined text. With a dict of choices,
    # extractOne also returns the key of the winning choice, so the business is
    # identified by its row rather than by its (possibly duplicated) text.
    right_blocks = {
        key: block['combined'].to_dict()
        for key, block in right.groupby('block_key')
    }

    records = []
    # sort=False keeps the blocks in order of first appearance in the left table
    for key, left_block in left.groupby('block_key', sort=False):
        choices = right_blocks.get(key)
        if not choices:
            continue
        for entity_id, text in zip(left_block['entity_id'], left_block['combined']):
            best_match = process.extractOne(
                text, choices, scorer=fuzz.partial_ratio, score_cutoff=score_cutoff
            )
            # None means no candidate in this block reached the cutoff
            if best_match is None:
                continue
            _, score, right_row = best_match
            records.append({
                'left_dataset': entity_id,
                'right_dataset': right.at[right_row, 'business_id'],
                'confidence_score': score / 100,
            })

    # Explicit columns keep the header in place even when nothing matched
    thefuzz_submission = pd.DataFrame(
        records, columns=['left_dataset', 'right_dataset', 'confidence_score']
    )

    # Write results to CSV
    thefuzz_submission.to_csv(output_csv, index=False)

    return thefuzz_submission

In [33]:
# Function test: uncomment the call below to run the whole pipeline end to end
# thefuzz_pipeline(left_df, right_df, output_csv = "../data/thefuzz_submission.csv")

Unnamed: 0,left_dataset,right_dataset,confidence_score
0,60,39237,0.81
1,534,42420,0.89
2,1337,39545,0.83
3,2651,49857,0.86
4,3214,49285,0.91
...,...,...,...
14927,58215,88924,0.85
14928,16504,84508,0.84
14929,38004,84508,0.91
14930,72169,91487,0.89
