In [2]:
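# One-time setup: uncomment the next line to install the record-linkage dependency into the kernel's environment.
# %pip install recordlinkage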

In [1]:
import pandas as pd
# Load the two business listings that will be linked. Paths are relative to
# the notebook's working directory.
left_df, right_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/left_dataset.csv', '../data/right_dataset.csv')
)

In [4]:
# Print the per-column null counts of both inputs before any cleaning, so the
# columns that need special handling are visible up front.
for heading, frame in (
    ("Missing values in left_df:", left_df),
    ("Missing values in right_df:", right_df),
):
    print(heading)
    print(frame.isnull().sum())

Missing values in left_df:
entity_id         0
name              0
address        2798
city              0
state             0
postal_code      37
categories       62
dtype: int64
Missing values in right_df:
business_id    0
name           0
address        0
city           0
state          0
zip_code       0
size           0
dtype: int64


In [5]:
# skimpy is a third-party package used here only for exploration; nothing
# later in the visible notebook depends on it.
from skimpy import skim
# Presumably renders a summary overview of left_df's columns — the output is
# not saved in the notebook, so re-run to see it.
skim(left_df)

In [6]:
# Same exploratory overview for the right-hand dataset (output not saved).
skim(right_df)

In [3]:
import pandas as pd
import recordlinkage
from recordlinkage.preprocessing import clean

In [4]:
# Normalise the free-text fields of both datasets with recordlinkage's
# `clean`, so that string comparison is not thrown off by formatting noise
# (the previews further down show lower-cased text without punctuation).
# Each column is overwritten in place; left_df is processed before right_df.
for frame in (left_df, right_df):
    for column in ('name', 'address', 'city', 'state'):
        frame[column] = clean(frame[column])

In [5]:
def _normalize_postal_code(codes):
    """Return postal codes as strings that are comparable across datasets.

    Parameters
    ----------
    codes : pandas.Series
        Raw postal codes. May be numeric (e.g. 33762.0 when the CSV column
        contained blanks and was parsed as float) or text (e.g. '33762-1234').

    Returns
    -------
    pandas.Series
        Same index as ``codes``. Each value has any '.0' float artefact and
        any ZIP+4 suffix removed; purely numeric codes are left-padded with
        zeros to 5 digits. Missing values stay missing (NaN) instead of
        becoming the literal string 'nan'.
    """
    missing = codes.isna()
    text = codes.astype(str).str.strip()
    # '33762.0' -> '33762' (float artefact), '33762-1234' -> '33762' (ZIP+4).
    text = text.str.split('.').str[0].str.split('-').str[0]
    # Pad numeric codes only, so a ZIP that lost its leading zero ('8054')
    # becomes '08054' while non-numeric codes are left exactly as they are.
    is_numeric = text.str.isdigit().astype(bool)
    text = text.where(~is_numeric, text.str.zfill(5))
    # astype(str) turned NaN into 'nan'; restore the missing marker so the
    # fillna below can label it consistently with the other columns.
    return text.mask(missing)


# Rename first: rename() ignores keys that are absent, so this cell can be
# re-run after 'zip_code' has already become 'postal_code'.
right_df = right_df.rename(columns={'zip_code': 'postal_code'})

# Apply the same normalisation to both sides. The blocking step compares
# these strings for exact equality, so both must share one format.
left_df['postal_code'] = _normalize_postal_code(left_df['postal_code'])
right_df['postal_code'] = _normalize_postal_code(right_df['postal_code'])

# Label the remaining gaps in left_df (address, postal_code, categories per
# the null-count report) with a sentinel that cannot equal a real value.
left_df = left_df.fillna('unknown')

In [9]:
# Preview left_df after cleaning and postal-code normalisation.
# NOTE(review): the output saved under this cell lists business_id and size,
# which the null-count report attributes to right_df, and it is identical to
# the right_df preview. It looks stale — re-run the cell to refresh it.
left_df.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,size
0,1,sourini painting inc,12800 44th st n,clearwater,fl,33762,11.0
1,2,wolff dolla bill llc,1905 e 19th ave,tampa,fl,33605,8.0
2,3,comprehensive surgery center llc,1988 gulf to bay blvd ste 1,clearwater,fl,33765,8.0
3,4,frank adam apparel llc,13640 wright cir,tampa,fl,33626,12.0
4,5,moreno plus transport inc,8608 huron court unite 58,tampa,fl,33614,8.0


In [10]:
# Preview right_df after cleaning; 'zip_code' now appears as 'postal_code'.
right_df.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,size
0,1,sourini painting inc,12800 44th st n,clearwater,fl,33762,11.0
1,2,wolff dolla bill llc,1905 e 19th ave,tampa,fl,33605,8.0
2,3,comprehensive surgery center llc,1988 gulf to bay blvd ste 1,clearwater,fl,33765,8.0
3,4,frank adam apparel llc,13640 wright cir,tampa,fl,33626,12.0
4,5,moreno plus transport inc,8608 huron court unite 58,tampa,fl,33614,8.0


In [16]:
# Import the recordlinkage module
import recordlinkage

# Create an indexer object
indexer = recordlinkage.Index()

# Add a blocking step: only compare records with the same postal_code.
# Both frames must therefore hold 'postal_code' as identically formatted
# values; pairs whose codes differ in any character are never generated.
indexer.block('postal_code')

# Generate the candidate pairs to compare. The later cells treat the result
# as a two-level index: level 0 is the left_df row label, level 1 the right_df one.
candidate_links = indexer.index(left_df, right_df)

In [18]:
# Import the Compare class from the recordlinkage module
from recordlinkage import Compare

# Score every candidate pair on the four text fields. All fields use the same
# rule: Jaro-Winkler similarity with a 0.85 threshold, which (as the features
# preview shows) yields a 1.0 / 0.0 flag per field rather than a raw score.
compare_cl = Compare()
for field in ('name', 'address', 'city', 'state'):
    compare_cl.string(field, field, method='jarowinkler', threshold=0.85)

# One row per candidate pair, one column per comparison added above
# (columns are numbered 0-3 in the order the comparisons were registered).
features = compare_cl.compute(candidate_links, left_df, right_df)

In [20]:
# Preview the comparison vectors: index = (left label, right label),
# columns 0-3 = name, address, city, state flags in registration order.
features.head()

Unnamed: 0,Unnamed: 1,0,1,2,3
0,36550,0.0,0.0,0.0,1.0
0,36587,0.0,0.0,0.0,1.0
0,36604,0.0,0.0,0.0,1.0
0,36637,0.0,0.0,0.0,1.0
0,36638,0.0,0.0,0.0,1.0


In [21]:
# Overall score of a pair = mean of its per-field flags.
scores = features.mean(axis=1)

# Keep the pair labels as ordinary columns next to the score so they can be
# used as merge keys; the (left, right) MultiIndex is kept as the row index.
matches_df = pd.DataFrame(
    {
        'left_index': features.index.get_level_values(0),
        'right_index': features.index.get_level_values(1),
        'score': scores,
    }
)

# Keep pairs scoring at least 0.8. With four 0/1 flags the mean moves in
# steps of 0.25, so in practice this keeps pairs where all four fields agree.
high_confidence_matches = matches_df.loc[matches_df['score'] >= 0.8]

# Attach the full left and right records to every retained pair. Columns
# present in both datasets get the '_left' / '_right' suffixes on the second
# merge; the first merge has no overlapping column names.
result = (
    high_confidence_matches
    .merge(left_df, left_on='left_index', right_index=True)
    .merge(right_df, left_on='right_index', right_index=True, suffixes=('_left', '_right'))
)

In [22]:
# Preview the linked records: one row per high-confidence pair, with the left
# and right attributes side by side for manual inspection.
result.head()

Unnamed: 0,Unnamed: 1,left_index,right_index,score,entity_id,name_left,address_left,city_left,state_left,postal_code_left,categories,business_id,name_right,address_right,city_right,state_right,postal_code_right,size
7,82926,7,82926,1.0,8,sonic drive in,2312 dickerson pike,nashville,tn,37207,"Ice Cream & Frozen Yogurt, Fast Food, Burgers,...",82927,sonic drive in nashville dickerson road 2 llc,3904 dickerson pike,nashville,tn,37207,27.0
7,84020,7,84020,1.0,8,sonic drive in,2312 dickerson pike,nashville,tn,37207,"Ice Cream & Frozen Yogurt, Fast Food, Burgers,...",84021,sonic drive in nashville dickerson road llc,2312 dickerson road,nashville,tn,37207,21.0
56,32736,56,32736,1.0,57,james dant,5624 e washington st,indianapolis,in,46219,"Fashion, Shopping, Men's Clothing",32737,james dant llc,5624 e washington st,indianapolis,in,46219,6.0
66,77382,66,77382,1.0,67,its sold here,94 york rd,willow grove,pa,19090,"Shopping, Auction Houses, Active Life",77383,its sold here,94 york road,willow grove,pa,19090,3.0
76,28231,76,28231,1.0,77,cooks glass mirror,5703 w morris st,indianapolis,in,46241,"Home Services, Glass & Mirrors, Door Sales/Ins...",28232,cooks glass amp mirror inc,5703 w morris st,indianapolis,in,46241,10.0
