## Comparing strings

### The cutoff point

In [10]:
# Import process from fuzzywuzzy
import pandas as pd
from fuzzywuzzy import process

In [15]:
# Load both tables in one pass: the dirty restaurant records first, then
# the second restaurant table they are compared against later on.
restaurants, restaurants_new = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./dataset/restaurants_L2_dirty.csv",
        "./dataset/restaurants_L2.csv",
    )
)

In [16]:
# Store the unique values of type in unique_types
unique_types = restaurants['type'].unique()

# Calculate the similarity of each reference cuisine ('asian', 'american',
# 'italian') to all values of type. limit is set to the number of unique
# types so the full ranking is printed rather than only the top few matches;
# the printed scores are what the similarity cutoff is chosen from.
for reference_cuisine in ('asian', 'american', 'italian'):
    print(process.extract(reference_cuisine, unique_types, limit=len(unique_types)))

[('asian', 100), ('indonesian', 72), ('italian', 67), ('russian', 67), ('american', 62), ('californian', 54), ('japanese', 54), ('mexican/tex-mex', 54), ('american ( new )', 54), ('mexican', 50), ('cajun/creole', 36), ('middle eastern', 36), ('vietnamese', 36), ('pacific new wave', 36), ('fast food', 36), ('chicken', 33), ('hamburgers', 27), ('hot dogs', 26), ('coffeebar', 26), ('continental', 26), ('steakhouses', 25), ('southern/soul', 22), ('delis', 20), ('eclectic', 20), ('pizza', 20), ('health food', 19), ('diners', 18), ('coffee shops', 18), ('noodle shops', 18), ('french ( new )', 18), ('desserts', 18), ('seafood', 17), ('chinese', 17)]
[('american', 100), ('american ( new )', 90), ('mexican', 80), ('mexican/tex-mex', 68), ('asian', 62), ('italian', 53), ('russian', 53), ('middle eastern', 51), ('pacific new wave', 45), ('hamburgers', 44), ('indonesian', 44), ('chicken', 40), ('southern/soul', 39), ('japanese', 38), ('eclectic', 38), ('delis', 36), ('pizza', 36), ('cajun/creole',

### Remapping categories II

In [17]:
# Walk through the 'type' column of restaurants_new, treating each value as
# the correct cuisine label.
# NOTE(review): the column is iterated row by row, so a label that occurs on
# several rows is processed several times — confirm whether iterating over
# the unique labels was intended.
for cuisine in restaurants_new['type']:
    # Score this label against every entry of the dirty 'type' column
    dirty_column = restaurants['type']
    matches = process.extract(cuisine, dirty_column, limit=len(dirty_column))

    for found in matches:
        # found[0] is the matched string, found[1] its similarity score;
        # anything scoring below 80 is left untouched
        if found[1] < 80:
            continue
        # Overwrite every row that currently holds the matched spelling
        restaurants.loc[restaurants['type'] == found[0], 'type'] = cuisine

# Show the remaining distinct types to confirm the mapping
print(restaurants['type'].unique())

['american' 'californian' 'japanese' 'cajun' 'hot dogs' 'diners' 'delis'
 'hamburgers' 'seafood' 'italian' 'coffee shops' 'russian' 'steakhouses'
 'noodle shops' 'middle eastern' 'asian' 'vietnamese' 'health food'
 'pacific new wave' 'indonesian' 'eclectic' 'chicken' 'fast food'
 'southern' 'coffeebar' 'continental' 'french ( new )' 'desserts'
 'chinese' 'pizza']


## Generating pairs

### Pairs of restaurants

In [19]:
import recordlinkage

In [22]:
# Create an indexer object that will be used to find possible record pairs
indexer = recordlinkage.Index()

# Block pairing on type
# NOTE(review): blocking presumably restricts candidate pairs to rows whose
# 'type' values agree — confirm against the recordlinkage documentation.
indexer.block('type')

# Generate the candidate pairs between restaurants and restaurants_new
pairs = indexer.index(restaurants, restaurants_new)

### Similar restaurants

In [25]:
# Create a comparison object; the column comparisons are added to it afterwards
comp_cl = recordlinkage.Compare()

In [None]:
# Find exact matches on city, type: each call compares the same-named column
# on both sides; `label` presumably names the resulting comparison column
# (confirm against the recordlinkage documentation).
# NOTE(review): the candidate pairs were blocked on 'type' when they were
# generated, so the 'type' comparison may be constant across pairs — confirm
# this is intended.
comp_cl.exact('city', 'city', label='city')
comp_cl.exact('type', 'type', label='type')