# About

This notebook is to sketch cleaning city names. To be deleted after related functions are incorporated into `preprocess.py` .

In [43]:
#pip install fuzzywuzzy

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0
Note: you may need to restart the kernel to use updated packages.


In [1]:
import json
import pandas as pd
from fuzzywuzzy import process



# Load restaurant data in the top 5 city

In [2]:
restaurant_top5 = pd.read_json('restaurants_top5.json')

In [3]:
restaurant_top5.shape

(15478, 14)

In [4]:
restaurant_top5.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
1,tCbdrRPZA0oiIYSmHG3J0w,Flying Elephants at PDX,7000 NE Airport Way,PORTLAND,OR,97218,45.588906,-122.593331,4.0,126,1,"{'RestaurantsTakeOut': 'True', 'RestaurantsAtt...","Salad, Soup, Sandwiches, Delis, Restaurants, C...","{'Monday': '5:0-18:0', 'Tuesday': '5:0-17:0', ..."
13,ufCxltuh56FF4-ZFZ6cVhg,Sister Honey's,247 E Michigan St,ORLANDO,FL,32806,28.513265,-81.374707,4.5,135,1,"{'BusinessParking': '{'garage': False, 'street...","Restaurants, American (New), Bakeries, Dessert...","{'Tuesday': '11:0-18:0', 'Wednesday': '11:0-18..."
29,jGennaZUr2MsJyRhijNBfA,Legal Sea Foods,1 Harborside Dr,BOSTON,MA,2128,42.363442,-71.025781,3.5,856,1,"{'NoiseLevel': 'u'average'', 'BikeParking': 'F...","Sandwiches, Food, Restaurants, Breakfast & Bru...","{'Monday': '6:0-21:0', 'Tuesday': '6:0-21:0', ..."
35,iPD8BBvea6YldQZPHzVrSQ,Espresso Minute,334 Mass Ave,BOSTON,MA,2115,42.342673,-71.084239,4.5,7,0,"{'NoiseLevel': ''quiet'', 'GoodForKids': 'True...","Creperies, Restaurants, Food, Coffee & Tea, Br...","{'Tuesday': '8:0-20:0', 'Wednesday': '8:0-20:0..."
36,jx91IMdGOmLOo8h_F9z39g,Cleary's Restaurant & Spirits,12429 NE Glisan St,PORTLAND,OR,97230,45.526473,-122.535323,3.5,19,1,"{'RestaurantsGoodForGroups': 'True', 'Alcohol'...","Nightlife, Sandwiches, Seafood, Restaurants","{'Monday': '7:0-23:0', 'Tuesday': '7:0-23:0', ..."


# Check  and clean *states*

In [5]:
states_all_sorted = restaurant_top5.groupby("state", as_index = False).agg({'business_id': 'count'}).sort_values(by = "business_id", ascending = False)
states_all_sorted

Unnamed: 0,state,business_id
4,OR,3759
1,GA,3319
5,TX,3200
0,FL,2787
2,MA,2411
3,MN,1
6,WA,1


In [6]:
states_top5 = list(states_all_sorted.iloc[0:5,0])
states_top5

['OR', 'GA', 'TX', 'FL', 'MA']

In [7]:
# clean up states
restaurant_clean = restaurant_top5[restaurant_top5['state'].isin(states_top5)]

In [8]:
restaurant_clean.shape

(15476, 14)

In [37]:
# see the outliers
restaurant_top5[restaurant_top5['state'].isin(['MN', 'WA'])]

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
84075,HorLV50SgSj2lArWWu4lEA,Caribou Coffee,1001 18th Ave NW,AUSTIN,MN,55912,30.20213,-97.668832,3.5,6,1,"{'RestaurantsPriceRange2': '1', 'BusinessParki...","Food, Coffee & Tea, Bakeries, Cafes, Restaurants","{'Monday': '5:30-20:0', 'Tuesday': '5:30-20:0'..."
113980,6PLFPRB3Tj560oAJLq5TKg,IJ SUSHI BURRITO,"9585 SW Washington Square Rd, Space FC08",PORTLAND,WA,97233,45.450352,-122.780373,4.5,9,1,"{'Caters': 'True', 'RestaurantsDelivery': 'Tru...","Sushi Bars, Restaurants, Hawaiian, Asian Fusio...","{'Monday': '11:0-19:0', 'Tuesday': '11:0-19:0'..."


# Collapsing city name

In [36]:
restaurant_clean['city'].value_counts()

PORTLAND                     3757
ATLANTA                      3318
AUSTIN                       3200
ORLANDO                      2786
BOSTON                       2365
EAST BOSTON                    30
SOUTH BOSTON                   14
ATLANTA  (BKHD)                 1
ALTAMONTE SPRINGS ORLANDO       1
PORTLAND-GATEWAY PLAZA          1
BOSTON-FENWAY                   1
S BOSTON                        1
PORTLAND-EASTPORT PLAZA         1
Name: city, dtype: int64

## Approach 1: fuzzywuzzy

### 01. Determine cut-off value

In [10]:
# store unique values of city names
city_values = restaurant_clean['city'].unique()
print(city_values)

['PORTLAND' 'ORLANDO' 'BOSTON' 'AUSTIN' 'ATLANTA' 'EAST BOSTON'
 'ATLANTA  (BKHD)' 'SOUTH BOSTON' 'ALTAMONTE SPRINGS ORLANDO'
 'PORTLAND-GATEWAY PLAZA' 'BOSTON-FENWAY' 'S BOSTON'
 'PORTLAND-EASTPORT PLAZA']


In [11]:
# 1 - calculate similarities to top city
top_city = 'PORTLAND'
print(process.extract(top_city, city_values, limit = len(city_values)))

[('PORTLAND', 100), ('PORTLAND-GATEWAY PLAZA', 90), ('PORTLAND-EASTPORT PLAZA', 90), ('ORLANDO', 80), ('ALTAMONTE SPRINGS ORLANDO', 68), ('ATLANTA', 53), ('ATLANTA  (BKHD)', 45), ('BOSTON', 43), ('S BOSTON', 38), ('SOUTH BOSTON', 36), ('BOSTON-FENWAY', 34), ('EAST BOSTON', 32), ('AUSTIN', 29)]


In [12]:
# 2
top_city = 'ATLANTA'
print(process.extract(top_city, city_values, limit = len(city_values)))

[('ATLANTA', 100), ('ATLANTA  (BKHD)', 90), ('PORTLAND', 53), ('PORTLAND-GATEWAY PLAZA', 53), ('PORTLAND-EASTPORT PLAZA', 53), ('ALTAMONTE SPRINGS ORLANDO', 51), ('AUSTIN', 46), ('ORLANDO', 43), ('EAST BOSTON', 34), ('BOSTON', 31), ('SOUTH BOSTON', 30), ('S BOSTON', 27), ('BOSTON-FENWAY', 20)]


In [13]:
# 3
top_city = 'AUSTIN'
print(process.extract(top_city, city_values, limit = len(city_values)))

[('AUSTIN', 100), ('BOSTON', 50), ('EAST BOSTON', 47), ('ATLANTA', 46), ('ATLANTA  (BKHD)', 45), ('SOUTH BOSTON', 45), ('ALTAMONTE SPRINGS ORLANDO', 45), ('BOSTON-FENWAY', 45), ('PORTLAND-EASTPORT PLAZA', 45), ('S BOSTON', 43), ('ORLANDO', 31), ('PORTLAND-GATEWAY PLAZA', 30), ('PORTLAND', 29)]


In [14]:
# 4
top_city = 'ORLANDO'
print(process.extract(top_city, city_values, limit = len(city_values)))

[('ORLANDO', 100), ('ALTAMONTE SPRINGS ORLANDO', 90), ('PORTLAND', 80), ('PORTLAND-GATEWAY PLAZA', 77), ('PORTLAND-EASTPORT PLAZA', 77), ('ATLANTA', 43), ('ATLANTA  (BKHD)', 39), ('SOUTH BOSTON', 37), ('BOSTON', 31), ('AUSTIN', 31), ('EAST BOSTON', 30), ('S BOSTON', 27), ('BOSTON-FENWAY', 26)]


In [15]:
# 5
top_city = 'BOSTON'
print(process.extract(top_city, city_values, limit = len(city_values)))

[('BOSTON', 100), ('S BOSTON', 95), ('EAST BOSTON', 90), ('SOUTH BOSTON', 90), ('BOSTON-FENWAY', 90), ('AUSTIN', 50), ('ALTAMONTE SPRINGS ORLANDO', 45), ('PORTLAND-GATEWAY PLAZA', 45), ('PORTLAND-EASTPORT PLAZA', 45), ('PORTLAND', 43), ('ORLANDO', 31), ('ATLANTA', 31), ('ATLANTA  (BKHD)', 18)]


the **cut-off** for similarity score seem to be > 80, for example cut-off = 85

### 02. Iterate

In [None]:
# Create list of top 5 city

In [23]:
city_list = list(restaurant_clean.groupby(["state", "city"], as_index = False) 
                 .agg({'business_id': 'count'}).sort_values(by = 'business_id', ascending = False).head(5)['city'])
city_list

['PORTLAND', 'ATLANTA', 'AUSTIN', 'ORLANDO', 'BOSTON']

In [24]:
# Iterate through the city_list
for city_name in city_list:  
  # Create a list of matches, comparing city_name with the 'city' column
  matches = process.extract(city_name, restaurant_clean['city'], limit=len(restaurant_clean.city))
    
  # Iterate through the list of matches
  for match in matches:
     # Check whether the similarity score is greater than or equal to 85
    if match[1] >= 85:
      # If it is, select all rows where the city is spelled this way, and set them to the correct city_name
      restaurant_clean.loc[restaurant_clean['city'] == match[0]] = city_name

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [25]:
# confirm remapping
restaurant_clean['city'].value_counts()

PORTLAND    3759
ATLANTA     3319
AUSTIN      3200
ORLANDO     2787
BOSTON      2411
Name: city, dtype: int64

## Approach 2: mapping

In [38]:
mapping = {'PORTLAND': 'PORTLAND',
           'ATLANTA': 'ATLANTA',
           'AUSTIN' : 'AUSTIN',
           'ORLANDO' : 'ORLANDO',
           'BOSTON' : 'BOSTON',
           'EAST BOSTON': 'BOSTON',
           'SOUTH BOSTON': 'BOSTON',
           'ATLANTA  (BKHD)' : 'ATLANTA',
           'ALTAMONTE SPRINGS ORLANDO': 'ORLANDO',
           'PORTLAND-GATEWAY PLAZA' : 'PORTLAND',
           'BOSTON-FENWAY' : 'BOSTON',
           'S BOSTON' : 'BOSTON',
           'PORTLAND-EASTPORT PLAZA' : 'PORTLAND'
          }

In [40]:
restaurant_clean['city'] = restaurant_clean['city'].replace(mapping)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [41]:
restaurant_clean['city'].unique()

array(['PORTLAND', 'ORLANDO', 'BOSTON', 'AUSTIN', 'ATLANTA'], dtype=object)

In [42]:
restaurant_clean['city'].value_counts()

PORTLAND    3759
ATLANTA     3319
AUSTIN      3200
ORLANDO     2787
BOSTON      2411
Name: city, dtype: int64