## Preparation

In [1]:
# Using SQL + Pandas
import pandas as pd
import numpy as np

# Data Visualization
from plotnine import *
import seaborn as sns
import matplotlib.pyplot as plt

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

## Clean PH_DC

In [2]:
# Read in the raw ParseHub export for Washington DC
PH_DC = pd.read_csv("Data/ParseHub/PH_DC.csv")

In [3]:
# View the first rows to check the raw column layout
PH_DC.head()

Unnamed: 0,business_name,business_url,business_phone,business_address,business_rating,business_review_count,business_price_category,business_neighborhood,business_index
0,Nespresso Boutique,https://www.yelp.com/adredir?ad_business_id=YT...,(800) 562-1465,100 S Hayes St,4.0,26,"Coffee & Tea, Coffee Roasteries, Cafes",,
1,Starbucks,https://www.yelp.com/adredir?ad_business_id=Nv...,-5139,2225 Georgia Ave NW,3.0,24,"Breakfast & Brunch, Coffee & Tea",,
2,A Baked Joint,https://www.yelp.com/biz/a-baked-joint-washing...,(202) 408-6985,430 K St NW,4.5,1918,"$Coffee & Tea, Breakfast & Brunch, Sandwiches",,1.0
3,Zaytinya,https://www.yelp.com/biz/zaytinya-washington,(202) 638-0800,701 9th St NW,4.0,4687,"$$$Greek, Turkish, Lebanese",,2.0
4,Le Diplomate,https://www.yelp.com/biz/le-diplomate-washington,(202) 332-3333,1601 14th St NW,4.5,3607,"$$$Brasseries, French, Breakfast & Brunch",Logan Circle,3.0


In [4]:
# Print the shape (rows, columns) of the raw data
PH_DC.shape

(6352, 9)

In [5]:
# Replace the verbose "business_*" headers with short column names
colnames = [
    'name', 'url', 'phone', 'address', 'rating',
    'review_count', 'price_category', 'neighborhood', 'index',
]
PH_DC.columns = colnames
PH_DC.head()

Unnamed: 0,name,url,phone,address,rating,review_count,price_category,neighborhood,index
0,Nespresso Boutique,https://www.yelp.com/adredir?ad_business_id=YT...,(800) 562-1465,100 S Hayes St,4.0,26,"Coffee & Tea, Coffee Roasteries, Cafes",,
1,Starbucks,https://www.yelp.com/adredir?ad_business_id=Nv...,-5139,2225 Georgia Ave NW,3.0,24,"Breakfast & Brunch, Coffee & Tea",,
2,A Baked Joint,https://www.yelp.com/biz/a-baked-joint-washing...,(202) 408-6985,430 K St NW,4.5,1918,"$Coffee & Tea, Breakfast & Brunch, Sandwiches",,1.0
3,Zaytinya,https://www.yelp.com/biz/zaytinya-washington,(202) 638-0800,701 9th St NW,4.0,4687,"$$$Greek, Turkish, Lebanese",,2.0
4,Le Diplomate,https://www.yelp.com/biz/le-diplomate-washington,(202) 332-3333,1601 14th St NW,4.5,3607,"$$$Brasseries, French, Breakfast & Brunch",Logan Circle,3.0


In [6]:
# Sponsored businesses have no value in `index`: drop those rows,
# then drop the `index` column itself
PH_DC = PH_DC.dropna(subset=['index']).drop(columns='index')
PH_DC.shape

(4854, 8)

In [7]:
# Drop duplicated businesses: only the first row for each name is kept
PH_DC.drop_duplicates(subset='name',keep="first",inplace=True)
PH_DC.shape

(1753, 8)

In [8]:
# Split price_category on its last "$" into two columns:
# the text before it (price part) and the text after it (categories).
# `n` is passed by keyword because pandas 2.0 made it keyword-only in str.rsplit.
PH_DC[['price_range','category']] = PH_DC['price_category'].str.rsplit("$",n=1,expand=True)
PH_DC = PH_DC.drop(columns='price_category')
PH_DC.head()

Unnamed: 0,name,url,phone,address,rating,review_count,neighborhood,price_range,category
2,A Baked Joint,https://www.yelp.com/biz/a-baked-joint-washing...,(202) 408-6985,430 K St NW,4.5,1918,,,"Coffee & Tea, Breakfast & Brunch, Sandwiches"
3,Zaytinya,https://www.yelp.com/biz/zaytinya-washington,(202) 638-0800,701 9th St NW,4.0,4687,,$$,"Greek, Turkish, Lebanese"
4,Le Diplomate,https://www.yelp.com/biz/le-diplomate-washington,(202) 332-3333,1601 14th St NW,4.5,3607,Logan Circle,$$,"Brasseries, French, Breakfast & Brunch"
5,Unconventional Diner,https://www.yelp.com/biz/unconventional-diner-...,(202) 847-0122,1207 9th St NW,4.5,1166,Shaw,$,"American (New), Breakfast & Brunch"
6,Old Ebbitt Grill,https://www.yelp.com/biz/old-ebbitt-grill-wash...,(202) 347-4800,675 15th St NW,4.0,8513,,$,"Bars, American (Traditional), Breakfast & Brunch"


In [9]:
# Rows whose price_category had no "$" were not split: the whole text landed
# in price_range and category is NaN. Remember them so they can be swapped.
swap = PH_DC.index[PH_DC['category'].isna()]

# Fill NaN in category with the text that ended up in price_range
PH_DC['category'] = PH_DC['category'].fillna(PH_DC['price_range'])

# The split consumed the last "$"; add it back to price_range
PH_DC['price_range'] = PH_DC['price_range']+'$'

# The swapped rows have no price at all.
# One .loc[rows, column] call is used instead of the chained
# PH_DC['price_range'].loc[swap] = ..., which writes to a temporary object
# under pandas Copy-on-Write and leaves PH_DC unchanged.
PH_DC.loc[swap,'price_range'] = np.nan

# Convert category into a list of strings
PH_DC['category'] = PH_DC['category'].str.split(", ",expand=False)

PH_DC.head()

Unnamed: 0,name,url,phone,address,rating,review_count,neighborhood,price_range,category
2,A Baked Joint,https://www.yelp.com/biz/a-baked-joint-washing...,(202) 408-6985,430 K St NW,4.5,1918,,$,"[Coffee & Tea, Breakfast & Brunch, Sandwiches]"
3,Zaytinya,https://www.yelp.com/biz/zaytinya-washington,(202) 638-0800,701 9th St NW,4.0,4687,,$$$,"[Greek, Turkish, Lebanese]"
4,Le Diplomate,https://www.yelp.com/biz/le-diplomate-washington,(202) 332-3333,1601 14th St NW,4.5,3607,Logan Circle,$$$,"[Brasseries, French, Breakfast & Brunch]"
5,Unconventional Diner,https://www.yelp.com/biz/unconventional-diner-...,(202) 847-0122,1207 9th St NW,4.5,1166,Shaw,$$,"[American (New), Breakfast & Brunch]"
6,Old Ebbitt Grill,https://www.yelp.com/biz/old-ebbitt-grill-wash...,(202) 347-4800,675 15th St NW,4.0,8513,,$$,"[Bars, American (Traditional), Breakfast & Bru..."


In [10]:
# Inspect column dtypes before the numeric conversions below
PH_DC.dtypes

name             object
url              object
phone            object
address          object
rating          float64
review_count     object
neighborhood     object
price_range      object
category         object
dtype: object

In [11]:
# Convert review_count to int; rows whose count cannot be parsed are dropped
PH_DC = (PH_DC
         .assign(review_count=pd.to_numeric(PH_DC['review_count'],errors='coerce'))
         .dropna(subset=['review_count'])
         .astype({'review_count':int}))

In [12]:
# Convert price_range from "$" symbols to a numeric level (1-4)
PH_DC['price_range'] = PH_DC['price_range'].replace({
    '$$$$':4,
    '$$$':3,
    '$$':2,
    '$':1,
})

In [13]:
# Normalise names: straight apostrophes, no "temporarily closed" suffix.
# regex=False makes both patterns literal on every pandas version; before
# pandas 2.0 the default was regex=True, so the "." in ' - Temp. CLOSED'
# matched any character.
PH_DC['name'] = (PH_DC['name']
                 .str.replace('’',"'",regex=False)
                 .str.replace(' - Temp. CLOSED',"",regex=False))

# Tag every row with its region
PH_DC['region'] = 'Washington DC'

In [14]:
# Re-check dtypes after the conversions
PH_DC.dtypes

name             object
url              object
phone            object
address          object
rating          float64
review_count      int64
neighborhood     object
price_range     float64
category         object
region           object
dtype: object

In [15]:
# Shape of the cleaned Washington DC table
PH_DC.shape

(1651, 10)

## Merge PH_DC and DC_Wiki

In [16]:
# Read in the Wikipedia table of Washington DC Michelin stars
DC_Wiki = pd.read_csv("Data/Wikipedia/Wiki_DC.csv")
DC_Wiki.head()

Unnamed: 0,Name,Neighborhood/City,2017,2018,2019,2020
0,Blue Duck Tavern,West End,1 Michelin star,1 Michelin star,1 Michelin star,
1,Bresca,Logan Circle,,,1 Michelin star,1 Michelin star
2,Fiola,Penn Quarter,1 Michelin star,1 Michelin star,1 Michelin star,1 Michelin star
3,Gravitas,Ivy City,,,,1 Michelin star
4,Inn at Little Washington,"Washington, VA",2 Michelin stars,2 Michelin stars,3 Michelin stars,3 Michelin stars


In [17]:
# Columns between the first (Name) and the last (latest year): these get dropped
DC_Wiki.columns[1:-1].tolist()

['Neighborhood/City', '2017', '2018', '2019']

In [18]:
# Keep only the first and last columns (Name, 2020), then drop
# restaurants that have no 2020 entry
DC_Wiki = DC_Wiki.drop(columns=DC_Wiki.columns[1:-1]).dropna(subset=['2020'])
DC_Wiki.head()

Unnamed: 0,Name,2020
1,Bresca,1 Michelin star
2,Fiola,1 Michelin star
3,Gravitas,1 Michelin star
4,Inn at Little Washington,3 Michelin stars
5,Kinship,1 Michelin star


In [19]:
# Anti merge: Wikipedia names that have no matching name in PH_DC
m = PH_DC.merge(DC_Wiki,how='outer',left_on="name",right_on="Name",indicator=True)
m[m['_merge'] == "right_only"].drop(columns="_merge")

Unnamed: 0,name,url,phone,address,rating,review_count,neighborhood,price_range,category,region,Name,2020
1651,,,,,,,,,,,Inn at Little Washington,3 Michelin stars
1652,,,,,,,,,,,minibar,2 Michelin stars


In [20]:
# Rename the two unmatched Wikipedia entries to the names used in PH_DC
DC_Wiki['Name'] = DC_Wiki['Name'].replace({
    'Inn at Little Washington':'The Inn at Little Washington',
    'minibar':'minibar by José Andrés',
})

In [21]:
# Left-join the 2020 star rating onto the cleaned listings by name
DC_PW = PH_DC.merge(DC_Wiki,how="left",left_on="name",right_on="Name")
DC_PW = DC_PW.drop(columns="Name").rename(columns={"2020":"michelin_star"})

In [22]:
# Encode the star rating as a number; rows without a star get 0
DC_PW['michelin_star'] = DC_PW['michelin_star'].fillna(0).replace({
    '1 Michelin star':1,
    '2 Michelin stars':2,
    '3 Michelin stars':3,
})

DC_PW.head()

Unnamed: 0,name,url,phone,address,rating,review_count,neighborhood,price_range,category,region,michelin_star
0,A Baked Joint,https://www.yelp.com/biz/a-baked-joint-washing...,(202) 408-6985,430 K St NW,4.5,1918,,1.0,"[Coffee & Tea, Breakfast & Brunch, Sandwiches]",Washington DC,0
1,Zaytinya,https://www.yelp.com/biz/zaytinya-washington,(202) 638-0800,701 9th St NW,4.0,4687,,3.0,"[Greek, Turkish, Lebanese]",Washington DC,0
2,Le Diplomate,https://www.yelp.com/biz/le-diplomate-washington,(202) 332-3333,1601 14th St NW,4.5,3607,Logan Circle,3.0,"[Brasseries, French, Breakfast & Brunch]",Washington DC,0
3,Unconventional Diner,https://www.yelp.com/biz/unconventional-diner-...,(202) 847-0122,1207 9th St NW,4.5,1166,Shaw,2.0,"[American (New), Breakfast & Brunch]",Washington DC,0
4,Old Ebbitt Grill,https://www.yelp.com/biz/old-ebbitt-grill-wash...,(202) 347-4800,675 15th St NW,4.0,8513,,2.0,"[Bars, American (Traditional), Breakfast & Bru...",Washington DC,0


In [23]:
# Sanity check: number of starred restaurants in the Wikipedia table
DC_Wiki.shape

(18, 2)

In [24]:
# Sanity check: number of starred rows after the merge (compare with DC_Wiki.shape)
DC_PW.query('michelin_star != 0').shape

(18, 11)

## Merge DC_PW and Michelin

In [25]:
# Read in the Michelin Guide list and keep the Washington DC entries
Michelin = pd.read_csv("Data/Michelin/Michelin.csv")
is_dc = Michelin['region'] == "Washington DC"
DC_MG = Michelin[is_dc].drop(columns=['region','url'])

# Use straight apostrophes, as done for the PH_DC names
DC_MG['name'] = DC_MG['name'].str.replace('’',"'")
DC_MG.head()

Unnamed: 0,name,michelin_guide
0,Himitsu,1
1,BlackSalt,1
2,Tico,1
3,Ottoman Taverna,1
4,1789,1


In [26]:
# Lower-case the names on both sides so the merge on "name" ignores case
DC_PW['name'] = DC_PW['name'].str.lower()
DC_MG['name'] = DC_MG['name'].str.lower()

In [27]:
# Anti merge: guide entries whose name has no match in DC_PW
m = DC_PW.merge(DC_MG,how='outer',on="name",indicator=True)
m[m['_merge'] == "right_only"].drop(columns="_merge")

Unnamed: 0,name,url,phone,address,rating,review_count,neighborhood,price_range,category,region,michelin_star,michelin_guide
1651,himitsu,,,,,,,,,,,1.0
1652,tico,,,,,,,,,,,1.0
1653,1789,,,,,,,,,,,1.0
1654,all purpose,,,,,,,,,,,1.0
1655,marcel's,,,,,,,,,,,1.0
1656,astoria dc,,,,,,,,,,,1.0
1657,zenebech,,,,,,,,,,,1.0
1658,maydān,,,,,,,,,,,1.0
1659,das,,,,,,,,,,,1.0
1660,royal,,,,,,,,,,,1.0


In [28]:
# Rename guide entries to the (lower-cased) names used in DC_PW
DC_MG['name'] = DC_MG['name'].replace({
    'minibar':'minibar by josé andrés',
    '1789':'1789 restaurant',
    'all purpose':'all-purpose shaw',
    "marcel's":"marcel's by robert wiedmaier",
    "zenebech":"zenebech restaurant",
    "maydān":"maydan",
    "das":"das ethiopian",
    "royal":"the royal",
    "bombay club":"the bombay club",
    "sfoglina":"sfoglina van ness",
    "chercher":"chercher ethiopian restaurant & mart",
    "millie's":"millie's spring valley",
    "sonoma":"sonoma restaurant and wine bar",
    "anxo":"anxo cidery & pintxos bar",
    "hank's oyster bar":"hank's oyster bar - dupont circle",
    "timber pizza co":"timber pizza company",
    "joselito casa de comidas":"joselito",
    "keren":"keren restaurant",
    "bidwell":"bidwell restaurant",
})

# Two names are changed on the DC_PW side instead
DC_PW['name'] = DC_PW['name'].replace({
    'pineapple & pearls':'pineapple and pearls',
    'tico dc':'tico',
})

In [29]:
# DC_PW[DC_PW.name.str.contains('bidwell')]

In [30]:
# Anti merge again: guide entries still unmatched after the renames
m = DC_PW.merge(DC_MG,how='outer',on="name",indicator=True)
m[m['_merge'] == "right_only"].drop(columns="_merge")

Unnamed: 0,name,url,phone,address,rating,review_count,neighborhood,price_range,category,region,michelin_star,michelin_guide
1651,himitsu,,,,,,,,,,,1.0
1652,astoria dc,,,,,,,,,,,1.0
1653,hanumanh,,,,,,,,,,,1.0
1654,mirabelle,,,,,,,,,,,1.0
1655,tosca,,,,,,,,,,,1.0
1656,ana,,,,,,,,,,,1.0
1657,the pembroke,,,,,,,,,,,1.0
1658,san lorenzo,,,,,,,,,,,1.0
1659,spoken english,,,,,,,,,,,1.0
1660,american son,,,,,,,,,,,1.0


In [31]:
# Left-join the Michelin Guide flag onto DC_PW by name
DC = DC_PW.merge(DC_MG,how="left",on="name")

# Rows without a guide match get 0; store the flag as int
# NOTE(review): unlike the NY / CH / CA sections, starred rows are not forced
# to michelin_guide = 1 here — confirm this is intended.
DC['michelin_guide'] = DC['michelin_guide'].fillna(0).astype(int)

DC.head()

Unnamed: 0,name,url,phone,address,rating,review_count,neighborhood,price_range,category,region,michelin_star,michelin_guide
0,a baked joint,https://www.yelp.com/biz/a-baked-joint-washing...,(202) 408-6985,430 K St NW,4.5,1918,,1.0,"[Coffee & Tea, Breakfast & Brunch, Sandwiches]",Washington DC,0,0
1,zaytinya,https://www.yelp.com/biz/zaytinya-washington,(202) 638-0800,701 9th St NW,4.0,4687,,3.0,"[Greek, Turkish, Lebanese]",Washington DC,0,1
2,le diplomate,https://www.yelp.com/biz/le-diplomate-washington,(202) 332-3333,1601 14th St NW,4.5,3607,Logan Circle,3.0,"[Brasseries, French, Breakfast & Brunch]",Washington DC,0,0
3,unconventional diner,https://www.yelp.com/biz/unconventional-diner-...,(202) 847-0122,1207 9th St NW,4.5,1166,Shaw,2.0,"[American (New), Breakfast & Brunch]",Washington DC,0,1
4,old ebbitt grill,https://www.yelp.com/biz/old-ebbitt-grill-wash...,(202) 347-4800,675 15th St NW,4.0,8513,,2.0,"[Bars, American (Traditional), Breakfast & Bru...",Washington DC,0,0


In [32]:
# Sanity check: number of guide entries for Washington DC
DC_MG.shape

(116, 2)

In [33]:
# Sanity check: rows flagged as guide entries after the merge (compare with DC_MG.shape)
DC.query('michelin_guide == 1').shape

(105, 12)

In [34]:
# Export the merged Washington DC table to a CSV file (row labels are not written)
DC.to_csv('Data/DC.csv',index=False)

In [35]:
# Build a function to clean ParseHub data
def PH_clean(df,region="New York City"):
    """
    Clean a raw ParseHub dataframe and return the cleaned version.

    The input frame is left untouched; all work happens on a copy.

    Steps:
        1. Rename the nine raw columns to short names.
        2. Drop sponsored businesses (rows with no value in ``index``) and
           then the ``index`` column itself.
        3. Keep the first row for each business name.
        4. Split ``price_category`` into ``price_range`` (1.0-4.0, NaN when
           the text has no leading "$") and ``category`` (list of strings).
        5. Convert ``review_count`` to int, dropping rows where it is not numeric.
        6. Normalise ``name`` (straight apostrophes, no CLOSED suffix,
           lower case) and add a constant ``region`` column.

    Args:
        df (DataFrame): raw ParseHub dataframe with exactly nine columns, in the
            order name, url, phone, address, rating, review_count,
            price_category, neighborhood, index.
        region (str): region name written to the ``region`` column.

    Returns:
        DataFrame: cleaned ParseHub dataframe. The original row labels of the
        surviving rows are kept.

    Raises:
        ValueError: if ``df`` does not have exactly nine columns.
    """
    colnames = ['name','url','phone','address','rating','review_count','price_category','neighborhood','index']
    price_levels = {'$':1,'$$':2,'$$$':3,'$$$$':4}

    # Copy first so that renaming and row drops never leak into the caller's frame
    df = df.copy()
    df.columns = colnames

    # Drop sponsored businesses (no value in index), then the index column
    df = df.dropna(subset=['index']).drop(columns='index')

    # Drop duplicated businesses
    df = df.drop_duplicates(subset='name',keep="first")

    # Split price_category into its leading run of "$" and the remaining text.
    # A single extract always yields two columns, so this also works when no
    # row contains "$" (where a split with expand=True yields only one column).
    parts = df['price_category'].str.extract(r'^(\$*)([\s\S]*)$',expand=True)
    df = (df
          .assign(
              # "" (no price given) is not in price_levels and therefore becomes NaN
              price_range=parts[0].map(price_levels).astype(float),
              # Convert category into list of strings
              category=parts[1].str.split(", ",expand=False))
          .drop(columns='price_category'))

    # Convert review_count to int; rows whose count is not numeric are dropped
    df = (df
          .assign(review_count=pd.to_numeric(df['review_count'],errors='coerce'))
          .dropna(subset=['review_count'])
          .astype({'review_count':int}))

    # Normalise names. regex=False keeps the patterns literal on every pandas
    # version (the "." in ' - Temp. CLOSED' must not act as a wildcard).
    names = (df['name']
             .str.replace('’',"'",regex=False)
             .str.replace(' - Temp. CLOSED',"",regex=False)
             .str.replace(' - CLOSED',"",regex=False)
             .str.lower())

    # Return data
    return df.assign(name=names,region=region)

## Clean PH_NY

In [36]:
# Read in the raw ParseHub export for New York City
PH_NY = pd.read_csv("Data/ParseHub/PH_NY.csv")

# Print the shape (rows, columns) of the raw data
PH_NY.shape

(8201, 9)

In [37]:
# Apply the shared cleaning function and check the resulting shape
PH_NY = PH_clean(PH_NY,region="New York City")
PH_NY.shape

(4471, 10)

## Merge PH_NY and NY_Wiki

In [38]:
# Read in the Wikipedia star table; keep only the first and last columns
# (Name, 2020), drop rows with no 2020 entry, and lower-case the names
NY_Wiki = pd.read_csv("Data/Wikipedia/Wiki_NY.csv")
NY_Wiki = NY_Wiki.drop(columns=NY_Wiki.columns[1:-1]).dropna(subset=['2020'])
NY_Wiki['Name'] = NY_Wiki['Name'].str.lower()

In [39]:
# Anti merge: Wikipedia names that have no matching name in PH_NY
m = PH_NY.merge(NY_Wiki,how='outer',left_on="name",right_on="Name",indicator=True)
m[m['_merge'] == "right_only"].drop(columns="_merge")

Unnamed: 0,name,url,phone,address,rating,review_count,neighborhood,price_range,category,region,Name,2020
4471,,,,,,,,,,,bar uchū,1 Michelin star
4472,,,,,,,,,,,benno,1 Michelin star
4473,,,,,,,,,,,blue hill at stone barns,2 Michelin stars
4474,,,,,,,,,,,the finch,1 Michelin star
4475,,,,,,,,,,,gotham bar and grill,1 Michelin star
4476,,,,,,,,,,,ichimura at uchū,2 Michelin stars
4477,,,,,,,,,,,the nomad,1 Michelin star
4478,,,,,,,,,,,peter luger steak house,1 Michelin star
4479,,,,,,,,,,,satsuki,1 Michelin star
4480,,,,,,,,,,,ukiyo,1 Michelin star


In [40]:
# Rename Wikipedia entries to the names used in PH_NY
NY_Wiki['Name'] = NY_Wiki['Name'].replace({
    'blue hill at stone barns':'blue hill',
    'ichimura at uchū':'uchu',
    'benno':'benno restaurant',
    'peter luger steak house':'peter luger',
    'wallsé':'wallse',
    'the nomad':'the nomad restaurant',
})

In [41]:
# PH_NY[PH_NY.name.str.contains('amane')]

In [42]:
# Anti merge again: Wikipedia names still unmatched after the renames
m = PH_NY.merge(NY_Wiki,how='outer',left_on="name",right_on="Name",indicator=True)
m[m['_merge'] == "right_only"].drop(columns="_merge")

Unnamed: 0,name,url,phone,address,rating,review_count,neighborhood,price_range,category,region,Name,2020
4472,,,,,,,,,,,bar uchū,1 Michelin star
4473,,,,,,,,,,,the finch,1 Michelin star
4474,,,,,,,,,,,gotham bar and grill,1 Michelin star
4475,,,,,,,,,,,satsuki,1 Michelin star
4476,,,,,,,,,,,ukiyo,1 Michelin star


In [43]:
# Left-join the 2020 star rating onto the cleaned listings by name
NY_PW = PH_NY.merge(NY_Wiki,how="left",left_on="name",right_on="Name")
NY_PW = NY_PW.drop(columns="Name").rename(columns={"2020":"michelin_star"})

In [44]:
# Encode the star rating as a number; rows without a star get 0
NY_PW['michelin_star'] = NY_PW['michelin_star'].fillna(0).replace({
    '1 Michelin star':1,
    '2 Michelin stars':2,
    '3 Michelin stars':3,
})

In [45]:
# Sanity check: number of starred restaurants in the Wikipedia table
NY_Wiki.shape

(75, 2)

In [46]:
# Sanity check: number of starred rows after the merge (compare with NY_Wiki.shape)
NY_PW.query('michelin_star != 0').shape

(70, 11)

## Merge NY_PW and Michelin

In [47]:
# Guide entries for New York City, with straight apostrophes and lower-case names
NY_MG = Michelin[Michelin['region'] == "New York City"].drop(columns=['region','url'])
NY_MG['name'] = NY_MG['name'].str.replace('’',"'").str.lower()
NY_MG.shape

(513, 2)

In [48]:
# Anti merge: guide entries whose name has no match in NY_PW
m = NY_PW.merge(NY_MG,how='outer',on="name",indicator=True)
m[m['_merge'] == "right_only"].drop(columns="_merge")

Unnamed: 0,name,url,phone,address,rating,review_count,neighborhood,price_range,category,region,michelin_star,michelin_guide
4472,maison harlem,,,,,,,,,,,1.0
4473,málà project,,,,,,,,,,,1.0
4474,coarse,,,,,,,,,,,1.0
4475,saint julivert fisherie,,,,,,,,,,,1.0
4476,lamalo,,,,,,,,,,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
4804,da franco & tony ristorante,,,,,,,,,,,1.0
4805,pinch chinese,,,,,,,,,,,1.0
4806,enoteca maria,,,,,,,,,,,1.0
4807,i sodi,,,,,,,,,,,1.0


In [49]:
# Left-join the Michelin Guide flag onto NY_PW by name
NY = NY_PW.merge(NY_MG,how="left",on="name")

In [50]:
# Fill NaN in michelin_guide with 0 and store the flag as int
NY['michelin_guide'] = NY['michelin_guide'].fillna(0).astype(int)

# Flag every starred row as a guide entry, even if the name merge missed it.
# One .loc[rows, column] call is used instead of the chained
# NY['michelin_guide'].loc[star] = 1, which writes to a temporary object
# under pandas Copy-on-Write and leaves NY unchanged.
star = NY[NY['michelin_star'] != 0].index
NY.loc[star,'michelin_guide'] = 1

In [51]:
# Export the merged New York City table to a CSV file (row labels are not written)
NY.to_csv('Data/NY.csv',index=False)

## Clean PH_CH

In [52]:
# Read in the raw ParseHub export for Chicago
PH_CH = pd.read_csv("Data/ParseHub/PH_CH.csv")

# Print the shape (rows, columns) of the raw data
PH_CH.shape

(5172, 9)

In [53]:
# Apply the shared cleaning function and check the resulting shape
PH_CH = PH_clean(PH_CH,region="Chicago")
PH_CH.shape

(2565, 10)

## Merge PH_CH and CH_Wiki

In [54]:
# Read in the Wikipedia star table; keep only the first and last columns
# (Name, 2020), drop rows with no 2020 entry, and lower-case the names
CH_Wiki = pd.read_csv("Data/Wikipedia/Wiki_Chicago.csv")
CH_Wiki = CH_Wiki.drop(columns=CH_Wiki.columns[1:-1]).dropna(subset=['2020'])
CH_Wiki['Name'] = CH_Wiki['Name'].str.lower()

In [55]:
# Anti merge: Wikipedia names that have no matching name in PH_CH
m = PH_CH.merge(CH_Wiki,how='outer',left_on="name",right_on="Name",indicator=True)
m[m['_merge'] == "right_only"].drop(columns="_merge")

Unnamed: 0,name,url,phone,address,rating,review_count,neighborhood,price_range,category,region,Name,2020
2565,,,,,,,,,,,elizabeth,1 Michelin star
2566,,,,,,,,,,,yūgen,1 Michelin star


In [56]:
# Rename the two unmatched Wikipedia entries to the names used in PH_CH
CH_Wiki['Name'] = CH_Wiki['Name'].replace({
    'yūgen':'yugen',
    'elizabeth':'elizabeth restaurant',
})

In [57]:
# PH_CH[PH_CH.name.str.contains('elske')]

In [58]:
# Left-join the 2020 star rating onto the cleaned listings by name
CH_PW = PH_CH.merge(CH_Wiki,how="left",left_on="name",right_on="Name")
CH_PW = CH_PW.drop(columns="Name").rename(columns={"2020":"michelin_star"})

In [59]:
# Encode the star rating as a number; rows without a star get 0
CH_PW['michelin_star'] = CH_PW['michelin_star'].fillna(0).replace({
    '1 Michelin star':1,
    '2 Michelin stars':2,
    '3 Michelin stars':3,
})

In [60]:
# Sanity check: number of starred restaurants in the Wikipedia table
CH_Wiki.shape

(25, 2)

In [61]:
# Sanity check: number of starred rows after the merge (compare with CH_Wiki.shape)
CH_PW.query('michelin_star != 0').shape

(25, 11)

## Merge CH_PW and Michelin

In [62]:
# Guide entries for Chicago, with straight apostrophes and lower-case names
CH_MG = Michelin[Michelin['region'] == "Chicago"].drop(columns=['region','url'])
CH_MG['name'] = CH_MG['name'].str.replace('’',"'").str.lower()
CH_MG.shape

(176, 2)

In [63]:
# Anti merge: guide entries whose name has no match in CH_PW
m = CH_PW.merge(CH_MG,how='outer',on="name",indicator=True)
m[m['_merge'] == "right_only"].drop(columns="_merge")

Unnamed: 0,name,url,phone,address,rating,review_count,neighborhood,price_range,category,region,michelin_star,michelin_guide
2565,pelago,,,,,,,,,,,1.0
2566,virtue,,,,,,,,,,,1.0
2567,kikkō,,,,,,,,,,,1.0
2568,demera,,,,,,,,,,,1.0
2569,staropolska,,,,,,,,,,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
2621,mfk.,,,,,,,,,,,1.0
2622,sol de mexico,,,,,,,,,,,1.0
2623,the rosebud,,,,,,,,,,,1.0
2624,dusek's (board & beer),,,,,,,,,,,1.0


In [64]:
# Rename guide entries to the names used in CH_PW
CH_MG['name'] = CH_MG['name'].replace({
    'yūgen':'yugen',
    'elizabeth':'elizabeth restaurant',
    'pelago':'pelago ristorante',
    'dolo':'dolo restaurant and bar',
    'southport grocery':'southport grocery & cafe',
    'virtue':'virtue restaurant',
    'haisous':'haisous vietnamese kitchen',
    "gibson's italia":'gibsons italia',
    "dusek's (board & beer)":"dusek's board & beer",
    "beatnik":"beatnik west town",
    "mi tocaya":"mi tocaya antojeria",
})

# One name is changed on the CH_PW side instead
CH_PW['name'] = CH_PW['name'].replace({'mccb chicago 时尚食谱':'mccb'})

In [65]:
# CH_PW[CH_PW.name.str.contains('tocaya')]

In [66]:
# Left-join the Michelin Guide flag onto CH_PW by name
CH = CH_PW.merge(CH_MG,how="left",on="name")

In [67]:
# Fill NaN in michelin_guide with 0 and store the flag as int
CH['michelin_guide'] = CH['michelin_guide'].fillna(0).astype(int)

# Flag every starred row as a guide entry, even if the name merge missed it.
# One .loc[rows, column] call is used instead of the chained
# CH['michelin_guide'].loc[star] = 1, which writes to a temporary object
# under pandas Copy-on-Write and leaves CH unchanged.
star = CH[CH['michelin_star'] != 0].index
CH.loc[star,'michelin_guide'] = 1

In [68]:
# Export the merged Chicago table to a CSV file (row labels are not written)
CH.to_csv('Data/CH.csv',index=False)

## Clean PH_SF

In [69]:
# Read in the raw ParseHub export for San Francisco
PH_SF = pd.read_csv("Data/ParseHub/PH_SF.csv")

# Print the shape (rows, columns) of the raw data
PH_SF.shape

(3926, 9)

In [70]:
# Apply the shared cleaning function; the region is "California", the same
# value used to filter the Michelin table in the CA_MG cell
PH_SF = PH_clean(PH_SF,region="California")
PH_SF.shape

(1901, 10)

## Merge PH_SF and SF_Wiki

In [71]:
# Read in the Wikipedia star table, drop the location column and lower-case the names
SF_Wiki = pd.read_csv("Data/Wikipedia/Wiki_SF.csv").drop(columns='City/Neighborhood')
SF_Wiki['Name'] = SF_Wiki['Name'].str.lower()
SF_Wiki

Unnamed: 0,Name,2019
0,acquerello,2 Michelin stars
1,al's place,1 Michelin star
2,angler,1 Michelin star
3,auberge du soleil,1 Michelin star
4,aubergine,1 Michelin star
...,...,...
57,the progress,1 Michelin star
58,the restaurant at meadowood,3 Michelin stars
59,the village pub,1 Michelin star
60,wako,1 Michelin star


In [72]:
# Anti merge: Wikipedia names that have no matching name in PH_SF
m = PH_SF.merge(SF_Wiki,how='outer',left_on="name",right_on="Name",indicator=True)
m[m['_merge'] == "right_only"].drop(columns="_merge")

Unnamed: 0,name,url,phone,address,rating,review_count,neighborhood,price_range,category,region,Name,2019
1901,,,,,,,,,,,auberge du soleil,1 Michelin star
1902,,,,,,,,,,,aubergine,1 Michelin star
1903,,,,,,,,,,,baumé,2 Michelin stars
1904,,,,,,,,,,,bouchon,1 Michelin star
1905,,,,,,,,,,,chez tj,1 Michelin star
1906,,,,,,,,,,,commonwealth,1 Michelin star
1907,,,,,,,,,,,farmhouse inn & restaurant,1 Michelin star
1908,,,,,,,,,,,harbor house,1 Michelin star
1909,,,,,,,,,,,kenzo,1 Michelin star
1910,,,,,,,,,,,kin khao,1 Michelin star


In [73]:
# Rename Wikipedia entries to the names used in PH_SF
SF_Wiki['Name'] = SF_Wiki['Name'].replace({
    'wako':'wako japanese restaurant',
    'mourad':'mourad restaurant',
})

In [74]:
# PH_SF[PH_SF.name.str.contains('farmhouse')]

In [75]:
# Left-join the 2019 star rating onto the cleaned listings by name
SF_PW = PH_SF.merge(SF_Wiki,how="left",left_on="name",right_on="Name")
SF_PW = SF_PW.drop(columns="Name").rename(columns={"2019":"michelin_star"})

In [76]:
# Encode the star rating as a number; rows without a star get 0
SF_PW['michelin_star'] = SF_PW['michelin_star'].fillna(0).replace({
    '1 Michelin star':1,
    '2 Michelin stars':2,
    '3 Michelin stars':3,
})

In [77]:
# Sanity check: number of rows in the Wikipedia table
SF_Wiki.shape

(62, 2)

In [78]:
# Sanity check: number of starred rows after the merge (compare with SF_Wiki.shape)
SF_PW.query('michelin_star != 0').shape

(34, 11)

## Clean PH_LA

In [79]:
# Read in the raw ParseHub export for Los Angeles
PH_LA = pd.read_csv("Data/ParseHub/PH_LA.csv")

# Print the shape (rows, columns) of the raw data
PH_LA.shape

(5348, 9)

In [80]:
# Apply the shared cleaning function; the region is "California", the same
# value used for PH_SF
PH_LA = PH_clean(PH_LA,region="California")
PH_LA.shape

(3109, 10)

## Merge PH_LA and LA_Wiki

In [81]:
# Read in the Wikipedia star table; keep only the first and last columns
# (Name, 2019), drop rows with no 2019 entry, and lower-case the names
LA_Wiki = pd.read_csv("Data/Wikipedia/Wiki_LA.csv")
LA_Wiki = LA_Wiki.drop(columns=LA_Wiki.columns[1:-1]).dropna(subset=['2019'])
LA_Wiki['Name'] = LA_Wiki['Name'].str.lower()
LA_Wiki

Unnamed: 0,Name,2019
0,addison,1 Michelin star
3,bistro na's,1 Michelin star
4,cut by wolfgang puck,1 Michelin star
5,dialogue,1 Michelin star
6,hana re,1 Michelin star
8,hayato,1 Michelin star
10,kali,1 Michelin star
11,kato,1 Michelin star
14,le comptoir,1 Michelin star
16,maude,1 Michelin star


In [82]:
# Anti merge: Wikipedia names that have no matching name in PH_LA
m = PH_LA.merge(LA_Wiki,how='outer',left_on="name",right_on="Name",indicator=True)
m[m['_merge'] == "right_only"].drop(columns="_merge")

Unnamed: 0,name,url,phone,address,rating,review_count,neighborhood,price_range,category,region,Name,2019
3109,,,,,,,,,,,addison,1 Michelin star
3110,,,,,,,,,,,cut by wolfgang puck,1 Michelin star
3111,,,,,,,,,,,dialogue,1 Michelin star
3112,,,,,,,,,,,hana re,1 Michelin star
3113,,,,,,,,,,,kali,1 Michelin star
3114,,,,,,,,,,,shunji,1 Michelin star
3115,,,,,,,,,,,trois mec,1 Michelin star
3116,,,,,,,,,,,urasawa,2 Michelin stars


In [83]:
# Rename Wikipedia entries to the names used in PH_LA
LA_Wiki['Name'] = LA_Wiki['Name'].replace({
    'cut by wolfgang puck':'cut',
    'shunji':'shunji japanese cuisine',
    'kali':'kali restaurant',
    'hana re':'hanare sushi',
})

In [84]:
# PH_LA[PH_LA.name.str.contains('trois')]

In [85]:
# Anti merge again: Wikipedia names still unmatched after the renames
m = PH_LA.merge(LA_Wiki,how='outer',left_on="name",right_on="Name",indicator=True)
m[m['_merge'] == "right_only"].drop(columns="_merge")

Unnamed: 0,name,url,phone,address,rating,review_count,neighborhood,price_range,category,region,Name,2019
3109,,,,,,,,,,,addison,1 Michelin star
3110,,,,,,,,,,,dialogue,1 Michelin star
3111,,,,,,,,,,,trois mec,1 Michelin star
3112,,,,,,,,,,,urasawa,2 Michelin stars


In [86]:
# Left-join the 2019 star rating onto the cleaned listings by name
LA_PW = PH_LA.merge(LA_Wiki,how="left",left_on="name",right_on="Name")
LA_PW = LA_PW.drop(columns="Name").rename(columns={"2019":"michelin_star"})

In [87]:
# Encode the star rating as a number; rows without a star get 0
LA_PW['michelin_star'] = LA_PW['michelin_star'].fillna(0).replace({
    '1 Michelin star':1,
    '2 Michelin stars':2,
    '3 Michelin stars':3,
})

In [88]:
# Sanity check: number of starred restaurants in the Wikipedia table
LA_Wiki.shape

(27, 2)

In [89]:
# Sanity check: number of starred rows after the merge (compare with LA_Wiki.shape)
LA_PW.query('michelin_star != 0').shape

(23, 11)

In [90]:
# Stack the San Francisco and Los Angeles tables into one California table.
# Row labels are kept from each input, so labels can repeat in CA_PW.
CA_PW = pd.concat([SF_PW,LA_PW],sort=False)
# Show 5 random rows (no random_state is set, so the rows differ between runs)
CA_PW.sample(5)

Unnamed: 0,name,url,phone,address,rating,review_count,neighborhood,price_range,category,region,michelin_star
137,pollos asados al carbón el güero,https://www.yelp.com/biz/pollos-asados-al-carb...,Florence,274 E Slauson Ave,4.0,37,,1.0,[Mexican],California,0
267,redbird,https://www.yelp.com/biz/redbird-los-angeles-2,(213) 788-1191,114 E 2nd St,4.0,1474,Downtown,3.0,"[Cocktail Bars, American (New), Breakfast & Br...",California,0
1718,uncle tetsu,https://www.yelp.com/biz/uncle-tetsu-torrance-2,(323) 275-9190,3525 W Carson St.,4.5,192,,1.0,"[Desserts, Japanese]",California,0
814,la michoacana restaurant,https://www.yelp.com/biz/la-michoacana-restaur...,(323) 505-0120,1625 East Cesar Chavez E,4.0,27,Boyle Heights,2.0,"[Mexican, Seafood]",California,0
29,sotto mare oysteria & seafood,https://www.yelp.com/biz/sotto-mare-oysteria-a...,(415) 398-3181,Located in Dante Benedetti Hotel & Sro,4.5,3837,Located in Dante Benedetti Hotel & Sro,2.0,"[Seafood, Italian, Bars]",California,0


## Merge CA_PW and Michelin

In [91]:
# Guide entries for California, with straight apostrophes and lower-case names
CA_MG = Michelin[Michelin['region'] == "California"].drop(columns=['region','url'])
CA_MG['name'] = CA_MG['name'].str.replace('’',"'").str.lower()
CA_MG.shape

(624, 2)

In [92]:
# Anti merge: guide entries whose name has no match in CA_PW
m = CA_PW.merge(CA_MG,how='outer',on="name",indicator=True)
m[m['_merge'] == "right_only"].drop(columns="_merge")

Unnamed: 0,name,url,phone,address,rating,review_count,neighborhood,price_range,category,region,michelin_star,michelin_guide
5012,café romanat,,,,,,,,,,,1.0
5013,dyafa,,,,,,,,,,,1.0
5014,cultura,,,,,,,,,,,1.0
5015,cowboy star,,,,,,,,,,,1.0
5016,el jardín,,,,,,,,,,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
5450,sushi yoshizumi,,,,,,,,,,,1.0
5451,shakewell,,,,,,,,,,,1.0
5452,the fig café,,,,,,,,,,,1.0
5453,villon,,,,,,,,,,,1.0


In [93]:
# CH_PW[CH_PW.name.str.contains('tocaya')]

In [94]:
# Left-join the Michelin Guide flag onto CA_PW by name
CA = CA_PW.merge(CA_MG,how="left",on="name")

In [95]:
# Fill NaN in michelin_guide with 0 and store the flag as int
CA['michelin_guide'] = CA['michelin_guide'].fillna(0).astype(int)

# Flag every starred row as a guide entry, even if the name merge missed it.
# One .loc[rows, column] call is used instead of the chained
# CA['michelin_guide'].loc[star] = 1, which writes to a temporary object
# under pandas Copy-on-Write and leaves CA unchanged.
star = CA[CA['michelin_star'] != 0].index
CA.loc[star,'michelin_guide'] = 1

In [96]:
# Export the merged California table to a CSV file (row labels are not written)
CA.to_csv('Data/CA.csv',index=False)