## Preparation

In [1]:
# Data processing
import pandas as pd
import numpy as np
import re

# Scraping web content
import requests # For downloading the website
from bs4 import BeautifulSoup # For parsing the website
import time # To put the system to sleep
import random # For random numbers

# Ignore warnings
# NOTE: filterwarnings('ignore') silences every warning category from this point on,
# including deprecation notices raised by the libraries used in the cells below
import warnings
warnings.filterwarnings('ignore')

## Scrape Wiki DC

In [2]:
# Wiki DC Michelin url
url = 'https://en.wikipedia.org/wiki/List_of_Michelin_starred_restaurants_in_Washington,_D.C.'

# Download the webpage. requests applies no timeout by default, so an explicit
# one keeps this cell from hanging forever if the server never answers.
page = requests.get(url, timeout=30)
page.status_code # 200 means the request succeeded

200

In [3]:
# Turn the downloaded bytes into a navigable tree using Python's built-in HTML parser
soup = BeautifulSoup(page.content, features='html.parser')

In [4]:
# Title of the wiki page: display the text of the <title> element
# (useful to confirm which article was fetched)
soup.title.string

'List of Michelin starred restaurants in Washington, D.C. - Wikipedia'

In [5]:
# Pick the first <table> whose class attribute is "wikitable sortable"
table = soup.find('table', attrs={'class': 'wikitable sortable'})

In [6]:
# Get the rows: every <tr> element found inside the table
rows = table.find_all('tr')
# Display how many rows were found
len(rows)

21

In [7]:
# Column names come from the first table row: <th> texts are taken as they are,
# <td> texts first lose any "(...)[...]" fragment (presumably footnote markers)
header = [th.text.rstrip() for th in rows[0].find_all('th')]
header.extend(
    re.sub(r'\(.*?\)\[.*?\]', '', td.text).rstrip()
    for td in rows[0].find_all('td')
)
header

['Name', 'Neighborhood/City', '2017', '2018', '2019', '2020']

In [8]:
# Build a function to extract star rating from an individual cell
def get_star(cell):
    """
    Return the star rating stored in a single table cell.

    The rating is read from the ``alt`` text of the first ``<img>`` element
    found inside the cell.

    Parameters
    ----------
    cell : object exposing ``find_all`` (e.g. a BeautifulSoup tag)
        The table cell to inspect.

    Returns
    -------
    str or float
        The ``alt`` text of the first image, or ``np.nan`` when the cell holds
        no image or the image carries no ``alt`` attribute.
    """
    images = cell.find_all('img')
    if not images:
        # No image in the cell
        return np.nan

    # .get() avoids a KeyError when the image has no alt attribute
    value = images[0].get('alt')
    return np.nan if value is None else value

In [9]:
# Scrape every data row into a list of records, then build the DataFrame once.
# (DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies it on every iteration.)
records = []
for row in rows[1:]:

    cells = row.find_all('td')

    records.append({
        # The first two columns hold plain text
        header[0]: cells[0].text.rstrip(),
        header[1]: cells[1].text.rstrip(),
        # Every remaining column holds a star rating for one guide year
        **{header[i]: get_star(cells[i]) for i in range(2, len(header))},
    })

# columns=header fixes the column order and keeps the columns even with no rows
Wiki_DC = pd.DataFrame(records, columns=header)

Wiki_DC

Unnamed: 0,Name,Neighborhood/City,2017,2018,2019,2020
0,Blue Duck Tavern,West End,1 Michelin star,1 Michelin star,1 Michelin star,
1,Bresca,Logan Circle,,,1 Michelin star,1 Michelin star
2,Fiola,Penn Quarter,1 Michelin star,1 Michelin star,1 Michelin star,1 Michelin star
3,Gravitas,Ivy City,,,,1 Michelin star
4,Inn at Little Washington,"Washington, VA",2 Michelin stars,2 Michelin stars,3 Michelin stars,3 Michelin stars
5,Kinship,Mount Vernon Square,1 Michelin star,1 Michelin star,1 Michelin star,1 Michelin star
6,Komi,Dupont Circle,,1 Michelin star,1 Michelin star,1 Michelin star
7,Maydan,U Street,,,,1 Michelin star
8,Masseria,Union Market,1 Michelin star,1 Michelin star,1 Michelin star,1 Michelin star
9,Little Pearl,Eastern Market,,,,1 Michelin star


In [10]:
# Export to a CSV file; index=False leaves the DataFrame's row index out of the file
# NOTE(review): the relative path assumes a Data/Wikipedia/ directory already exists — confirm
Wiki_DC.to_csv('Data/Wikipedia/Wiki_DC.csv',index=False)

## Scrape Wiki Chicago

In [11]:
# Wiki Chicago Michelin url
url = 'https://en.wikipedia.org/wiki/List_of_Michelin_starred_restaurants_in_Chicago'

# Download the webpage. requests applies no timeout by default, so an explicit
# one keeps this cell from hanging forever if the server never answers.
page = requests.get(url, timeout=30)
page.status_code # 200 means the request succeeded

200

In [12]:
# Turn the downloaded bytes into a navigable tree using Python's built-in HTML parser
soup = BeautifulSoup(page.content, features='html.parser')

In [13]:
# Title of the wiki page: display the text of the <title> element
# (useful to confirm which article was fetched)
soup.title.string

'List of Michelin starred restaurants in Chicago - Wikipedia'

In [14]:
# Pick the first <table> whose class attribute is "wikitable sortable"
table = soup.find('table', attrs={'class': 'wikitable sortable'})

In [15]:
# Get the rows: every <tr> element found inside the table
rows = table.find_all('tr')
# Display how many rows were found
len(rows)

51

In [16]:
# Column names: the text of each <th> in the first row, with any "(...)[...]"
# fragment (presumably footnote markers) removed and trailing whitespace trimmed
header = [
    re.sub(r'\(.*?\)\[.*?\]', '', header_cell.text).rstrip()
    for header_cell in rows[0].find_all('th')
]
header

['Name',
 'Neighborhood/City',
 '2011',
 '2012',
 '2013',
 '2014',
 '2015',
 '2016',
 '2017',
 '2018',
 '2019',
 '2020']

In [17]:
# Get the length of the header, i.e. the number of column names;
# the scrape cell below indexes header by position
len(header)

12

In [18]:
# Update get_star function to a deluxe edition
def get_star_deluxe(cells=None,i=None):
    """
    Return the star rating stored in the i-th cell of a table row.

    This is a deluxe edition of the get_star function: it takes the whole
    list of cells plus a position, so a row with fewer cells than expected
    yields a missing value instead of an error.

    Parameters
    ----------
    cells : sequence of objects exposing ``find_all`` (e.g. BeautifulSoup tags)
        The cells of one table row.
    i : int
        Position of the cell to inspect.

    Returns
    -------
    str or float
        The ``alt`` text of the first ``<img>`` in ``cells[i]``, or ``np.nan``
        when the row has no cell at position ``i``, the cell holds no image,
        or the image carries no ``alt`` attribute.
    """
    try:
        images = cells[i].find_all('img')
    except IndexError:
        # The row has no cell at position i
        return np.nan

    if not images:
        # No image in the cell
        return np.nan

    # .get() avoids a KeyError when the image has no alt attribute
    value = images[0].get('alt')
    return np.nan if value is None else value

In [19]:
# Scrape every data row into a list of records, then build the DataFrame once.
# (DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies it on every iteration.)
records = []
for row in rows[1:]:

    cells = row.find_all('td')

    records.append({
        # The first two columns hold plain text
        header[0]: cells[0].text.rstrip(),
        header[1]: cells[1].text.rstrip(),
        # Every remaining column holds a star rating for one guide year
        **{header[i]: get_star_deluxe(cells, i) for i in range(2, len(header))},
    })

# columns=header fixes the column order and keeps the columns even with no rows
Wiki_Chicago = pd.DataFrame(records, columns=header)

Wiki_Chicago

Unnamed: 0,Name,Neighborhood/City,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,42 Grams,Uptown,,,,,2 Michelin stars,2 Michelin stars,2 Michelin stars,,,
1,Acadia,South Loop,,,1 Michelin star,1 Michelin star,1 Michelin star,2 Michelin stars,2 Michelin stars,2 Michelin stars,2 Michelin stars,2 Michelin stars
2,Alinea,Lincoln Park,3 Michelin stars,3 Michelin stars,3 Michelin stars,3 Michelin stars,3 Michelin stars,3 Michelin stars,3 Michelin stars,3 Michelin stars,3 Michelin stars,3 Michelin stars
3,Avenues,Magnificent Mile,2 Michelin stars,,,,,,,,,
4,Band of Bohemia,Ravenswood,,,,,,,1 Michelin star,1 Michelin star,1 Michelin star,1 Michelin star
5,Blackbird,West Loop,1 Michelin star,1 Michelin star,1 Michelin star,1 Michelin star,1 Michelin star,1 Michelin star,1 Michelin star,1 Michelin star,1 Michelin star,1 Michelin star
6,Boka,Lincoln Park,1 Michelin star,1 Michelin star,1 Michelin star,1 Michelin star,1 Michelin star,1 Michelin star,1 Michelin star,1 Michelin star,1 Michelin star,1 Michelin star
7,Bonsoirée,Logan Square,1 Michelin star,1 Michelin star,,,,,,,,
8,Charlie Trotter's,Lincoln Park,2 Michelin stars,2 Michelin stars,,,,,,,,
9,Crofton on Wells,River North,1 Michelin star,,,,,,,,,


In [20]:
# Export to a CSV file; index=False leaves the DataFrame's row index out of the file
# NOTE(review): the relative path assumes a Data/Wikipedia/ directory already exists — confirm
Wiki_Chicago.to_csv('Data/Wikipedia/Wiki_Chicago.csv',index=False)

## Scrape Wiki NY

In [21]:
# Wiki NY Michelin url
url = 'https://en.wikipedia.org/wiki/List_of_Michelin_starred_restaurants_in_New_York_City'

# Download the webpage. requests applies no timeout by default, so an explicit
# one keeps this cell from hanging forever if the server never answers.
page = requests.get(url, timeout=30)
page.status_code # 200 means the request succeeded

200

In [22]:
# Turn the downloaded bytes into a navigable tree using Python's built-in HTML parser
soup = BeautifulSoup(page.content, features='html.parser')

In [23]:
# Title of the wiki page: display the text of the <title> element
# (useful to confirm which article was fetched)
soup.title.string

'List of Michelin starred restaurants in New York City - Wikipedia'

In [24]:
# Pick the first <table> whose class attribute is "wikitable sortable"
table = soup.find('table', attrs={'class': 'wikitable sortable'})

In [25]:
# Get the rows: every <tr> element found inside the table
rows = table.find_all('tr')
# Display how many rows were found
len(rows)

177

In [26]:
# Column names: the text of each <th> in the first row, cleaned up.
# A "(...)" note may be followed by two or one "[...]" markers; the longer
# form is listed first in the alternation so it is tried before the shorter one.
pat1 = r'\(.*?\)\[.*?\]\[.*?\]'
pat2 = r'\(.*?\)\[.*?\]'
com_pat = pat1 + r'|' + pat2
header = [
    re.sub(com_pat, '', header_cell.text).rstrip()
    for header_cell in rows[0].find_all('th')
]
header

['Name',
 'Borough/County',
 '2006',
 '2007',
 '2008',
 '2009',
 '2010',
 '2011',
 '2012',
 '2013',
 '2014',
 '2015',
 '2016',
 '2017',
 '2018',
 '2019',
 '2020']

In [27]:
# Get the length of the header, i.e. the number of column names;
# the scrape cell below indexes header by position
len(header)

17

In [28]:
# Scrape every data row into a list of records, then build the DataFrame once.
# (DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies it on every iteration.)
records = []
for row in rows[1:]:

    cells = row.find_all('td')

    records.append({
        # The first two columns hold plain text
        header[0]: cells[0].text.rstrip(),
        header[1]: cells[1].text.rstrip(),
        # Every remaining column holds a star rating for one guide year
        **{header[i]: get_star_deluxe(cells, i) for i in range(2, len(header))},
    })

# columns=header fixes the column order and keeps the columns even with no rows
Wiki_NY = pd.DataFrame(records, columns=header)

Wiki_NY

Unnamed: 0,Name,Borough/County,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,15 East,Manhattan,,,,,,,,1 Michelin star,1 Michelin star,1 Michelin star,,,,,
1,A Voce Columbus,Manhattan,,1 Michelin star,1 Michelin star,,1 Michelin star,1 Michelin star,1 Michelin star,1 Michelin star,1 Michelin star,,,,,,
2,A Voce Madison,Manhattan,,,,,,1 Michelin star,1 Michelin star,1 Michelin star,1 Michelin star,,,,,,
3,Adour,Manhattan,,,,2 Michelin stars,1 Michelin star,1 Michelin star,1 Michelin star,1 Michelin star,,,,,,,
4,Agern,Manhattan,,,,,,,,,,,,1 Michelin star,1 Michelin star,1 Michelin star,1 Michelin star
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
171,Vong,Manhattan,1 Michelin star,1 Michelin star,1 Michelin star,,,,,,,,,,,,
172,Wallsé,Manhattan,1 Michelin star,1 Michelin star,1 Michelin star,1 Michelin star,1 Michelin star,1 Michelin star,1 Michelin star,1 Michelin star,1 Michelin star,1 Michelin star,1 Michelin star,1 Michelin star,1 Michelin star,1 Michelin star,1 Michelin star
173,wd~50,Manhattan,1 Michelin star,1 Michelin star,1 Michelin star,1 Michelin star,1 Michelin star,1 Michelin star,1 Michelin star,1 Michelin star,1 Michelin star,,,,,,
174,Zabb Elee,Queens,,,,,,,,,,1 Michelin star,,,,,


In [29]:
# Export to a CSV file; index=False leaves the DataFrame's row index out of the file
# NOTE(review): the relative path assumes a Data/Wikipedia/ directory already exists — confirm
Wiki_NY.to_csv('Data/Wikipedia/Wiki_NY.csv',index=False)

## Scrape Wiki SF

In [30]:
# Wiki SF Michelin url
url = 'https://en.wikipedia.org/wiki/List_of_Michelin_starred_restaurants_in_San_Francisco_Bay_Area'

# Download the webpage. requests applies no timeout by default, so an explicit
# one keeps this cell from hanging forever if the server never answers.
page = requests.get(url, timeout=30)
page.status_code # 200 means the request succeeded

200

In [31]:
# Turn the downloaded bytes into a navigable tree using Python's built-in HTML parser
soup = BeautifulSoup(page.content, features='html.parser')

In [32]:
# Title of the wiki page: display the text of the <title> element
# (useful to confirm which article was fetched)
soup.title.string

'List of Michelin starred restaurants in San Francisco Bay Area - Wikipedia'

In [33]:
# Pick the first <table> whose class attribute is "wikitable sortable"
table = soup.find('table', attrs={'class': 'wikitable sortable'})

In [34]:
# Get the rows: every <tr> element found inside the table
rows = table.find_all('tr')
# Display how many rows were found
len(rows)

63

In [35]:
# Column names: the text of each <th> in the first row, with any "(...)[...]"
# fragment (presumably footnote markers) removed and trailing whitespace trimmed
header = [
    re.sub(r'\(.*?\)\[.*?\]', '', header_cell.text).rstrip()
    for header_cell in rows[0].find_all('th')
]
header

['Name', 'City/Neighborhood', '2019']

In [36]:
# Get the length of the header, i.e. the number of column names;
# the scrape cell below indexes header by position
len(header)

3

In [37]:
# Scrape every data row into a list of records, then build the DataFrame once.
# (DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies it on every iteration.)
records = []
for row in rows[1:]:

    cells = row.find_all('td')

    records.append({
        # The first two columns hold plain text
        header[0]: cells[0].text.rstrip(),
        header[1]: cells[1].text.rstrip(),
        # Every remaining column holds a star rating for one guide year
        **{header[i]: get_star(cells[i]) for i in range(2, len(header))},
    })

# columns=header fixes the column order and keeps the columns even with no rows
Wiki_SF = pd.DataFrame(records, columns=header)

Wiki_SF

Unnamed: 0,Name,City/Neighborhood,2019
0,Acquerello,San Francisco - Polk Gulch,2 Michelin stars
1,Al's Place,San Francisco - Mission District,1 Michelin star
2,Angler,San Francisco - Financial District,1 Michelin star
3,Auberge du Soleil,Rutherford,1 Michelin star
4,Aubergine,Monterey,1 Michelin star
...,...,...,...
57,The Progress,San Francisco - Fillmore,1 Michelin star
58,The Restaurant at Meadowood,St. Helena,3 Michelin stars
59,The Village Pub,Woodside,1 Michelin star
60,Wako,San Francisco - Richmond District,1 Michelin star


In [38]:
# Export to a CSV file; index=False leaves the DataFrame's row index out of the file
# NOTE(review): the relative path assumes a Data/Wikipedia/ directory already exists — confirm
Wiki_SF.to_csv('Data/Wikipedia/Wiki_SF.csv',index=False)

## Scrape Wiki LA

In [39]:
# Wiki LA Michelin url
url = 'https://en.wikipedia.org/wiki/List_of_Michelin_starred_restaurants_in_Los_Angeles'

# Download the webpage. requests applies no timeout by default, so an explicit
# one keeps this cell from hanging forever if the server never answers.
page = requests.get(url, timeout=30)
page.status_code # 200 means the request succeeded

200

In [40]:
# Turn the downloaded bytes into a navigable tree using Python's built-in HTML parser
soup = BeautifulSoup(page.content, features='html.parser')

In [41]:
# Title of the wiki page: display the text of the <title> element
# (useful to confirm which article was fetched)
soup.title.string

'List of Michelin starred restaurants in Los Angeles - Wikipedia'

In [42]:
# Pick the first <table> whose class attribute is "wikitable sortable"
table = soup.find('table', attrs={'class': 'wikitable sortable'})

In [43]:
# Get the rows: every <tr> element found inside the table
rows = table.find_all('tr')
# Display how many rows were found
len(rows)

45

In [44]:
# Column names: the text of each <th> in the first row, with every "[...]"
# fragment (presumably footnote markers) removed and trailing whitespace trimmed
header = [
    re.sub(r'\[.*?\]', '', header_cell.text).rstrip()
    for header_cell in rows[0].find_all('th')
]
header

['Name', 'City/Neighborhood', '2008', '2009', 'No Guide (2010-2018)', '2019']

In [45]:
# Get the length of the header, i.e. the number of column names;
# the scrape cell below indexes header by position
len(header)

6

In [46]:
# Scrape every data row into a list of records, then build the DataFrame once.
# (DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies it on every iteration.)
records = []
for row in rows[1:]:

    cells = row.find_all('td')

    records.append({
        # The first two columns hold plain text
        header[0]: cells[0].text.rstrip(),
        header[1]: cells[1].text.rstrip(),
        # Every remaining column is read as a star rating
        **{header[i]: get_star_deluxe(cells, i) for i in range(2, len(header))},
    })

# columns=header fixes the column order and keeps the columns even with no rows
Wiki_LA = pd.DataFrame(records, columns=header)

Wiki_LA

Unnamed: 0,Name,City/Neighborhood,2008,2009,No Guide (2010-2018),2019
0,Addison,San Diego,,1 Michelin star,,
1,Asanebo,Los Angeles - Studio City,1 Michelin star,1 Michelin star,,
2,Bastide,West Hollywood,,1 Michelin star,,
3,Bistro Na's,Temple City,,1 Michelin star,,
4,CUT by Wolfgang Puck,Beverly Hills,1 Michelin star,1 Michelin star,,1 Michelin star
5,Dialogue,Santa Monica,,1 Michelin star,,
6,Hana Re,Costa Mesa,,1 Michelin star,,
7,Hatfield's,Los Angeles,,1 Michelin star,,
8,Hayato,Los Angeles,,1 Michelin star,,
9,Joe's Restaurant,Los Angeles - Venice,1 Michelin star,,,


In [47]:
# Export to a CSV file; index=False leaves the DataFrame's row index out of the file
# NOTE(review): the relative path assumes a Data/Wikipedia/ directory already exists — confirm
Wiki_LA.to_csv('Data/Wikipedia/Wiki_LA.csv',index=False)