## Preparation

In [1]:
# Data processing
import pandas as pd
import country_converter as coco

# Scraping web content
import requests # For downloading the website
from bs4 import BeautifulSoup # For parsing the website
import time # To put the system to sleep
import random # For random numbers

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

## Extract Relevant Links

In [2]:
# Yelp search url for restaurants in Washington, DC (no '&start=' offset, i.e. the first results page)
DC_url = 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC'

# Download the webpage
DC_page = requests.get(DC_url)
DC_page.status_code # 200 == the request succeeded

200

In [3]:
# Parse the downloaded DC page with the 'html.parser' backend
DC_soup = BeautifulSoup(DC_page.content,'html.parser')

In [4]:
# Build the set of DC search-result page urls: the base url plus one url
# per '&start=' offset (10, 20, ... up to, but excluding, DC_page_no*10)
DC_page_no = 24
DC_urls = {DC_url} | {DC_url + '&start=' + str(offset)
                      for offset in range(10, DC_page_no*10, 10)}

DC_urls

{'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=10',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=100',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=110',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=120',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=130',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=140',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=150',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=160',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=170',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=180',

In [5]:
# Extract relevant links for one page
links = set()

for tag in DC_soup.select('span > a'):
    # An <a> tag without an href attribute yields None here; skip it
    # instead of letting the substring test below raise a TypeError.
    href = tag.attrs.get('href')
    if not href:
        continue

    # Keep only site-relative restaurant links and make them absolute
    if '?osq=Restaurants' in href and 'https:' not in href:
        links.add('https://www.yelp.com'+href)

links

{'https://www.yelp.com/biz/butter-me-up-washington?osq=Restaurants',
 'https://www.yelp.com/biz/cane-washington-3?osq=Restaurants',
 'https://www.yelp.com/biz/gypsy-kitchen-washington?osq=Restaurants',
 'https://www.yelp.com/biz/il-canale-washington-2?osq=Restaurants',
 'https://www.yelp.com/biz/le-diplomate-washington?osq=Restaurants',
 'https://www.yelp.com/biz/mr-braxton-bar-and-kitchen-washington?osq=Restaurants',
 'https://www.yelp.com/biz/old-ebbitt-grill-washington?osq=Restaurants',
 'https://www.yelp.com/biz/reren-washington?osq=Restaurants',
 'https://www.yelp.com/biz/the-alibi-washington?osq=Restaurants',
 'https://www.yelp.com/biz/the-block-washington?osq=Restaurants'}

In [6]:
# Build a scraper to extract relevant links for multiple pages
def page_scraper(urls=None,sleep=3):
    """
    Scrape Yelp search-result urls to extract relevant restaurant links.

    Args:
        urls (iterable of str): Yelp urls to download; None is treated as an empty collection.
        sleep (int or float): upper bound, in seconds, of the random uniform pause taken after each request; defaults to 3.

    Returns:
        set: set containing all relevant restaurant links as absolute urls.
    """
    links = set()

    # The default argument is None, which cannot be iterated
    if urls is None:
        urls = []

    for url in urls:

        # Keep track of where we are at
        print(url)

        # Download the webpage
        page = requests.get(url)

        # If a connection was reached
        if page.status_code == 200:

            # Parse
            soup = BeautifulSoup(page.content,'html.parser')

            for tag in soup.select('span > a'):
                # <a> tags without an href give None; skip them so the
                # substring tests below cannot raise a TypeError.
                href = tag.attrs.get('href')
                if not href:
                    continue

                # Keep only site-relative restaurant links and make them absolute
                if '?osq=Restaurants' in href and 'https:' not in href:
                    links.add('https://www.yelp.com'+href)

        # Put the system to sleep for a random draw of time
        time.sleep(random.uniform(0,sleep))

    # Return data
    return links

In [7]:
# Scrape every DC search-result page url and collect the restaurant links found on them
DC_links = page_scraper(urls=DC_urls)

https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=200
https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=170
https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=120
https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=140
https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=160
https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=150
https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=130
https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=60
https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=210
https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=100
https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=190
https://www.yelp.com/search?find_

In [8]:
# Display the set of restaurant links collected for DC
DC_links

{'https://www.yelp.com/biz/14th-st-cafe-asian-bistro-washington-2?osq=Restaurants',
 'https://www.yelp.com/biz/1914-by-kolben-washington?osq=Restaurants',
 'https://www.yelp.com/biz/a-baked-joint-washington-9?osq=Restaurants',
 'https://www.yelp.com/biz/agora-washington-4?osq=Restaurants',
 'https://www.yelp.com/biz/akira-ramen-and-izakaya-dc-washington-2?osq=Restaurants',
 'https://www.yelp.com/biz/al-volo-trattoria-and-bar-washington?osq=Restaurants',
 'https://www.yelp.com/biz/albi-washington?osq=Restaurants',
 'https://www.yelp.com/biz/ambar-arlington-4?osq=Restaurants',
 'https://www.yelp.com/biz/ambar-washington?osq=Restaurants',
 'https://www.yelp.com/biz/anafre-washington-2?osq=Restaurants',
 'https://www.yelp.com/biz/anju-washington?osq=Restaurants',
 'https://www.yelp.com/biz/ankara-washington?osq=Restaurants',
 'https://www.yelp.com/biz/arepa-zone-washington-5?osq=Restaurants',
 'https://www.yelp.com/biz/astoria-dc-washington?osq=Restaurants',
 'https://www.yelp.com/biz/baan

In [9]:
# Count the unique restaurant links collected for DC
len(DC_links)

240

## Scrape Yelp DC

In [10]:
# Collect one record per result container that holds an <h4> heading.
# Rows are gathered in a list and the frame is built once at the end,
# because DataFrame.append was removed in pandas 2.0 (and copied the
# whole frame on every call).
Yelp_DC_rows = []

for tag in DC_soup.select('[class*=container]'):
    if tag.find('h4'):
        Yelp_DC_rows.append({'business_name':tag.select('h4')[0].get_text(),
                             'rating':tag.select('[class*=stars]')[0]['aria-label'],
                             'review_count':tag.select('[class*=reviewCount]')[0].get_text(),
                             'price_category':tag.select('[class*=priceCategory]')[0].get_text()})

# Passing the columns explicitly keeps the schema even when no rows were found
Yelp_DC = pd.DataFrame(Yelp_DC_rows,
                       columns=['business_name','rating',
                                'review_count','price_category'])

Yelp_DC

Unnamed: 0,business_name,rating,review_count,price_category
0,1. Le Diplomate,4.5 star rating,3596,"$$$Brasseries, French, Breakfast & Brunch"
1,2. Gypsy Kitchen,4.5 star rating,70,"Tapas/Small Plates, Mediterranean"
2,3. Butter Me Up,4.5 star rating,103,Breakfast & Brunch
3,4. The Block,5 star rating,12,"Food Court, Bars, Asian Fusion"
4,5. The Alibi,4.5 star rating,441,"$$Pubs, Sandwiches, Barbeque"
5,6. Old Ebbitt Grill,4 star rating,8511,"$$Bars, American (Traditional), Breakfast & Br..."
6,7. il Canale,4.5 star rating,3239,"$$Italian, Pizza"
7,8. Mr Braxton Bar & Kitchen,4.5 star rating,198,"$$American (New), Cocktail Bars"
8,9. Cane,4.5 star rating,278,$$Caribbean
9,10. Reren,4 star rating,1899,"$$Ramen, Asian Fusion"


In [11]:
# Build a scraper for Yelp links
def Yelp_scraper(url=None):
    """
    Scrape a Yelp search-result url to extract business information.

    Every element matching '[class*=container]' that contains an <h4>
    heading becomes one row. A field whose element is missing from the
    container is stored as a missing value instead of raising.

    Args:
        url (str): string of a Yelp url.

    Returns:
        DataFrame: frame with the columns business_name, rating,
        review_count and price_category; it has no rows when the request
        does not return status 200 or no matching container is found.
    """
    columns = ['business_name','rating','review_count','price_category']

    # Rows are collected here and the frame is built once at the end;
    # DataFrame.append was removed in pandas 2.0.
    records = []

    # Download the webpage
    page = requests.get(url)

    # If a connection was reached
    if page.status_code == 200:

        # Parse
        soup = BeautifulSoup(page.content,'html.parser')

        for tag in soup.select('[class*=container]'):
            heading = tag.find('h4')
            if heading is None:
                continue

            # select_one returns None when nothing matches, so a card
            # lacking one of these elements no longer aborts the scrape
            stars = tag.select_one('[class*=stars]')
            reviews = tag.select_one('[class*=reviewCount]')
            price = tag.select_one('[class*=priceCategory]')

            records.append({
                'business_name': heading.get_text(),
                'rating': stars.get('aria-label') if stars is not None else None,
                'review_count': reviews.get_text() if reviews is not None else None,
                'price_category': price.get_text() if price is not None else None,
            })

    # Return data (defined on every path, including non-200 responses)
    return pd.DataFrame(records, columns=columns)

In [12]:
# Scrape the first DC results page (DC_url) to check Yelp_scraper on a single url
Yelp_scraper(url=DC_url)

Unnamed: 0,business_name,rating,review_count,price_category
0,Crystal City Restaurant Gentleman’s Club,3 star rating,83,"$$Strip Clubs, American (New), Bars"
1,1. Le Diplomate,4.5 star rating,3596,"$$$Brasseries, French, Breakfast & Brunch"
2,2. Gypsy Kitchen,4.5 star rating,70,"Tapas/Small Plates, Mediterranean"
3,3. Butter Me Up,4.5 star rating,103,Breakfast & Brunch
4,4. The Block,5 star rating,12,"Food Court, Bars, Asian Fusion"
5,5. The Alibi,4.5 star rating,441,"$$Pubs, Sandwiches, Barbeque"
6,6. Old Ebbitt Grill,4 star rating,8511,"$$Bars, American (Traditional), Breakfast & Br..."
7,7. il Canale,4.5 star rating,3239,"$$Italian, Pizza"
8,8. Mr Braxton Bar & Kitchen,4.5 star rating,198,"$$American (New), Cocktail Bars"
9,9. Cane,4.5 star rating,278,$$Caribbean


In [13]:
# Build a function to scrape multiple Yelp links
def link_scrape(urls=None,sleep=3):
    """
    Scrape multiple Yelp links.

    Each url is passed to Yelp_scraper. A url whose scrape raises is
    reported and skipped so one bad page does not lose the pages already
    collected.

    Args:
        urls (iterable of str): Yelp urls to scrape; None is treated as an empty collection.
        sleep (int or float): upper bound, in seconds, of the random uniform pause taken after each url; defaults to 3.

    Returns:
        DataFrame: frame containing business information of all urls,
        with a fresh 0..n-1 index; empty when nothing was scraped.
    """
    # Per-page frames are collected and concatenated once at the end;
    # DataFrame.append was removed in pandas 2.0.
    frames = []

    # The default argument is None, which cannot be iterated
    if urls is None:
        urls = []

    for url in urls:

        print(url) # Keep track of where we are at

        try:
            # Scrape the content
            frame = Yelp_scraper(url)
        except Exception as err:
            # The original caught ImportError, which scraping never raises;
            # report the failure and continue with the next url instead.
            print('Skipping {}: {!r}'.format(url, err))
        else:
            if frame is not None:
                frames.append(frame)

        # Put the system to sleep for a random draw of time
        time.sleep(random.uniform(0,sleep))

    # pd.concat raises on an empty list, so handle "nothing scraped" explicitly
    if not frames:
        return pd.DataFrame([])

    return pd.concat(frames, ignore_index=True)

In [14]:
# Scrape all DC search-result pages into one DataFrame (each url is printed as it is processed)
DC_Yelp = link_scrape(urls=DC_urls)

https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=200
https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=170
https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=120
https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=140
https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=160
https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=150
https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=130
https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=60
https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=210
https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=100
https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=190
https://www.yelp.com/search?find_

In [15]:
# View a random sample of 10 rows of DC_Yelp
DC_Yelp.sample(10)

Unnamed: 0,business_name,rating,review_count,price_category
63,159. Glassey,4.5 star rating,32,Indian
76,Crystal City Restaurant Gentleman’s Club,3 star rating,83,"$$Strip Clubs, American (New), Bars"
23,121. Medium Rare,4 star rating,945,"$$Steakhouses, American (New), Desserts"
209,52. Da Hong Pao,4 star rating,702,"$$Cantonese, Dim Sum, Szechuan"
26,124. Emilie’s,4 star rating,122,American (New)
181,7. il Canale,4.5 star rating,3239,"$$Italian, Pizza"
179,5. The Alibi,4.5 star rating,441,"$$Pubs, Sandwiches, Barbeque"
136,185. Gogi Yogi,3.5 star rating,156,"$$Korean, Barbeque"
151,79. A Baked Joint,4.5 star rating,1912,"$Coffee & Tea, Breakfast & Brunch, Sandwiches"
235,236. Logan Tavern,4 star rating,872,"$$American (New), Sports Bars, American (Tradi..."


In [16]:
# Export DC_Yelp to a CSV file without the index column
# NOTE(review): assumes the Data/Yelp directory already exists — confirm
DC_Yelp.to_csv('Data/Yelp/DC_Yelp.csv', index = False)

## Scrape Yelp Chicago

In [17]:
# Yelp search url for restaurants in Chicago, IL (no '&start=' offset, i.e. the first results page)
Chicago_url = 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Chicago%2C%20IL'

# Download the webpage
Chicago_page = requests.get(Chicago_url)
Chicago_page.status_code # 200 == the request succeeded

200

In [18]:
# Parse the downloaded Chicago page with the 'html.parser' backend
Chicago_soup = BeautifulSoup(Chicago_page.content,'html.parser')

In [19]:
# Build the set of Chicago search-result page urls: the base url plus one
# url per '&start=' offset (10, 20, ... up to, but excluding, Chicago_page_no*10)
Chicago_page_no = 24
Chicago_urls = {Chicago_url} | {Chicago_url + '&start=' + str(offset)
                                for offset in range(10, Chicago_page_no*10, 10)}

Chicago_urls

{'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Chicago%2C%20IL',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Chicago%2C%20IL&start=10',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Chicago%2C%20IL&start=100',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Chicago%2C%20IL&start=110',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Chicago%2C%20IL&start=120',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Chicago%2C%20IL&start=130',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Chicago%2C%20IL&start=140',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Chicago%2C%20IL&start=150',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Chicago%2C%20IL&start=160',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Chicago%2C%20IL&start=170',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Chicago%2C%20IL&start=180',
 'https://www.yelp.com/search?fi

In [20]:
# Scrape all Chicago search-result pages into one DataFrame (each url is printed as it is processed)
Chicago_Yelp = link_scrape(urls=Chicago_urls)

https://www.yelp.com/search?find_desc=Restaurants&find_loc=Chicago%2C%20IL&start=210
https://www.yelp.com/search?find_desc=Restaurants&find_loc=Chicago%2C%20IL&start=200
https://www.yelp.com/search?find_desc=Restaurants&find_loc=Chicago%2C%20IL&start=50
https://www.yelp.com/search?find_desc=Restaurants&find_loc=Chicago%2C%20IL&start=220
https://www.yelp.com/search?find_desc=Restaurants&find_loc=Chicago%2C%20IL&start=110
https://www.yelp.com/search?find_desc=Restaurants&find_loc=Chicago%2C%20IL&start=130
https://www.yelp.com/search?find_desc=Restaurants&find_loc=Chicago%2C%20IL&start=100
https://www.yelp.com/search?find_desc=Restaurants&find_loc=Chicago%2C%20IL
https://www.yelp.com/search?find_desc=Restaurants&find_loc=Chicago%2C%20IL&start=10
https://www.yelp.com/search?find_desc=Restaurants&find_loc=Chicago%2C%20IL&start=180
https://www.yelp.com/search?find_desc=Restaurants&find_loc=Chicago%2C%20IL&start=70
https://www.yelp.com/search?find_desc=Restaurants&find_loc=Chicago%2C%20IL&sta

In [21]:
# View a random sample of 10 rows of Chicago_Yelp
Chicago_Yelp.sample(10)

Unnamed: 0,business_name,rating,review_count,price_category
212,93. Kai Sushi,4 star rating,636,"$$Sushi Bars, Japanese"
21,210. Mr Dumpling,4 star rating,44,Chinese
97,19. Bienmesabe,4 star rating,452,$$Venezuelan
240,240. Old Irving Brewing,4 star rating,494,"$$Breweries, American (New), Burgers"
81,4. Girl & the Goat,4.5 star rating,8688,"$$$American (New), Bakeries, Coffee & Tea"
1,211. Piccolo Sogno,4 star rating,1245,$$$Italian
226,26. Forbidden Root,4.5 star rating,473,"$$Brewpubs, Tapas/Small Plates, Cocktail Bars"
2,212. The Jibarito Stop,4.5 star rating,292,$Puerto Rican
77,Amazon Go,4 star rating,18,"Convenience Stores, Fast Food, Salad"
227,27. RAMEN-SAN,4 star rating,2280,$$Ramen


In [22]:
# Export Chicago_Yelp to a CSV file without the index column
# NOTE(review): assumes the Data/Yelp directory already exists — confirm
Chicago_Yelp.to_csv('Data/Yelp/Chicago_Yelp.csv', index = False)

## Scrape Yelp New York

In [23]:
# Yelp search url for restaurants in New York, NY (no '&start=' offset, i.e. the first results page)
NY_url = 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=New%20York%2C%20NY'

# Download the webpage
NY_page = requests.get(NY_url)
NY_page.status_code # 200 == the request succeeded

200

In [24]:
# Parse the downloaded New York page with the 'html.parser' backend
NY_soup = BeautifulSoup(NY_page.content,'html.parser')

In [25]:
# Build the set of New York search-result page urls: the base url plus one
# url per '&start=' offset (10, 20, ... up to, but excluding, NY_page_no*10)
NY_page_no = 24
NY_urls = {NY_url} | {NY_url + '&start=' + str(offset)
                      for offset in range(10, NY_page_no*10, 10)}

NY_urls

{'https://www.yelp.com/search?find_desc=Restaurants&find_loc=New%20York%2C%20NY',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=New%20York%2C%20NY&start=10',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=New%20York%2C%20NY&start=100',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=New%20York%2C%20NY&start=110',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=New%20York%2C%20NY&start=120',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=New%20York%2C%20NY&start=130',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=New%20York%2C%20NY&start=140',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=New%20York%2C%20NY&start=150',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=New%20York%2C%20NY&start=160',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=New%20York%2C%20NY&start=170',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=New%20York%2C%20NY&start=180',

In [26]:
# Scrape all New York search-result pages into one DataFrame (each url is printed as it is processed)
NY_Yelp = link_scrape(urls=NY_urls)

https://www.yelp.com/search?find_desc=Restaurants&find_loc=New%20York%2C%20NY&start=210
https://www.yelp.com/search?find_desc=Restaurants&find_loc=New%20York%2C%20NY&start=230
https://www.yelp.com/search?find_desc=Restaurants&find_loc=New%20York%2C%20NY&start=70
https://www.yelp.com/search?find_desc=Restaurants&find_loc=New%20York%2C%20NY
https://www.yelp.com/search?find_desc=Restaurants&find_loc=New%20York%2C%20NY&start=50
https://www.yelp.com/search?find_desc=Restaurants&find_loc=New%20York%2C%20NY&start=30
https://www.yelp.com/search?find_desc=Restaurants&find_loc=New%20York%2C%20NY&start=100
https://www.yelp.com/search?find_desc=Restaurants&find_loc=New%20York%2C%20NY&start=120
https://www.yelp.com/search?find_desc=Restaurants&find_loc=New%20York%2C%20NY&start=130
https://www.yelp.com/search?find_desc=Restaurants&find_loc=New%20York%2C%20NY&start=10
https://www.yelp.com/search?find_desc=Restaurants&find_loc=New%20York%2C%20NY&start=160
https://www.yelp.com/search?find_desc=Restaura

In [27]:
# View a random sample of 10 rows of NY_Yelp
NY_Yelp.sample(10)

Unnamed: 0,business_name,rating,review_count,price_category
256,143. Raku,4.5 star rating,1098,"$$Japanese, Noodles"
116,166. Benemon,4.5 star rating,570,"$$Japanese, Comfort Food, Tapas/Small Plates"
79,122. Shuka,4 star rating,493,"$$Mediterranean, Cocktail Bars"
40,7. Hometown Bar-B-Que,4 star rating,1405,"$$Barbeque, Smokehouse"
251,69. Madre,5 star rating,66,$$$American (New)
199,91. Bunna Cafe,4.5 star rating,634,"$$Ethiopian, Vegan, African"
58,33. Aoi Kitchen,4.5 star rating,90,"$$Izakaya, Japanese Curry"
7,217. Khe-Yo,4 star rating,776,$$Laotian
254,141. Sweet Chick,4 star rating,1662,"$$Southern, Breakfast & Brunch, American (Trad..."
113,163. Claw Daddy’s,4 star rating,1711,"$$Cajun/Creole, Seafood"


In [28]:
# Export NY_Yelp to a CSV file without the index column
# NOTE(review): assumes the Data/Yelp directory already exists — confirm
NY_Yelp.to_csv('Data/Yelp/NY_Yelp.csv', index = False)

## Scrape Yelp San Francisco Bay Area

In [29]:
# Yelp search url for restaurants in the San Francisco Bay Area, CA (no '&start=' offset, i.e. the first results page)
SF_url = 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=San%20Francisco%20Bay%20Area%2C%20CA'

# Download the webpage
SF_page = requests.get(SF_url)
SF_page.status_code # 200 == the request succeeded

200

In [30]:
# Parse the downloaded San Francisco page with the 'html.parser' backend
# (this previously parsed NY_page.content, a copy-paste slip from the New York section)
SF_soup = BeautifulSoup(SF_page.content,'html.parser')

In [34]:
# Build the set of San Francisco search-result page urls: the base url plus
# one url per '&start=' offset (10, 20, ... up to, but excluding, SF_page_no*10)
SF_page_no = 24
SF_urls = {SF_url} | {SF_url + '&start=' + str(offset)
                      for offset in range(10, SF_page_no*10, 10)}

SF_urls

{'https://www.yelp.com/search?find_desc=Restaurants&find_loc=San%20Francisco%20Bay%20Area%2C%20CA',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=San%20Francisco%20Bay%20Area%2C%20CA&start=10',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=San%20Francisco%20Bay%20Area%2C%20CA&start=100',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=San%20Francisco%20Bay%20Area%2C%20CA&start=110',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=San%20Francisco%20Bay%20Area%2C%20CA&start=120',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=San%20Francisco%20Bay%20Area%2C%20CA&start=130',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=San%20Francisco%20Bay%20Area%2C%20CA&start=140',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=San%20Francisco%20Bay%20Area%2C%20CA&start=150',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=San%20Francisco%20Bay%20Area%2C%20CA&start=160',
 'https://www.yelp.co

In [35]:
# Scrape all San Francisco search-result pages into one DataFrame (each url is printed as it is processed)
SF_Yelp = link_scrape(urls=SF_urls)

https://www.yelp.com/search?find_desc=Restaurants&find_loc=San%20Francisco%20Bay%20Area%2C%20CA&start=90
https://www.yelp.com/search?find_desc=Restaurants&find_loc=San%20Francisco%20Bay%20Area%2C%20CA&start=190
https://www.yelp.com/search?find_desc=Restaurants&find_loc=San%20Francisco%20Bay%20Area%2C%20CA&start=10
https://www.yelp.com/search?find_desc=Restaurants&find_loc=San%20Francisco%20Bay%20Area%2C%20CA&start=210
https://www.yelp.com/search?find_desc=Restaurants&find_loc=San%20Francisco%20Bay%20Area%2C%20CA&start=230
https://www.yelp.com/search?find_desc=Restaurants&find_loc=San%20Francisco%20Bay%20Area%2C%20CA&start=50
https://www.yelp.com/search?find_desc=Restaurants&find_loc=San%20Francisco%20Bay%20Area%2C%20CA&start=130
https://www.yelp.com/search?find_desc=Restaurants&find_loc=San%20Francisco%20Bay%20Area%2C%20CA&start=30
https://www.yelp.com/search?find_desc=Restaurants&find_loc=San%20Francisco%20Bay%20Area%2C%20CA&start=150
https://www.yelp.com/search?find_desc=Restaurants&

In [36]:
# View a random sample of 10 rows of SF_Yelp
SF_Yelp.sample(10)

Unnamed: 0,business_name,rating,review_count,price_category
210,81. Cambodian Street Food,4.5 star rating,138,Cambodian
85,38. Smish Smash,4.5 star rating,27,Burgers
91,153. Rio California,4.5 star rating,440,"$Brazilian, American (New)"
105,46. Pho Huong Que,4.5 star rating,98,$$Vietnamese
93,155. The Lunch Box,4.5 star rating,462,"$Sandwiches, American (New), Salad"
155,121. Mill’s Hoagie & Deli Shop,4 star rating,306,"$Sandwiches, Delis, Cheesesteaks"
208,210. The Mana’eesh Lady,5 star rating,1,"Pop-Up Restaurants, Middle Eastern"
134,142. Cam Huong Cafe,4 star rating,443,"$Vietnamese, Delis, Sandwiches"
233,172. Tasty Pot,3.5 star rating,559,"$$Taiwanese, Bubble Tea, Hot Pot"
244,182. Millennium,4 star rating,739,"$$$Vegan, Vegetarian, Wine Bars"


In [37]:
# Export SF_Yelp to a CSV file without the index column
# NOTE(review): assumes the Data/Yelp directory already exists — confirm
SF_Yelp.to_csv('Data/Yelp/SF_Yelp.csv', index = False)

## Scrape Yelp Los Angeles

In [38]:
# Yelp search url for restaurants in Los Angeles, CA (no '&start=' offset, i.e. the first results page)
LA_url = 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Los%20Angeles%2C%20CA'

# Download the webpage
LA_page = requests.get(LA_url)
LA_page.status_code # 200 == the request succeeded

200

In [39]:
# Parse the downloaded Los Angeles page with the 'html.parser' backend
LA_soup = BeautifulSoup(LA_page.content,'html.parser')

In [40]:
# Build the set of Los Angeles search-result page urls: the base url plus
# one url per '&start=' offset (10, 20, ... up to, but excluding, LA_page_no*10)
LA_page_no = 24
LA_urls = {LA_url} | {LA_url + '&start=' + str(offset)
                      for offset in range(10, LA_page_no*10, 10)}

LA_urls

{'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Los%20Angeles%2C%20CA',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Los%20Angeles%2C%20CA&start=10',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Los%20Angeles%2C%20CA&start=100',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Los%20Angeles%2C%20CA&start=110',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Los%20Angeles%2C%20CA&start=120',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Los%20Angeles%2C%20CA&start=130',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Los%20Angeles%2C%20CA&start=140',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Los%20Angeles%2C%20CA&start=150',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Los%20Angeles%2C%20CA&start=160',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Los%20Angeles%2C%20CA&start=170',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=

In [41]:
# Scrape all Los Angeles search-result pages into one DataFrame (each url is printed as it is processed)
LA_Yelp = link_scrape(urls=LA_urls)

https://www.yelp.com/search?find_desc=Restaurants&find_loc=Los%20Angeles%2C%20CA&start=50
https://www.yelp.com/search?find_desc=Restaurants&find_loc=Los%20Angeles%2C%20CA&start=150
https://www.yelp.com/search?find_desc=Restaurants&find_loc=Los%20Angeles%2C%20CA&start=10
https://www.yelp.com/search?find_desc=Restaurants&find_loc=Los%20Angeles%2C%20CA&start=100
https://www.yelp.com/search?find_desc=Restaurants&find_loc=Los%20Angeles%2C%20CA&start=160
https://www.yelp.com/search?find_desc=Restaurants&find_loc=Los%20Angeles%2C%20CA&start=70
https://www.yelp.com/search?find_desc=Restaurants&find_loc=Los%20Angeles%2C%20CA&start=80
https://www.yelp.com/search?find_desc=Restaurants&find_loc=Los%20Angeles%2C%20CA&start=170
https://www.yelp.com/search?find_desc=Restaurants&find_loc=Los%20Angeles%2C%20CA&start=20
https://www.yelp.com/search?find_desc=Restaurants&find_loc=Los%20Angeles%2C%20CA&start=140
https://www.yelp.com/search?find_desc=Restaurants&find_loc=Los%20Angeles%2C%20CA
https://www.ye

In [42]:
# View a random sample of 10 rows of LA_Yelp
LA_Yelp.sample(10)

Unnamed: 0,business_name,rating,review_count,price_category
125,234. Sonoritas Prime Tacos,4 star rating,1196,$$Mexican
69,83. Taqueria Los Anaya,4.5 star rating,1211,$$Mexican
218,129. Osteria Mozza,4 star rating,2816,"$$$$Italian, Wine Bars"
175,70. Rutt’s Hawaiian Cafe,4 star rating,1773,$$Hawaiian
229,209. Mama Shelter Rooftop Bar,3.5 star rating,509,"$$American (New), Cocktail Bars, Wine Bars"
19,158. Alta West Adams,4 star rating,345,"$$Soul Food, Venues & Event Spaces"
232,191. Berlins,4.5 star rating,1001,"$Bubble Tea, German, Sandwiches"
18,157. Dulan’s On Crenshaw,4 star rating,660,$$Soul Food
243,41. My Two Cents,4.5 star rating,700,"$$Southern, Seafood, Soul Food"
202,34. Papi Tacos & Churros,4 star rating,410,Tacos


In [43]:
# Export LA_Yelp to a CSV file without the index column
# NOTE(review): assumes the Data/Yelp directory already exists — confirm
LA_Yelp.to_csv('Data/Yelp/LA_Yelp.csv', index = False)