In [1]:
# Data processing
import pandas as pd
import country_converter as coco

# Scraping web content
import requests
from bs4 import BeautifulSoup
import time
import random

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Yelp DC restaurants url
DC_url = 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC'

# Download the webpage
DC_page = requests.get(DC_url)
DC_page.status_code # 200 == Connection

200

In [3]:
# Parse the content
DC_soup = BeautifulSoup(DC_page.content,'html.parser')

In [4]:
# Extract page urls
urls = set()
page_no = 24

for i in range(10, page_no*10, 10):
    urls.update([DC_url+'&start='+str(i)])

urls

{'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=10',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=100',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=110',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=120',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=130',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=140',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=150',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=160',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=170',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=180',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&s

In [5]:
# Extract relevant links for one page
links = set()

for tag in DC_soup.select('span > a'):
    href = tag.attrs.get('href')
    if '?osq=Restaurants' in href and 'https:' not in href:
        links.update(['https://www.yelp.com'+href])

links

{'https://www.yelp.com/biz/butter-me-up-washington?osq=Restaurants',
 'https://www.yelp.com/biz/founding-farmers-washington-washington-3?osq=Restaurants',
 'https://www.yelp.com/biz/gypsy-kitchen-washington?osq=Restaurants',
 'https://www.yelp.com/biz/hot-n-juicy-crawfish-washington?osq=Restaurants',
 'https://www.yelp.com/biz/il-canale-washington-2?osq=Restaurants',
 'https://www.yelp.com/biz/le-diplomate-washington?osq=Restaurants',
 'https://www.yelp.com/biz/mr-braxton-bar-and-kitchen-washington?osq=Restaurants',
 'https://www.yelp.com/biz/old-ebbitt-grill-washington?osq=Restaurants',
 'https://www.yelp.com/biz/prost-washington-2?osq=Restaurants',
 'https://www.yelp.com/biz/the-alibi-washington?osq=Restaurants'}

In [6]:
# Build a scraper to extract links for all pages
def page_scraper(url=None,page_no=None,sleep=3):
    """
    Scrape a Yelp url.

    Args:
        url (str): string of Yelp url.

    Returns:
        set: containing all relevant restaurant links.
    """
    # Download the webpage
    page = requests.get(url)

    # If a connection was reached
    if page.status_code == 200:

        # Parse
        soup = BeautifulSoup(page.content,'html.parser')

        # Extract page urls
        urls = set([url])
        for i in range(10, page_no*10, 10):
            urls.update([url+'&start='+str(i)])

    # Extract all relevant links
    links = set()

    for url in urls:

        # Keep track of where we are at
        print(url)

        # Download the webpage
        page = requests.get(url)

        # If a connection was reached
        if page.status_code == 200:

                # Parse
                soup = BeautifulSoup(page.content,'html.parser')

                for tag in soup.select('span > a'):
                    href = tag.attrs.get('href')
                    if '?osq=Restaurants' in href and 'https:' not in href:
                        links.update(['https://www.yelp.com'+href])

        # Put the system to sleep for a random draw of time
        time.sleep(random.uniform(0,sleep))

    # Return data
    return links

In [7]:
# Scrape Yelp DC page
DC_links = page_scraper(url=DC_url,page_no=24)

https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=170
https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=60
https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=140
https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=40
https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=110
https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=190
https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=200
https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=160
https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=150
https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=10
https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=50
https://www.yelp.com/search?find_des

In [9]:
# View DC_links
DC_links

{'https://www.yelp.com/biz/14th-st-cafe-asian-bistro-washington-2?osq=Restaurants',
 'https://www.yelp.com/biz/1914-by-kolben-washington?osq=Restaurants',
 'https://www.yelp.com/biz/801-restaurant-and-bar-washington?osq=Restaurants',
 'https://www.yelp.com/biz/a-baked-joint-washington-9?osq=Restaurants',
 'https://www.yelp.com/biz/agora-washington-4?osq=Restaurants',
 'https://www.yelp.com/biz/akira-ramen-and-izakaya-dc-washington-2?osq=Restaurants',
 'https://www.yelp.com/biz/al-volo-trattoria-and-bar-washington?osq=Restaurants',
 'https://www.yelp.com/biz/albi-washington?osq=Restaurants',
 'https://www.yelp.com/biz/ambar-arlington-4?osq=Restaurants',
 'https://www.yelp.com/biz/ambar-washington?osq=Restaurants',
 'https://www.yelp.com/biz/anafre-washington-2?osq=Restaurants',
 'https://www.yelp.com/biz/andys-pizza-washington?osq=Restaurants',
 'https://www.yelp.com/biz/anju-washington?osq=Restaurants',
 'https://www.yelp.com/biz/ankara-washington?osq=Restaurants',
 'https://www.yelp.c

In [10]:
# Check number of DC_links
len(DC_links)

240