In [1]:
# Data processing
import pandas as pd
import country_converter as coco

# Scraping web content
import requests # For downloading the website
from bs4 import BeautifulSoup # For parsing the website
import time # To put the system to sleep
import random # For random numbers

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Yelp DC restaurants url (first page of the search results)
DC_url = 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC'

# Download the webpage; the timeout keeps an unresponsive server from blocking the kernel indefinitely
DC_page = requests.get(DC_url, timeout=30)
DC_page.status_code # 200 means the request succeeded

200

In [3]:
# Turn the raw bytes of the downloaded page into a searchable tree, using Python's built-in HTML parser
DC_soup = BeautifulSoup(DC_page.content, features='html.parser')

In [4]:
# Build the url of every search-result page:
# the base url plus one '&start=<offset>' url per further page (offsets advance in steps of 10)
DC_page_no = 24
DC_urls = {DC_url}
DC_urls |= {DC_url+'&start='+str(offset) for offset in range(10, DC_page_no*10, 10)}

DC_urls

{'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=10',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=100',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=110',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=120',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=130',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=140',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=150',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=160',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=170',
 'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=180',

In [5]:
# Extract relevant links for one page
links = set()

for tag in DC_soup.select('span > a'):
    href = tag.attrs.get('href')

    # Anchors without an href give None; skip them so the substring checks below cannot raise TypeError
    if not href:
        continue

    # Keep only relative links carrying the '?osq=Restaurants' marker and make them absolute
    if '?osq=Restaurants' in href and 'https:' not in href:
        links.add('https://www.yelp.com'+href)

links

{'https://www.yelp.com/biz/cane-washington-3?osq=Restaurants',
 'https://www.yelp.com/biz/gypsy-kitchen-washington?osq=Restaurants',
 'https://www.yelp.com/biz/il-canale-washington-2?osq=Restaurants',
 'https://www.yelp.com/biz/le-diplomate-washington?osq=Restaurants',
 'https://www.yelp.com/biz/maydan-washington-2?osq=Restaurants',
 'https://www.yelp.com/biz/mr-braxton-bar-and-kitchen-washington?osq=Restaurants',
 'https://www.yelp.com/biz/roses-luxury-washington?osq=Restaurants',
 'https://www.yelp.com/biz/supreme-hot-pot-arlington-2?osq=Restaurants',
 'https://www.yelp.com/biz/the-alibi-washington?osq=Restaurants',
 'https://www.yelp.com/biz/the-block-washington?osq=Restaurants'}

In [6]:
# Build a scraper to extract relevant links
def page_scraper(urls=None,sleep=3,timeout=30):
    """
    Scrape Yelp search-result pages and collect the restaurant links found on them.

    Args:
        urls (iterable of str): Yelp urls to download; None is treated as "nothing to scrape".
        sleep (int or float): upper bound, in seconds, of the random uniform pause taken after each request; defaults to 3.
        timeout (int or float): seconds to wait for the server before requests raises a timeout error; defaults to 30.

    Returns:
        set: absolute urls built from every relative link whose href contains '?osq=Restaurants'.
    """
    links = set()

    # The default urls=None used to crash on iteration; return the empty set instead
    if urls is None:
        return links

    for url in urls:

        # Keep track of where we are at
        print(url)

        # Download the webpage; the timeout keeps one unresponsive request from hanging the whole run
        page = requests.get(url, timeout=timeout)

        if page.status_code == 200:

            # Parse
            soup = BeautifulSoup(page.content,'html.parser')

            for tag in soup.select('span > a'):
                href = tag.attrs.get('href')

                # Anchors without an href give None; skip them so the substring checks cannot raise TypeError
                if not href:
                    continue

                # Keep only relative links carrying the search marker and make them absolute
                if '?osq=Restaurants' in href and 'https:' not in href:
                    links.add('https://www.yelp.com'+href)
        else:
            # Report pages that were skipped instead of dropping them silently
            print('  skipped (status code '+str(page.status_code)+')')

        # Put the system to sleep for a random draw of time
        time.sleep(random.uniform(0,sleep))

    # Return data
    return links

In [7]:
# Scrape every Yelp DC search-result page listed in DC_urls and collect the restaurant links;
# page_scraper prints each url as it is requested and pauses a random amount of time between requests
DC_links = page_scraper(urls=DC_urls)

https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=10
https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=120
https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=100
https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=40
https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=50
https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=140
https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=70
https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=210
https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=130
https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=230
https://www.yelp.com/search?find_desc=Restaurants&find_loc=Washington%2C%20DC&start=160
https://www.yelp.com/search?find_des

In [8]:
# View DC_links (the set of restaurant urls returned by page_scraper)
DC_links

{'https://www.yelp.com/biz/14th-st-cafe-asian-bistro-washington-2?osq=Restaurants',
 'https://www.yelp.com/biz/1914-by-kolben-washington?osq=Restaurants',
 'https://www.yelp.com/biz/2-amys-neapolitan-pizzeria-washington?osq=Restaurants',
 'https://www.yelp.com/biz/agora-washington-4?osq=Restaurants',
 'https://www.yelp.com/biz/akira-ramen-and-izakaya-dc-washington-2?osq=Restaurants',
 'https://www.yelp.com/biz/albi-washington?osq=Restaurants',
 'https://www.yelp.com/biz/ambar-arlington-4?osq=Restaurants',
 'https://www.yelp.com/biz/ambar-washington?osq=Restaurants',
 'https://www.yelp.com/biz/anafre-washington-2?osq=Restaurants',
 'https://www.yelp.com/biz/andys-pizza-washington?osq=Restaurants',
 'https://www.yelp.com/biz/anju-washington?osq=Restaurants',
 'https://www.yelp.com/biz/ankara-washington?osq=Restaurants',
 'https://www.yelp.com/biz/anxo-cidery-and-pintxos-bar-washington?osq=Restaurants',
 'https://www.yelp.com/biz/astoria-dc-washington?osq=Restaurants',
 'https://www.yelp.

In [9]:
# Check number of DC_links; DC_links is a set, so this counts unique restaurant urls
len(DC_links)

240

In [14]:
# Collect one record per business found on the page, then build the DataFrame in a single call.
# (DataFrame.append was removed in pandas 2.0 and copied the whole frame on every added row.)
Yelp_DC_columns = ['business_name','rating','review_count','price_category']
Yelp_DC_records = []

for tag in DC_soup.select('[class*=container]'):
    # Containers without an <h4> are skipped; the heading text is used as the business name
    heading = tag.find('h4')
    if heading:
        Yelp_DC_records.append({'business_name':heading.get_text(),
                                'rating':tag.select('[aria-label*=rating]')[0]['aria-label'],
                                'review_count':tag.select('[class*=reviewCount]')[0].get_text(),
                                'price_category':tag.select('[class*=priceCategory]')[0].get_text()})

# Passing columns explicitly keeps the column order and yields an empty, correctly labelled frame when nothing matched
Yelp_DC = pd.DataFrame(Yelp_DC_records, columns=Yelp_DC_columns)

Yelp_DC

Unnamed: 0,business_name,rating,review_count,price_category
0,Troys Italian Kitchen,3.5 star rating,99,"$$Pizza, Vegan, Italian"
1,1. Le Diplomate,4.5 star rating,3596,"$$$Brasseries, French, Breakfast & Brunch"
2,2. Gypsy Kitchen,4.5 star rating,70,"Tapas/Small Plates, Mediterranean"
3,3. The Block,5 star rating,12,"Food Court, Bars, Asian Fusion"
4,4. Supreme Hot Pot,5 star rating,15,"Hot Pot, Seafood, Kebab"
5,5. il Canale,4.5 star rating,3238,"$$Italian, Pizza"
6,6. Rose’s Luxury,4.5 star rating,1969,"$$$American (New), Italian, Tapas/Small Plates"
7,7. Maydan,4.5 star rating,826,"$$Moroccan, Lebanese"
8,8. Cane,4.5 star rating,277,$$Caribbean
9,9. Mr Braxton Bar & Kitchen,4.5 star rating,198,"$$American (New), Cocktail Bars"
