### Master Q2 2016 DCA List of Brands to Scrape
https://docs.google.com/spreadsheets/d/15leb475qGWrnCGqoHy78FQbe4z84B79krPpeH5IZLqI/edit

### Final Google Sheet
https://docs.google.com/spreadsheets/d/17Z6N6zJaBRVVcphy6hGTG_o7g_Q_SiCSziEYoldQsYs/edit

# Prerequisites

In [1]:
# Import requests library for opening webpages
import requests

# Import the BeautifulSoup object from the bs4 library
# The object can be used directly (i.e. BeautifulSoup()); no need for bs4. prefix
from bs4 import BeautifulSoup

# Import the pandas library for data analysis and assign it an alias of "pd"
import pandas as pd

# Import Python's built-in datetime module and assign it an alias of "dt"
import datetime as dt

# Get all State Page URLS

In [8]:
# Optional timing aid: note when this cell starts running
print("Start Time : " + str(dt.datetime.now()))

# List that will receive one absolute URL per hyperlink on the main Locations page.
# NOTE: despite the name "CityURLS", the preview below shows state-level pages (AL, AK, ...)
CityURLS = []

# Download the main Locations page with requests and keep its HTML (.text),
# then parse that HTML with BeautifulSoup using the "lxml" parser
HTML = requests.get("https://locations.dennys.com/").text
soup = BeautifulSoup(HTML, "lxml")

# The first <div class="c-directory-list"> is the element holding the directory hyperlinks
container = soup.find("div", {"class" : "c-directory-list"})

# Visit every <a> tag inside that div; each href is appended to the
# main Denny's URL to form the complete address, which is then stored
for anchor in container.findAll("a"):
    href = anchor.get("href")
    CityURLS.append("https://locations.dennys.com/" + href)

# Note when the cell finished
print("End Time   : " + str(dt.datetime.now()))

# Show the first 5 collected URLs
CityURLS[:5]

Start Time : 2016-05-16 16:24:58.684597
End Time   : 2016-05-16 16:24:58.756382


['https://locations.dennys.com/AL',
 'https://locations.dennys.com/AK',
 'https://locations.dennys.com/AZ',
 'https://locations.dennys.com/AR',
 'https://locations.dennys.com/CA']

# Get All URLS within the State Pages

In [10]:
# Print start time
print("Start Time : " + str(dt.datetime.now()))

# Create empty Python list to store all URL pages linked from each state page
# These can either be a restaurant page or a city page -- we'll control for that in next cell
InnerURLS = []

# Loop through each URL collected from the main Locations page
for URL in CityURLS:

    # Open each URL with requests, feed the HTML into a BeautifulSoup object
    HTML = requests.get(URL).text
    soup = BeautifulSoup(HTML, "lxml")

    # Look for the "div" element with a class of "c-directory-list".
    # find() returns None when the page has no such element; skip those pages
    # explicitly instead of relying on a bare "except:", which would also hide
    # unrelated errors (and even a keyboard interrupt) raised inside the block.
    container = soup.find("div", {"class" : "c-directory-list"})
    if container is None:
        continue

    # Add the complete URL of every hyperlink ("a" tag) in the container to the InnerURLS list
    for link in container.find_all("a"):
        InnerURLS.append("https://locations.dennys.com/" + link.get("href"))

# Per the original notes, some state links on the primary Locations page lead to
# restaurant pages rather than a directory (e.g. a state with a single location),
# so the loop above collects nothing for them.
# Manually append these three restaurant URLS to the end of our list
InnerURLS.append("https://locations.dennys.com/DE/NEWARK/248777")
InnerURLS.append("https://locations.dennys.com/DC/WASHINGTON/248646")
InnerURLS.append("https://locations.dennys.com/DC/WASHINGTON/247145")

# Print end time
print("End Time   : " + str(dt.datetime.now()))

# Preview the first 5 results of the InnerURLS list
InnerURLS[:5]

Start Time : 2016-05-16 16:37:50.551641
End Time   : 2016-05-16 16:38:07.842669


['https://locations.dennys.com/AL/BIRMINGHAM/248621',
 'https://locations.dennys.com/AL/CULLMAN/248546',
 'https://locations.dennys.com/AL/DOTHAN/209311',
 'https://locations.dennys.com/AL/HOPE-HULL--TYSON-/209310',
 'https://locations.dennys.com/AL/HUNTSVILLE/248545']

# Run Through Restaurant List -- Add to One of Two New Lists

In [13]:
# Create two empty Python lists --- one will store restaurant URLS
# The other will store any URLS that are not restaurant pages (these are city pages)
RestaurantURLS = []
ExtraURLS = []

# Walk the InnerURLS list (which includes both restaurant and city pages).
# enumerate() hands over each URL's position directly; looking it up with
# InnerURLS.index(URL) rescans the list on every pass and returns the position
# of the FIRST occurrence whenever the same URL appears more than once.
for location, URL in enumerate(InnerURLS):

    # Progress report: for every 100th URL, print the position, the URL, and the current time.
    # Formatting one string and passing it to print() gives the same
    # space-separated output under both Python 2 and Python 3.
    if location % 100 == 0:
        print("%s %s %s" % (location, URL, dt.datetime.now()))

    # Open the URL and feed the HTML content to BeautifulSoup
    HTML = requests.get(URL).text
    soup = BeautifulSoup(HTML, "lxml")

    # A page with an <h1 itemprop="name"> element is treated as a restaurant page.
    # find() returns None when the element is absent, so test for that directly
    # rather than triggering an error and catching it with a bare "except:".
    heading = soup.find("h1", {"itemprop": "name"})
    if heading is None:
        # No such heading: treat it as a city page and keep it for the next cell
        ExtraURLS.append(URL)
    else:
        RestaurantURLS.append(URL)

0 https://locations.dennys.com/AL/BIRMINGHAM/248621 2016-05-16 16:43:57.436631
100 https://locations.dennys.com/CA/CLAREMONT/247811 2016-05-16 16:44:06.132505
200 https://locations.dennys.com/CA/NO-HOLLYWOOD/200164 2016-05-16 16:44:14.200785
300 https://locations.dennys.com/CA/TWENTYNINE-PALMS/247164 2016-05-16 16:44:21.675529
400 https://locations.dennys.com/FL/MIAMI-GARDENS/248145 2016-05-16 16:44:29.254064
500 https://locations.dennys.com/IL/HANOVER-PARK/247623 2016-05-16 16:44:36.702565
600 https://locations.dennys.com/MD/FREDERICK/201848 2016-05-16 16:44:44.910933
700 https://locations.dennys.com/NE/OMAHA/246635 2016-05-16 16:44:52.798268
800 https://locations.dennys.com/NC/FAYETTEVILLE/248801 2016-05-16 16:45:00.733432
900 https://locations.dennys.com/PA/CLIFTON-HEIGHTS/248776 2016-05-16 16:45:08.904494
1000 https://locations.dennys.com/TX/KATY 2016-05-16 16:45:16.856703
1100 https://locations.dennys.com/VA/RICHMOND 2016-05-16 16:45:24.532610


# Capture Extra Restaurant URLS from City Pages

In [14]:
# ExtraURLS holds the pages the previous cell could not identify as restaurant
# pages; they are treated as city pages that list several restaurants each
for city_page in ExtraURLS:

    # Fetch the city page and parse its HTML
    markup = requests.get(city_page).text
    page = BeautifulSoup(markup, "lxml")

    # Each restaurant on the page sits inside a "div" element
    # with a class of "c-location-grid-item-link-wrapper"
    wrappers = page.findAll("div", {"class" : "c-location-grid-item-link-wrapper"})

    # Pull the hyperlink out of every wrapper and add the complete URL to the
    # RestaurantURLS list, which already holds the restaurant pages found earlier
    for wrapper in wrappers:
        href = wrapper.find("a").get("href")
        # [3:] drops the first three characters of the href; the original note
        # describes them as extra characters (backslashes) -- presumably a
        # relative-path prefix, TODO confirm against the page markup
        RestaurantURLS.append("https://locations.dennys.com/" + href[3:])

# Create Empty DataFrame

In [15]:
# Column headers for the results table, listed in the order they will appear
columns = [
    "Name",
    "Address",
    "City",
    "State",
    "Zipcode",
    "Phone",
    "URL",
]

# Build a DataFrame that has those headers and no rows yet
df = pd.DataFrame(columns=columns)

# Display the (still empty) DataFrame
df

Unnamed: 0,Name,Address,City,State,Zipcode,Phone,URL


# Number of Restaurants

In [16]:
# Count the restaurant URLS collected so far: those recognised directly as
# restaurant pages plus those gathered from the city pages
len(RestaurantURLS)

1585

# The Scrape

In [17]:
# Scrape every URL in the RestaurantURLS list and add one row per page to "df".
#
# Rows are collected in a plain Python list and turned into a DataFrame once,
# instead of calling df.append() for every page: append() copies the whole
# DataFrame on each call and is no longer available in current pandas releases.
rows = []

try:
    # enumerate() supplies the list position directly; RestaurantURLS.index(URL)
    # rescans the list each time and reports the first occurrence for repeated URLs
    for location, URL in enumerate(RestaurantURLS):

        # Progress report: print the list position, URL, and time for every 100th URL.
        # A single formatted string prints identically under Python 2 and Python 3.
        if location % 100 == 0:
            print("%s %s %s" % (location, URL, dt.datetime.now()))

        # Open and parse the page
        HTML = requests.get(URL).text
        soup = BeautifulSoup(HTML, "lxml")

        # Find the HTML elements on the page that contain the data bits we're looking for
        # Use the .text attribute of each element to get the element's contents
        # Use Python's built-in .strip() string function to remove extra white space
        # NOTE: find() returns None for a missing element, so a page lacking any
        # of these elements raises AttributeError here and stops the scrape
        Name      = soup.find("h1", {"itemprop": "name"}).text.strip()
        Address   = soup.find("span", {"itemprop" : "streetAddress"}).text.strip()
        City      = soup.find("span", {"itemprop" : "addressLocality"}).text.strip()
        State     = soup.find("span", {"itemprop" : "addressRegion"}).text.strip()
        ZipCode   = soup.find("span", {"itemprop" : "postalCode"}).text.strip()
        Telephone = soup.find("span", {"itemprop" : "telephone"}).text.strip()

        # Package the values for this URL in a dictionary
        # The keys are the DataFrame column names
        dictionary = {"Name" : Name, "Address" : Address,
                      "City" : City, "State" : State, "Zipcode" : ZipCode,
                      "Phone" : Telephone, "URL" : URL}
        rows.append(dictionary)
finally:
    # Runs even if a request or lookup above fails part-way through, so the
    # pages scraped up to that point still end up in "df"
    scraped = pd.DataFrame(rows, columns = df.columns)

    # Keep any rows "df" already holds; when it is still empty, the scraped
    # rows simply become the DataFrame
    df = scraped if df.empty else pd.concat([df, scraped], ignore_index = True)

0 https://locations.dennys.com/AL/BIRMINGHAM/248621 2016-05-16 16:49:11.162051
100 https://locations.dennys.com/CA/EUREKA/247449 2016-05-16 16:49:22.488496
200 https://locations.dennys.com/CA/SAN-FERNANDO/247847 2016-05-16 16:49:33.007418
300 https://locations.dennys.com/FL/HIALEAH-GARDENS/247302 2016-05-16 16:49:43.914673
400 https://locations.dennys.com/IL/GLEN-CARBON/247220 2016-05-16 16:49:53.650213
500 https://locations.dennys.com/MD/NORTH-EAST/248614 2016-05-16 16:50:04.711588
600 https://locations.dennys.com/NJ/CLEMENTON/248813 2016-05-16 16:50:15.373411
700 https://locations.dennys.com/OH/BOARDMAN-TWP/247402 2016-05-16 16:50:27.211383
800 https://locations.dennys.com/SC/NORTH-CHARLESTON/247987 2016-05-16 16:50:39.984877
900 https://locations.dennys.com/UT/LOGAN/249268 2016-05-16 16:50:51.453567
1000 https://locations.dennys.com/AZ/FLAGSTAFF/247973 2016-05-16 16:51:05.317046
1100 https://locations.dennys.com/CA/FRESNO/247815 2016-05-16 16:51:19.515351
1200 https://locations.denn

# Data Cleanup

In [18]:
# Remove duplicates from DataFrame (just in case), alter it in place
df.drop_duplicates(inplace = True)

# Sort by state, then city, then name (alphabetical for each), in place.
# sort_values() is the replacement for DataFrame.sort(), which was deprecated
# in pandas 0.17 and removed in later releases.
df.sort_values(["State", "City", "Name"], inplace = True)

In [None]:
# Write the DataFrame out as a CSV file.
#   index = False      leaves out the row index shown to the left of the DataFrame
#   encoding = "utf-8" avoids common issues with foreign characters
df.to_csv(
    "Q2.16 DCA Scrape.csv",
    index = False,
    encoding = "utf-8",
)