### National Park Service Website Scraper & NLP Analysis
### Code by Eric Englin
<br><br>
#### Objective: scrape all 395 websites for NPS to see if they meet the 10 essential travelers information for transportation
#### These 10 are:
<li>driving directions</li>
<li>Public transportation information</li>
<li>Bike and pedestrian information</li>
<li>Parking lot locations and accommodations</li>
<li>Parking lot peak use and availability</li>
<li>congestion information</li>
<li>travel distances and travel time to sites within the park</li>
<li>Accessibility</li>
<li>Description of transportation experience</li>
<li>Alternative fueling stations</li>

<br><br>
#### Together, these 10 measures can allow NPS to understand how their parks are providing transportation to visitors. This information can be used to evaluate each park and plan for an improved park experience in the future. 

In [2]:
# import libraries
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns 
import html
from bs4 import BeautifulSoup
import requests
import os
from selenium import webdriver
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import warnings; warnings.simplefilter('ignore')



In [3]:
#change to the location where you saved chromedriver
#if issues here, make sure that we have correct chromedriver installed version:
        # (check google chrome version -- It'll be somewhere between 73-79)
chromedriver_location=r'C:\Users\eric.englin\Downloads/chromedriver.exe'

In [4]:
driver = webdriver.Chrome(executable_path=chromedriver_location) 
driver.get('https://www.nps.gov/AGFO/planyourvisit/directions.htm')
driver.close() #close driver link

In [5]:
path = "Park Unit Scraping Information.csv"
parks = pd.read_csv(path, encoding='latin-1')


In [6]:
len(parks)

393

In [7]:
index = []
for x in parks['Alpha']:
    y = "https://www.nps.gov/"+x+"/index.htm"
    index.append(y)

parks['index site']=index

In [8]:
## For context, here is the main site for a sample of national parks
y=0
for x in parks['index site']:
    y+=1
    if y%25 ==0:
        print(x)

https://www.nps.gov/BEOL/index.htm
https://www.nps.gov/CAHA/index.htm
https://www.nps.gov/CHPI/index.htm
https://www.nps.gov/DEWA/index.htm
https://www.nps.gov/FOLS/index.htm
https://www.nps.gov/GATE/index.htm
https://www.nps.gov/GWMP/index.htm
https://www.nps.gov/JELA/index.htm
https://www.nps.gov/LAVO/index.htm
https://www.nps.gov/MLKM/index.htm
https://www.nps.gov/NWWM/index.htm
https://www.nps.gov/POPO/index.htm
https://www.nps.gov/SAJU/index.htm
https://www.nps.gov/THRO/index.htm
https://www.nps.gov/WEFA/index.htm


In [8]:
def scrape_site(park):
    from selenium.common.exceptions import InvalidArgumentException

    driver = webdriver.Chrome(executable_path=chromedriver_location) #change location
    link = "https://www.nps.gov/"+park+"/index.htm"
    driver.get(link)
    #driver.find_element_by_xpath('//*[@id="anch_15"]').click()
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, 'lxml')
    website_list=[]
    raw_list=[]
    website_content=[]
    for l in soup.find_all('a'):
        try:
            if "planyourvisit" in l.get('href') and l.get('href') not in raw_list: #only want plan your visit sites
                if "https://www.nps.gov" in l.get('href'):
                    z= l.get('href')
                    raw_list.append(z)
                    website_list.append(z)
                else:
                    z = l.get('href')
                    raw_list.append(z)
                    z = "https://www.nps.gov"+z
                    website_list.append(z)
        except:
            pass
    for x in website_list:
        try:
            driver.get(x)
            page_source = driver.page_source
            soup = BeautifulSoup(page_source, 'lxml')
            raw_content = soup.get_text(strip=True) #all text fields are scraped 
            website_content.append(raw_content) #raw content added to list of all content
        except:
            #This means that the webpage doesn't exist
            pass
    driver.close() #close driver link at end of scrape
    dict = {'website page': website_list, 'content': website_content}  #create dataframe for park data
    park_data = pd.DataFrame(dict) 
    park_data['park']=park
    return park_data

In [9]:
stopwords = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", 
             "you", "your", "yours", "yourself", "yourselves", "he", "him", 
             "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", 
             "they", "them", "their", "theirs", "themselves", "what", "which", "who",
             "whom", "this", "that", "these", "those", "am", "is", "are", "was", 
             "were", "be", "been", "being", "have", "has", "had", "having", "do", 
             "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", 
             "because", "as", "until", "while", "of", "at", "by", "for", "with", "about",
             "against", "between", "into", "through", "during", "before", "after", "above", 
             "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", 
             "again", "further", "then", "once", "here", "there", "when", "where", "why", 
             "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", 
             "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", 
             "s", "t", "can", "will", "just", "don", "should", "now",
            "href","=","/",">","<","]","[","span","'\n'",'class',"jstcache",
            "onclick","null","jscontent"," <br/>","</span>",",",";","(",")","{","}",":","''",
            "&","'","var","+=",".","#","-","=","+","``","0","’","data.operatingHours","outputVarOperatingHours",
            ".exceptions","--","1","-1","?","class=","==","div","/div","$","li","e","!","k","/span","jQuery",
            "tabindex",'j','l']

In [10]:
def Traveler_Info_Finder(park):
    """
    Find the following fields:
    #Public transportation information
    #Alternative Fueling Stations
    #Bike/Pedestrian Information
    #Driving directions
    """

    
    AFS_list = []
    Bike_Ped_count = []
    Directions_count = []
    Directions_page_count = []
    Pub_Transit_count = []
    Direction_majorcount = []
    Direction_count = []
    Congestion_count = []
    Travel_dist_count = []
    Travel_dist_other_count=[]
    Accessibility_count=[]
    Parking_count=[]
    Parking_plan_count=[]
    

    Directions_Words = ["Entrance","Center","street","Visitor"
                        "Street","parking","directions","Route","Road",
                        "Interstate","Exit",
                        "mile","km","ferry","access", "Street","Blvd", "Hwy"
                       ]

    Directions_MajorWords = [
        "GPS Coordinates", "GPS coordinates", "GPS device address", "GPS address",
        "Latitude","Longitude","Street",
        "Blvd", "Boulevard", "Ln.","Rd.","Pl.",
        "Hwy","Exit","Interstate","US Highway", "U.S. Highway", "Indian Head Highway",  "Turnpike","beltway","Causeway"
        "Secondary Route", "State Route", "I-","State Highway"
        ]


    Public_Transit_Words = [
        "Public Transportation", "public transportation","Public transportation",
        "bus schedule", "Bus schedule", "shuttle", "shuttles","Shuttle",
        "bus stops", "buses stop", "ferry","transit","Transit"
    ]

    Congestion_Words = [
       "congestion","Congestion", "congested"
    ]
    
    BicyclePed_Words = [
        "Bicyclists","bicyclists","cyclists","pedestrians","biking"
        #,"biking","Biking"
    ]
    
    Travel_dist_Words = [
        'miles'
    ]

    Travel_dist_other_Words = [
        'Places To Go',"Popular Destinations"
    ]
    
    Accessibility_Words = [
        "wheelchair", "accessibility", "disability", "impaired", "disabilities", "handicap",
        "accessible","Wheelchair"
    ]
    
    Parking_Words = [
        "parking", "Parking", "pullout"
    ]

    
    #this will get the number of sites that have keywords
    count=0
    for x in park["content"]:
        try:
            y=0
            if "Department of Energy" in x and "Alternative Fueling Station" in x:
                y=1
                AFS_list.append(y)
            else:
                y=0
                AFS_list.append(y)
            if any(substring in x for substring in Public_Transit_Words):
                y=1
                Pub_Transit_count.append(y)
            else:
                y=0
                Pub_Transit_count.append(y)
            if any(substring in x for substring in Directions_MajorWords):
                y=1
                Direction_majorcount.append(y)
            else:
                y=0
                Direction_majorcount.append(y)
            if any(substring in x for substring in BicyclePed_Words):
                y=1
                Bike_Ped_count.append(y)
            else:
                y=0
                Bike_Ped_count.append(y)
            if any(substring in x for substring in Congestion_Words):
                y=1
                Congestion_count.append(y)
            else:
                y=0
                Congestion_count.append(y)
            if any(substring in x for substring in Travel_dist_Words):
                y=1
                Travel_dist_count.append(y)
            else:
                y=0
                Travel_dist_count.append(y)
            if any(substring in x for substring in Travel_dist_other_Words):
                y=1
                Travel_dist_other_count.append(y)
            else:
                y=0
                Travel_dist_other_count.append(y)
            if any(substring in x for substring in Parking_Words):
                y=1
                Parking_count.append(y)
            else:
                y=0
                Parking_count.append(y)           
            if any(substring in x for substring in Directions_Words):
                y=1
                Directions_page_count.append(y)
            else:
                y=0
                Directions_page_count.append(y)           
        except:
            y=0
            AFS_list.append(y)
            Pub_Transit_count.append(y)
            Direction_majorcount.append(y)
            Bike_Ped_count.append(y)
            Congestion_count.append(y)
            Travel_dist_count.append(y)
            Travel_dist_other_count.append(y)
            Parking_count.append(y)
            Directions_page_count.append(y)

                

                
#this section will get the total number of times that keywords show up on all sites for a park
    ps = PorterStemmer()
    lem = WordNetLemmatizer()

    stemmed_words=[]

    for x in park['content']:
        z=0
        z2=0
        z3=0
        major = 0
        congestion = 0
        pubtrans=0
        bikeped=0
        try:
            tokenized_word=word_tokenize(x)
            filtered_sent=[]
            stemmed_words=[]
            direction_words_temp = []
            for w in tokenized_word:
                if w not in stopwords:
                    filtered_sent.append(w)
            for w in filtered_sent:
                if w in Directions_Words:
                    z += 1
                if w in Parking_Words:
                    z2 +=1
                if w in Accessibility_Words:
                    z3 += 1
            Direction_count.append(z)
            Parking_plan_count.append(z2)
            Accessibility_count.append(z3)
        except:
            Direction_count.append(0)
            Parking_plan_count.append(0)
            Accessibility_count.append(0)
            
    park["Alternative_Fueling_Stations"]=AFS_list
    park["MajorDirections_count"]=Direction_majorcount
    park["Directions_count"]=Direction_count
    park["Directions_page_count"]=Directions_page_count
    park["Public_transportation_information"]=Pub_Transit_count
    park["Congestion_information"]=Congestion_count
    park["Bike_Pedestrian_Information"]=Bike_Ped_count
    park["Travel_dist_information"]=Travel_dist_count
    park["Travel_other_dist_information"]=Travel_dist_other_count
    park['Accessibility_intro_information']=Accessibility_count
    park['Parking_information']=Parking_count
    park['Parking_experience_information']=Parking_plan_count
    park['Parking_max_on_one_site']=park['Parking_experience_information']
    

    park['Accessibility_information']=np.where(
        np.logical_or(park['Accessibility_intro_information']>2, 
                     park['Parking_experience_information']>2),1,0)

    park_final = park.groupby('park', as_index=False).agg({
        "MajorDirections_count": "sum",
        "Directions_count": "sum",
        "Directions_page_count":"sum",
        "Public_transportation_information": "sum",
        "Alternative_Fueling_Stations":"sum",
        "Bike_Pedestrian_Information":"sum",
        'Congestion_information':'sum',
        'Travel_dist_information':'sum',
        'Travel_other_dist_information':'sum',
        'Accessibility_information':'sum',
        'Parking_information':'sum',
        'Parking_experience_information':'sum',
        'Parking_max_on_one_site':'max',
        "website page":"count",
    })
    
   # park_final2 = park.groupby('park')['Directions_word_list'].apply(lambda x: ','.join(x))
   # park_final = park_final.merge(park_final2, on="park")

    return park_final


In [11]:
z=0
v=0
for x in parks['Alpha'].unique():
    v+=1
    if z==0:
        park_scrape_dataset=scrape_site(x)
    else:
        this_park_scrape = scrape_site(x)
        park_scrape_dataset = park_scrape_dataset.append(this_park_scrape)
    if v % 25 == 0:
        print(x)
        print(len(park_scrape_dataset))
    z+=1
    if z>50: #if want to test out
        break

BEOL
724
CAHA
1660


In [12]:
# save your scraped website into another variable name so don't have to redo scrape
park_scrape_dataset2 = park_scrape_dataset

#data cleaning
park_scrape_dataset2['index1'] = park_scrape_dataset2.index
park_scrape_dataset2=park_scrape_dataset2.reset_index()


#save as excel
#note: saving as a csv won't work due to punctuation used in html code
park_scrape_dataset2.to_excel("full_park_scrape_dataset2.xlsx")

In [13]:
park_scrape_dataset2.tail()

Unnamed: 0,index,website page,content,park,index1
1689,29,https://www.nps.gov/planyourvisit/index.htm,Plan Your Visit (U.S. National Park Service)va...,CAKR,29
1690,30,https://www.nps.gov/planyourvisit/event-search...,Event Calendar (U.S. National Park Service)var...,CAKR,30
1691,31,https://www.nps.gov/planyourvisit/passes.htm,America the Beautiful Passes (U.S. National Pa...,CAKR,31
1692,32,https://www.nps.gov/planyourvisit/trip-ideas.htm,Trip Ideas (U.S. National Park Service)var jsD...,CAKR,32
1693,33,https://www.nps.gov/cakr/planyourvisit/permits...,NPS - Page In-ProgressPage In-ProgressThis pag...,CAKR,33


In [15]:
## Model to calculate VE fields ##

#create new sheet so with our variables for each park
park_sheet = pd.DataFrame(columns = ['park', 'Driving_Directions','Public_transportation_information',
                                     'Bike_Pedestrian_Information','Congestion_information','Accessibility',
                                           'Alternative_Fueling_Stations', 'website page count'])
z=0
tic = time.clock() #function to let us track processing time


for x in park_scrape_dataset2['park'].unique():
    z+=1
    if z % 25 == 0: 
        #function to let us track processing time
        z5 = 400-z
        toc = time.clock()
        time_diff = toc-tic
        print("Current Park: ", x, ": ", z, " checks done; ", z5, " remaining; Processing Time: ",time_diff)
        tic=toc
        
    this_park = park_scrape_dataset[(park_scrape_dataset['park']==x)] #filter our webscraping dataset for our park's website code
    park_final = Traveler_Info_Finder(this_park) #run function
    park_sheet = park_sheet.append({'park': park_final.get_value(0,'park'),
                        'website page count': park_final.get_value(0,'website page'),
                        'Directions_word_count': park_final.get_value(0,'Directions_count'),
                        'Directions_page_count': park_final.get_value(0,'Directions_page_count'),
                       'Driving_Directions': park_final.get_value(0,'MajorDirections_count'),
                       'Public_transportation_information': park_final.get_value(0,'Public_transportation_information'),
                       'Alternative_Fueling_Stations': park_final.get_value(0,'Alternative_Fueling_Stations'), 
                       'Bike_Pedestrian_Information': park_final.get_value(0,'Bike_Pedestrian_Information'),
                       'Congestion_information': park_final.get_value(0,'Congestion_information'),
                        'Travel_Distance_Information': park_final.get_value(0,'Travel_dist_information'),
                        'Travel_other_dist_information': park_final.get_value(0,'Travel_other_dist_information'),
                        'Accessibility': park_final.get_value(0,'Accessibility_information'),
                        'Parking_raw_information': park_final.get_value(0,'Parking_information'),
                        'Parking_experience_information': park_final.get_value(0,'Parking_experience_information'),
                        'Parking_max_on_one_site': park_final.get_value(0,'Parking_max_on_one_site')
                       },
                      ignore_index=True)
    park_sheet.loc[park_sheet.Driving_Directions > 0, 'Driving_Directions'] = 1
    park_sheet.loc[park_sheet.Alternative_Fueling_Stations > 0, 'Alternative_Fueling_Stations'] = 1
    park_sheet.loc[park_sheet.Public_transportation_information > 0, 'Public_transportation_information'] = 1
    park_sheet.loc[park_sheet.Bike_Pedestrian_Information > 0, 'Bike_Pedestrian_Information'] = 1
    park_sheet.loc[park_sheet.Congestion_information > 0, 'Congestion_information'] = 1
    park_sheet.loc[park_sheet.Accessibility > 0, 'Accessibility'] = 1
 #   park_sheet.loc[park_sheet.Parking_information > 0, 'Parking_information'] = 1
    park_sheet['Travel_Distance_Final']=np.where(
        np.logical_or(park_sheet['Travel_Distance_Information']>9, 
                     park_sheet['Travel_other_dist_information']>0),1,0)
    park_sheet['Parking_Experience_information']=np.where((
        park_sheet['Parking_raw_information']/park_sheet['website page count'])>0.25,1,0)
    park_sheet['Transportation_experience_information']=np.where((
        park_sheet['Directions_page_count']/park_sheet['website page count'])>0.65,1,0)
    park_sheet['Parking_information']=np.where(np.logical_or(
        park_sheet['Parking_Experience_information']==1,
        park_sheet['Parking_max_on_one_site']>2),1,0)


park_sheet= park_sheet.drop(columns=['website page count', 'Directions_word_count',
                        'Directions_page_count','Parking_raw_information','Parking_experience_information',
                        'Parking_max_on_one_site','Travel_Distance_Information','Travel_other_dist_information'])
    
#create csv
park_sheet.to_csv("final_park2.csv") #save final csv
os.system("start EXCEL.EXE final_park2.csv") #open csv file

Current Park:  BEOL :  25  checks done;  375  remaining; Processing Time:  30.670907299999726
Current Park:  CAHA :  50  checks done;  350  remaining; Processing Time:  33.50025940000023


0

In [16]:
## In progress cell ##
## Could use lat/longs to immediately make a map to visualize the output data

import urllib.request, json

# Configure API request
park = "acad"
endpoint = "https://developer.nps.gov/api/v1/parks?parkCode=" + park + "&api_key=g6HPfqqNODFIg64arFX7CPrRwSxoFci42Wu54IZ7"
HEADERS = {"Authorization":"g6HPfqqNODFIg64arFX7CPrRwSxoFci42Wu54IZ7"}
req = urllib.request.Request(endpoint,headers=HEADERS)

# Execute request and parse response
response = urllib.request.urlopen(req).read()
data = json.loads(response.decode('utf-8'))

# Prepare and execute output
print(data["data"][0]["fullName"] + " can be found at " + data["data"][0]["latLong"] + ".")

Acadia National Park can be found at lat:44.30777545, long:-68.30063316.


In [18]:
park_data = pd.read_excel("full_park_scrape_dataset2.xlsx")

In [22]:
park_data.head()

Unnamed: 0.1,Unnamed: 0,index,website page,content,park,index1
0,0,0,https://www.nps.gov/abli/planyourvisit/index.htm,Plan Your Visit - Abraham Lincoln Birthplace N...,ABLI,0
1,1,1,https://www.nps.gov/abli/planyourvisit/basicin...,Basic Information - Abraham Lincoln Birthplace...,ABLI,1
2,2,2,https://www.nps.gov/abli/planyourvisit/hours.htm,Operating Hours & Seasons for 2019 - Abraham L...,ABLI,2
3,3,3,https://www.nps.gov/abli/planyourvisit/fees.htm,Fees & Passes - Abraham Lincoln Birthplace Nat...,ABLI,3
4,4,4,https://www.nps.gov/abli/planyourvisit/conditi...,Alerts & Conditions - Abraham Lincoln Birthpla...,ABLI,4


In [20]:
# About 7% of all sites were down at the time of data pull
error_flag = []
for x in park_scrape_dataset2["content"]:
    y=0
    try:
        if "failed for element" in x:
            y=1
            error_flag.append(y)
        else:
            y=0
            error_flag.append(y)
    except:
        y=0
        error_flag.append(y)
            
park_scrape_dataset2['error_flag']=error_flag
bad_website_page = park_scrape_dataset2.loc[park_scrape_dataset2.error_flag > 0]
bad_website_page.to_csv("Error Website List.csv")

In [21]:
bad_website_page.to_csv("Error Website List.csv")