
<h3>Scraping content from US National Forests webpages</h3>

<img src="images/Gimli.jpg" width=30% style="display: inline-block">
<br>
image source: lotr.wikia.com
<br>
It may not be pretty or elegant, but it gets the job done
<br>
<br>
[Example: Cultus Creek Campground FS website](http://www.fs.usda.gov/recarea/giffordpinchot/recreation/camping-cabins/recarea/?recid=31736&actid=29)
<img src="images/cultus_creek_screenshot.png" width=80% style="display: inline-block">
<br>
<br>
Everyone's favorite: view source
<img src="images/cultus_creek_viewsource.png" width=80% style="display: inline-block">

In [3]:
import requests	
from bs4 import BeautifulSoup
import re
import pandas as pd
import os
from sqlalchemy import create_engine
import config

In [9]:
campgrounds = pd.read_csv('or_nf_campgrounds.csv')
campgrounds = campgrounds.head()

In [15]:
def extract_cg_info(campgrounds, area, outfile) :
    for index, campground in campgrounds.iterrows():
        latitude = longitude = elevation = site_id = numsites = ""
        fees = openseason = usage = water = restroom = status = ""

        print(campground['site_name'] + '\t' + campground['site_url'])
        try :
            cg_req = requests.get(campground['site_url'])
            cg_soup = BeautifulSoup(cg_req.text, 'lxml')
            site_id = campground['site_url'].split('?')[1].split('&')[0].split('=')[1]
        except Exception as ex :
            print('couldnt get site_url ' + campground['site_url'])
            print(ex)
            continue

        # get area status if available
        try :
            for strong_tag in cg_soup.find_all('strong'):
                if ('Area Status' in unidecode(strong_tag.text)):
                    status = unidecode(strong_tag.next_sibling).strip()
        except Exception:
            print('couldnt get area status')


        print("getting location")
        # get lat, long, altitude
        try :
            lat = cg_soup.find_all('div', text=re.compile('Latitude'))
            div = [row.next_sibling.next_sibling for row in lat]
            latitude  = div[0].text.strip()

            lng = cg_soup.find_all('div', text=re.compile('Longitude'))
            div = [row.next_sibling.next_sibling for row in lng]
            longitude  = div[0].text.strip()

            el = cg_soup.find_all('div', text=re.compile('Elevation'))
            div = [row.next_sibling.next_sibling for row in el]
            elevation  = div[0].text.strip()

            # get site usage, type, num sites, site info

        except Exception:
            print('couldnt get location info')

        # table[0] is the basic info table
        print("getting basic info")

        try :
            tables = cg_soup.find_all('div', {'class': 'tablecolor'})
        except Exception:
            print('couldnt get tables')

        try :
            rows = tables[0].find_all('tr')

            for row in rows:
                if row.th.text == 'Reservations:':
                    reservations = unidecode(row.td.text).strip()
                if row.th.text == 'Open Season:':
                    openseason = unidecode(row.td.text).strip()
                if row.th.text == 'Current Conditions:':
                    conditions = unidecode(row.td.text).strip()
                if row.th.text == 'Water:':
                    water = unidecode(row.td.text).strip()
                if row.th.text == 'Restroom:':
                    restroom = unidecode(row.td.text).strip()
        except Exception :
            print('couldnt get basic campground info')

        # table 1 is the campground info
        print("getting reservation info")
        try:
            rows = tables[1].find_all('tr')

            for row in rows:
                if row.td.text == 'Reservation Info':
                    reserveinfo = unidecode(row.td.next_sibling.text).strip()
                if row.td.text == 'No. of Sites':
                    numsites = unidecode(row.td.next_sibling.text).strip()
        except Exception:
            print('couldnt get campsite availability info')
        
        # assemble into DataFrame
        print('appending data')
        df_cg = pd.DataFrame ({
            'latitude': [latitude],
            'longitude': [longitude],
            'elevation': [elevation],
            'facilityname' : [campground['site_name']],
            'facilityurl' : [campground['site_url']],
            'facilityid' : [site_id],
            'datasrc': ["NFS"],
            'status' : [status],
            'openseason' : [openseason],
            'water' : [water],
            'restroom' : [restroom],
            'reserveinfo': [reserveinfo],
            'reservations': [reservations],
            'conditions': [conditions],
            'numsites': [numsites]
            })

        #print df_cg
        print('writing to file')

        if not os.path.isfile(outfile):
           df_cg.to_csv(outfile,header ='column_names', index=False)
        else: # else it exists so append without writing the header
            df_cg.to_csv(outfile,mode = 'a',header=False, index=False)

In [16]:
test = extract_cg_info(campgrounds, "Mt Hood NF", "out.csv")

Badger Lake Campground	http://www.fs.usda.gov/recarea/mthood/recreation/camping-cabins/recarea/?recid=52784&actid=29
couldnt get site_url http://www.fs.usda.gov/recarea/mthood/recreation/camping-cabins/recarea/?recid=52784&actid=29
Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library?
Bear Springs Group Campground	http://www.fs.usda.gov/recarea/mthood/recreation/camping-cabins/recarea/?recid=52786&actid=29
couldnt get site_url http://www.fs.usda.gov/recarea/mthood/recreation/camping-cabins/recarea/?recid=52786&actid=29
Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library?
Bonney Crossing Campground	http://www.fs.usda.gov/recarea/mthood/recreation/camping-cabins/recarea/?recid=52790&actid=29
couldnt get site_url http://www.fs.usda.gov/recarea/mthood/recreation/camping-cabins/recarea/?recid=52790&actid=29
Couldn't find a tree builder with the features you requested: lxml. Do you 

AttributeError: 'NoneType' object has no attribute 'shape'