In [69]:
import requests	
from bs4 import BeautifulSoup
import re
import pandas as pd
import os
from sqlalchemy import create_engine
import config
from unidecode import unidecode
from camping import camping_data
from camping import storage
import datetime

In [7]:
campgrounds = pd.read_csv('../data/usfs_sites.csv')

In [13]:
store = storage.Storage()

In [27]:

def extract_cg_info(campgrounds) :
    df_cg = pd.DataFrame()
    for index, campground in campgrounds.iterrows():
        latitude = longitude = elevation = site_id = numsites = conditions = ""
        fees = openseason = usage = water = restroom = status = ""
        site_url = "http://" + config.LAMP_IP + "/" + campground['url']
        print(campground['facilityname'] + '\t' + site_url)
        try :
            cg_req = requests.get(site_url)
            cg_soup = BeautifulSoup(cg_req.text, 'lxml')
        except Exception as ex :
            print('couldnt get site_url ' + site_url)
            print(ex)
            continue

        # get area status if available
        try :
            for strong_tag in cg_soup.find_all('strong'):
                if ('Area Status' in unidecode(strong_tag.text)):
                    status = unidecode(strong_tag.next_sibling).strip()
        except Exception:
            print('couldnt get area status')


        print("getting location")
        # get lat, long, altitude
        try :
            lat = cg_soup.find_all('div', text=re.compile('Latitude'))
            div = [row.next_sibling.next_sibling for row in lat]
            latitude  = div[0].text.strip()

            lng = cg_soup.find_all('div', text=re.compile('Longitude'))
            div = [row.next_sibling.next_sibling for row in lng]
            longitude  = div[0].text.strip()

            el = cg_soup.find_all('div', text=re.compile('Elevation'))
            div = [row.next_sibling.next_sibling for row in el]
            elevation  = div[0].text.strip()

            # get site usage, type, num sites, site info

        except Exception:
            print('couldnt get location info')

        # table[0] is the basic info table

        try :
            tables = cg_soup.find_all('div', {'class': 'tablecolor'})
        except Exception:
            print('couldnt get tables')

        try :
            print(len(tables))
            rows = tables[0].find_all('tr')
            for row in rows:
                
                if row.th.text == 'Reservations:':
                    reservations = unidecode(row.td.text).strip()
                if row.th.text == 'Open Season:':
                    openseason = unidecode(row.td.text).strip()
                if row.th.text == 'Current Conditions:':
                    print(unidecode(row.td.text))
                    conditions = unidecode(row.td.text).strip()
                if row.th.text == 'Water:':
                    water = unidecode(row.td.text).strip()
                if row.th.text == 'Restroom:':
                    restroom = unidecode(row.td.text).strip()
        except Exception as ex:
            print('couldnt get basic campground info')
            print(ex)

        # table 1 is the campground info
        try:
            rows = tables[1].find_all('tr')

            for row in rows:
                if row.td.text == 'Reservation Info':
                    reserveinfo = unidecode(row.td.next_sibling.text).strip()
                if row.td.text == 'No. of Sites':
                    numsites = unidecode(row.td.next_sibling.text).strip()
        except Exception:
            print('couldnt get campsite availability info')
        
        # assemble into DataFrame
        df_cg = df_cg.append(pd.DataFrame ({
            'latitude': [latitude],
            'longitude': [longitude],
            'elevation': [elevation],
            'facilityname' : [campground['facilityname']],
            'facilityurl' : [site_url],
            'status' : [status],
            'water' : [water],
            'restroom' : [restroom],
            'reservations': [reservations],
            'conditions': [conditions],
            'numsites': [numsites]
            }))
    return df_cg

In [40]:
usfs_data = extract_cg_info(campgrounds)

whispering falls campground	http://172.17.0.3/Willamette National Forest - Whispering Falls Campground.html
getting location
5
lost lake campground	http://172.17.0.3/Mt. Hood National Forest - Lost Lake Campground.html
getting location
10
OPEN 
lake harriet campground	http://172.17.0.3/Mt. Hood National Forest - Lake Harriet Campground_Day Use.html
getting location
2
kinnikinnick (laurance lake) campground	http://172.17.0.3/Mt. Hood National Forest - Kinnikinnick (Laurance Lake) Campground.html
getting location
9
OPEN 
fifteenmile campground	http://172.17.0.3/Mt. Hood National Forest - Fifteenmile Campground.html
getting location
5
No services provided.A  Pack in/Pack out. 
clear lake campground	http://172.17.0.3/Mt. Hood National Forest - Clear Lake Campground.html
getting location
8
CLOSED 
camp creek campground	http://172.17.0.3/Mt. Hood National Forest - Camp Creek Campground.html
getting location
5
badger lake campground	http://172.17.0.3/Mt. Hood National Forest - Badger Lake Cam

In [41]:
usfs_data.shape


(9, 11)

In [52]:
usfs_data.facilityname

0                whispering falls campground
0                       lost lake campground
0                    lake harriet campground
0    kinnikinnick (laurance lake) campground
0                     fifteenmile campground
0                      clear lake campground
0                      camp creek campground
0                     badger lake campground
0                     eagle creek campground
Name: facilityname, dtype: object

In [31]:
ridb_cleaned = store.get("select * from ridb_cleaned")

In [53]:
ridb_cleaned.FacilityName

0                 whispering falls campground
1                                lake harriet
2     kinnikinnick (laurance lake) campground
3                      fifteenmile campground
4                                  camp creek
5                 lower camp creek campground
6                      badger lake campground
7                             clear lake (or)
8                    clear lake cabin lookout
9                      eagle creek campground
10                     eagle creek campground
11                           lost lake resort
12                       lost lake campground
13            lost lake resort and campground
14                   wildwood recreation site
15                        wildwood campground
Name: FacilityName, dtype: object

In [57]:
usfs_data = usfs_data.assign(first_two=usfs_data.facilityname.str.lower().str.split(' ').apply(lambda x: ' '.join([x[0],x[1]])))

In [58]:
usfs_data.first_two

0          whispering falls
0                 lost lake
0              lake harriet
0    kinnikinnick (laurance
0    fifteenmile campground
0                clear lake
0                camp creek
0               badger lake
0               eagle creek
Name: first_two, dtype: object

In [60]:
ridb_cleaned = ridb_cleaned.assign(first_two=ridb_cleaned.FacilityName.str.lower().str.split(' ').apply(lambda x: ' '.join([x[0],x[1]])))

In [111]:
pd.to_datetime(ridb_cleaned.iloc[0]['LastUpdatedDate'])

Timestamp('2016-05-12 00:00:00')

In [115]:
test = ridb_cleaned.assign(convert = lambda x: pd.to_datetime(x['LastUpdatedDate']))
min(test.convert)

Timestamp('2015-10-15 00:00:00')

In [120]:
ridb_grouped = ridb_cleaned.groupby('first_two')

In [137]:
deduped = pd.DataFrame()
for name,group in ridb_grouped:
    # return most recently updated entry in duplicate groups
   # print(group[['FacilityName','LastUpdatedDate']])
   # print(max(pd.to_datetime(group['LastUpdatedDate'])))
   deduped = deduped.append(group.groupby(pd.to_datetime(group['LastUpdatedDate'])).first())

In [143]:
deduped.FacilityName

LastUpdatedDate
2016-05-09                     badger lake campground
2016-05-12                                 camp creek
2016-05-12                            clear lake (or)
2015-12-03                     eagle creek campground
2016-05-09                     eagle creek campground
2016-05-09                     fifteenmile campground
2016-05-09    kinnikinnick (laurance lake) campground
2016-05-12                               lake harriet
2015-10-15            lost lake resort and campground
2016-05-09                           lost lake resort
2016-05-09                lower camp creek campground
2016-05-12                whispering falls campground
2016-05-09                        wildwood campground
2016-05-12                   wildwood recreation site
Name: FacilityName, dtype: object

Index(['index', 'FacilityAdaAccess', 'FacilityDescription',
       'FacilityDirections', 'FacilityEmail', 'FacilityID', 'FacilityLatitude',
       'FacilityLongitude', 'FacilityMapURL', 'FacilityName', 'FacilityPhone',
       'FacilityReservationURL', 'FacilityTypeDescription',
       'FacilityUseFeeDescription', 'Keywords', 'LastUpdatedDate',
       'LegacyFacilityID', 'OrgFacilityID', 'StayLimit', 'distance_miles',
       'distance_ref', 'duplicated', 'first_word', 'first_two',
       'datetime_last_updated'],
      dtype='object')