![Many pancakes vs single pancake](images/pancakes.png)

#### How might our camping application scale?
* More states
* Additional data sources
* More campsites per facility
* ?

In [15]:
import asyncio
import itertools
import json
from csv import DictReader

import geopandas as gpd
import pandas as pd

from camping.mocks.request import RequestsMock
from camping.util.scraper import Scraper
from camping.util.distance import distance_merge

def max_col_width(w=100):
    """Set the pandas ``display.max_colwidth`` option to ``w`` (default 100)."""
    pd.options.display.max_colwidth = w

# Base URL of the RIDB facilities endpoint. Per-facility campsite URLs are
# built from it as "<base>/<FacilityID>/campsites".
ridb_facilities_url = "https://ridb.recreation.gov/api/v1/facilities"

### Scaling our prototype code

What are some aspects that might not scale?   
*   
  
Opportunities to parallelize?  
*   

#### Exploration code for reference

In [None]:
# Query parameters for the facilities request (activity_id 9 is presumably
# camping -- TODO confirm) and the headers reused by every RIDB request below.
params = {"activity_id":9, "state":"OR"}
headers = {"accept": "application/json", "apikey": "key"}

# Get RIDB facilities data
response = RequestsMock.get(ridb_facilities_url, params, headers=headers)
camping_json  = json.loads(response.text)
df_ridb_camping = pd.DataFrame(camping_json['RECDATA'])

# Get RIDB campground data for each facility.
# Per-facility frames are collected in a list and concatenated once at the
# end: DataFrame.append copied the whole frame on every iteration and was
# removed in pandas 2.0.
campsite_columns = ['FacilityID', 'CampsiteID', 'CampsiteName', 'ATTRIBUTES']
campground_frames = []
for facility in camping_json['RECDATA']:
    if facility.get('FacilityID') is None:
        continue

    campground_url = f"{ridb_facilities_url}/{facility['FacilityID']}/campsites"
    resp = RequestsMock.get(campground_url, headers=headers)
    if resp.status_code != 200:
        # Skip facilities whose campsite lookup failed.
        continue

    campsites = json.loads(resp.text)
    if len(campsites['RECDATA']) > 0:
        df_campsites = pd.DataFrame(campsites['RECDATA'])
        # Attach the facility-level columns to every campsite row.
        campground_frames.append(
            df_campsites[campsite_columns].merge(df_ridb_camping, on='FacilityID', how='left')
        )

# pd.concat raises on an empty list, so fall back to an empty frame.
campground_info = pd.concat(campground_frames) if campground_frames else pd.DataFrame()

# Get NF website data: scrape one page per row of the site list CSV
nf_data = []
with open('../data/NF_sites/OR_sitelist.csv') as f:
    reader = DictReader(f)
    for row in reader:
        sc = Scraper(row['site_url'], row['site_name'])
        nf_data.append(sc.scrape())
nf_df = pd.DataFrame(nf_data) 

# Merge RIDB and NF data
merged = distance_merge(nf_df, campground_info, 2000, 'ridb', 'nf')


### Pipelines
Data pipelines split data processing into discrete steps. 

* Allows retrying an individual step, e.g. when the API rate limit is exceeded
* Generalized pipeline steps can be reused 

![RIDB pipeline](images/RIDB_pipeline.png)


What can we parallelize?
* By data source - NF can run independently of RIDB
* Within data source - Create batches

What makes sense as pipeline steps?
* Extracting the data from source
* Transforming campsite data
* Merging NF and RIDB

Configuration for scaling
* State
* NF urls


In [4]:
# Start with getting facilities
def get_facilities(state):
    """Fetch the RIDB facility records for one state.

    Args:
        state: State code sent as the ``state`` query parameter (e.g. 'OR').

    Returns:
        The records stored under the response's ``RECDATA`` key, or an empty
        list when the request does not return HTTP 200.
    """
    # NOTE(review): unlike the exploration code, no activity_id filter is sent
    # here -- confirm that fetching every facility type is intended.
    params = {'state':state}
    # `headers` is the module-level request-header dict defined in an earlier cell.
    response = RequestsMock.get(ridb_facilities_url, params, headers=headers)
    if response.status_code == 200:
        print(f"Getting facilities for {state}")
        result = json.loads(response.text)
        return result['RECDATA']
    print(f"Unable to get result for state {state}, got {response.reason}")
    # Empty list rather than a dict, so both branches return the same type.
    return []

In [12]:
facilities_json = get_facilities('OR')
# Show a single record to illustrate the schema; displaying the whole payload
# floods the notebook with output.
facilities_json[:1]

Getting facilities for OR


[{'FacilityID': '265224',
  'LegacyFacilityID': '',
  'OrgFacilityID': '77603',
  'ParentOrgID': '131',
  'ParentRecAreaID': '1108',
  'FacilityName': 'Glendale - Powers Bicycle Recreation Area',
  'FacilityDescription': '<p>The Oregon Coast is a cycling attraction that is gaining worldwide recognition for its stunning and diverse landscapes, and the areas just inland from that majestic Highway 101 are equally impressive! The Glendale-Powers route offers the opportunity to bicycle through the ancient forests\xa0on the west side of the\xa0Rogue River-Siskiyou National Forest, along the beautiful Coquille River! Escape into a world of pristine beauty as you pass through\xa0wildflower-filled meadows, ride along clean, clear waters, and enjoy the majesty of the towering trees. Every turn of the road reveals all manner of big and small wildlife, including elk, bobcat, quail, and trout. This area provides for such amazing road biking opportunities, there is even an family-friendly annual rid

In [16]:
# Pull the ID out of every facility record; preview the first nine.
facility_ids = [facility['FacilityID'] for facility in facilities_json]
facility_ids[:9]

['265224',
 '243702',
 '251894',
 '236934',
 '246009',
 '251610',
 '10005523',
 '10078676',
 '237138']

In [45]:
def get_campsites(facility_id):
    """Fetch the campsite records of a single RIDB facility.

    Args:
        facility_id: Facility identifier interpolated into the campsites URL.

    Returns:
        The records stored under the response's ``RECDATA`` key. An empty list
        is returned when the facility has no campsites or the request does not
        return HTTP 200.
    """
    # Interpolate the base URL variable; previously its *name* was embedded in
    # the string as literal text, so every request went to a bogus URL.
    campsite_details_url = f"{ridb_facilities_url}/{facility_id}/campsites"
    response = RequestsMock.get(campsite_details_url, headers=headers)
    if response.status_code != 200:
        # Report status_code, the attribute already used for the check above.
        print(f"Unable to get result for facility_id {facility_id}, got {response.status_code} {response.reason}")
        return []
    campsites = json.loads(response.text)
    # Normalise "no campsites" to an empty list so callers always get a list.
    return campsites['RECDATA'] or []

In [46]:
def process_campsite(campsite_data):
    """Reduce a raw campsite record to the fields the pipeline keeps.

    Args:
        campsite_data: Campsite record (dict). Its ``ATTRIBUTES`` entry, when
            present, is an iterable of dicts carrying ``AttributeName`` and
            ``AttributeValue`` keys.

    Returns:
        A new dict with ``FacilityID``, ``CampsiteID`` and ``CampsiteName``
        (each ``None`` when absent from the input) plus ``AttributeDict``,
        which maps each attribute name to its value.
    """
    # Built locally instead of being stored on the input, so the caller's
    # record is not mutated. A missing or null ATTRIBUTES entry yields an
    # empty mapping rather than a KeyError/TypeError.
    attribute_dict = {
        item['AttributeName']: item['AttributeValue']
        for item in campsite_data.get('ATTRIBUTES') or []
    }
    processed = {key: campsite_data.get(key) for key in ['FacilityID', 'CampsiteID', 'CampsiteName']}
    processed['AttributeDict'] = attribute_dict
    return processed

![Campsite batch](images/campsite_batch.png)

In [None]:
def get_campsite_batch(ids):
    """Fetch the campsite records for a batch of facility IDs.

    Args:
        ids: Iterable of facility IDs; each is passed to ``get_campsites``.

    Returns:
        One flat list containing the campsite records of every facility in
        the batch. Assumes ``get_campsites`` returns a list of records.
    """
    campsite_data = []
    for facility_id in ids:
        # extend() flattens each facility's records into the batch. The
        # previous `campsite_data = campsite_data.append(...)` rebound the
        # name to None (list.append returns None) and crashed on the next ID.
        campsite_data.extend(get_campsites(facility_id))
    return campsite_data

def process_campsite_batch(campsite_data):
    """Apply ``process_campsite`` to every record in a batch.

    Args:
        campsite_data: Iterable of raw campsite records.

    Returns:
        A list of processed records, in input order.
    """
    # The original loop had no body (a syntax error); this completes it with
    # the per-record transform defined in the earlier cell.
    return [process_campsite(site) for site in campsite_data]

In [23]:
states = ['OR']
facilities_json = get_facilities('OR')
facility_ids = [f['FacilityID'] for f in facilities_json]
num_facilities = len(facility_ids)
# Aim for roughly ten batches. Clamp to 1: with fewer than ten facilities the
# integer division gives 0, and range() rejects a zero step.
step = max(1, num_facilities // 10)
# Keep every batch's result instead of discarding it.
campsite_batches = []
for index in range(0, num_facilities, step):
    campsite_batches.append(get_campsite_batch(facility_ids[index:index+step]))

Getting facilities for OR
num facilities: 51, step: 5
getting indicies 0:5
['265224', '243702', '251894', '236934', '246009']
getting indicies 5:10
['251610', '10005523', '10078676', '237138', '264652']
getting indicies 10:15
['234274', '266138', '252440', '10001666', '203855']
getting indicies 15:20
['234161', '274330', '233259', '201777', '233707']
getting indicies 20:25
['237093', '248324', '250081', '251916', '255180']
getting indicies 25:30
['251944', '251991', '266406', '265397', '251986']
getting indicies 30:35
['271517', '243714', '10056001', '252308', '234185']
getting indicies 35:40
['233136', '267554', '248321', '251940', '266415']
getting indicies 40:45
['233326', '251782', '234144', '244119', '246056']
getting indicies 45:50
['237822', '251408', '248307', '262756', '237886']
getting indicies 50:55
['251434']


In [24]:
# Slicing past the end of the list yields an empty list instead of raising,
# so the last (short) batch needs no special bounds handling.
facility_ids[100:200]

[]

In [4]:
# Start with what we want as parameterized inputs and build from there
# NF_sites: (site name, site URL) pairs for the National Forest pages.
NF_sites = [
    ("East Lemolo Campground", "https://www.fs.usda.gov/recarea/umpqua/recarea/?recid=63492"),
    ("Magone Lake Campground", "https://www.fs.usda.gov/recarea/malheur/recarea/?recid=39964")
]
# State codes to fetch RIDB facilities for.
states = ['OR', 'WA', 'CA']

In [59]:



def get_campsites(facility_id):
    """Fetch the campsites of a single RIDB facility as a DataFrame.

    NOTE: this redefines the ``get_campsites`` from an earlier cell, which
    returned the raw records rather than a DataFrame.

    Args:
        facility_id: Facility identifier interpolated into the campsites URL.

    Returns:
        A DataFrame built from the response's ``RECDATA`` records. An empty
        DataFrame is returned when the facility has no campsites or the
        request does not return HTTP 200.
    """
    # Interpolate the base URL variable; previously its *name* was embedded in
    # the string as literal text, so every request went to a bogus URL.
    campsite_details_url = f"{ridb_facilities_url}/{facility_id}/campsites"
    response = RequestsMock.get(campsite_details_url, headers=headers)
    if response.status_code != 200:
        # Report status_code, the attribute already used for the check above.
        print(f"Unable to get result for facility_id {facility_id}, got {response.status_code} {response.reason}")
        return pd.DataFrame()
    campsites = json.loads(response.text)
    if len(campsites['RECDATA']) > 0:
        return pd.DataFrame(campsites['RECDATA'])
    return pd.DataFrame()


def get_ridb_camping(state):
    """Build one DataFrame of campsites joined to their facility data.

    Args:
        state: State code passed to ``get_facilities``.

    Returns:
        A DataFrame with one row per campsite: the campsite's ``FacilityID``,
        ``CampsiteID``, ``CampsiteName`` and ``ATTRIBUTES`` columns
        left-joined to the facility record on ``FacilityID``. An empty
        DataFrame is returned when no facility has campsites.
    """
    campsite_columns = ['FacilityID', 'CampsiteID', 'CampsiteName', 'ATTRIBUTES']
    # Collect per-facility frames and concatenate once: DataFrame.append
    # copied the whole frame each iteration and was removed in pandas 2.0.
    frames = []
    for facility in get_facilities(state):
        facility_id = facility.get('FacilityID')
        if facility_id is None:
            continue
        df_campsites = get_campsites(facility_id)
        if df_campsites.empty:
            continue
        # Wrap the record in a list so it becomes exactly one row; passing the
        # bare dict makes pandas treat each value as a column of its own.
        df_facility = pd.DataFrame([facility])
        frames.append(df_campsites[campsite_columns].merge(df_facility, on='FacilityID', how='left'))
    # pd.concat raises on an empty list, so fall back to an empty frame.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)
        
def get_campsites_for(facilities):
    """Fetch campsites for every facility ID in ``facilities``.

    Args:
        facilities: Iterable of facility IDs.

    Returns:
        A list holding one ``get_campsites`` result per ID, in input order.
        (The results were previously fetched and then discarded.)
    """
    return [get_campsites(facility_id) for facility_id in facilities]
    

In [60]:
# Build the combined campsite + facility frame for Oregon.
df = get_ridb_camping('OR')

Getting facilities for OR


In [52]:
# Write the combined frame to disk with orient='records'.
# NOTE(review): the absolute /tmp path is machine-specific -- consider a
# configurable output directory.
df.to_json("/tmp/ridb_or.json", orient='records')

In [15]:
# get_facilities is a plain blocking function, not a coroutine, so
# asyncio.create_task() cannot wrap its result. Submit each call to the
# default thread-pool executor instead; the returned futures are awaitable
# and let the per-state requests run concurrently.
loop = asyncio.get_running_loop()
tasks = [loop.run_in_executor(None, get_facilities, state) for state in states]

In [16]:
# Wait for every per-state request to finish; gather returns the results in
# the same order as `tasks`.
res = await asyncio.gather(*tasks)

Getting facilities for OR
Getting facilities for WA
Getting facilities for CA
