What is a pipeline?
- Work on code in discrete tasks
- Can increase throughput

insert pic of pipeline  

Question: What are some good candidates for splitting into different tasks ?


In [5]:
import asyncio
from csv import DictReader
import geopandas as gpd
import json
import pandas as pd
import itertools

from camping.mocks.request import RequestsMock
from camping.util.scraper import Scraper
from camping.util.distance import distance_merge

def max_col_width(w=100):
    pd.set_option('display.max_colwidth', w)

ridb_facilities_url = "https://ridb.recreation.gov/api/v1/facilities"

In [3]:
# Exploration code for reference
ridb_facilities_url = "https://ridb.recreation.gov/api/v1/facilities"
params = {"activity_id":9, "state":"OR"}
headers = {"accept": "application/json", "apikey": "key"}


response = RequestsMock.get(ridb_facilities_url, params, headers=headers)
camping_json  = json.loads(response.text)

# Do we really need the campgrounds in a dataframe?
df_ridb_camping = pd.DataFrame(camping_json['RECDATA'])

campground_info = pd.DataFrame()
for facility in camping_json['RECDATA']:
    if facility.get('FacilityID') is not None:
        campground_url = f"{ridb_facilities_url}/{facility['FacilityID']}/campsites"
        resp = RequestsMock.get(campground_url, headers=headers)
        if resp.status_code != 200:
            continue
        
        campsites = json.loads(resp.text)
        if len(campsites['RECDATA']) > 0:
            df_campsites = pd.DataFrame(campsites['RECDATA'])
            campground_info = campground_info.append(df_campsites[['FacilityID', 'CampsiteID', 'CampsiteName', 'ATTRIBUTES']].merge(df_ridb_camping, on='FacilityID', how='left'))
            
nf_data = []
with open('../data/NF_sites/OR_sitelist.csv') as f:
    reader = DictReader(f)
    for row in reader:
        sc = Scraper(row['site_url'], row['site_name'])
        nf_data.append(sc.scrape())
nf_df = pd.DataFrame(nf_data)  
merged = distance_merge(nf_df, campground_info, 2000, 'ridb', 'nf')

  return _prepare_from_string(" ".join(pjargs))
  return _prepare_from_string(" ".join(pjargs))


What can we parallelize ?
* By data source - NF can run independently of RIDB
* Within data source - Create batches

What makes sense as pipeline steps?
* Extracting the data from source
* Transforming campsite data
* Merging NF and RIDB

Configuration for scaling
* State
* NF urls
Add graphic of proposed pipeline

While Oregon is great, we might want to visit other states. How can we design an interface that will enable us to easily expand to other states?

In [4]:
# Start with what we want as paramaterized inputs and build from there
NF_sites = [
    ("East Lemolo Campground", "https://www.fs.usda.gov/recarea/umpqua/recarea/?recid=63492"),
    ("Magone Lake Campground", "https://www.fs.usda.gov/recarea/malheur/recarea/?recid=39964")
]
states = ['OR', 'WA', 'CA']

In [47]:
# use async to illustrate parallelization - will work for NF sites but not others ?
def get_facilities(state):
    params["state"] = state 
    response = RequestsMock.get(ridb_facilities_url, params, headers=headers)
#     await asyncio.sleep(2)
    if response.status_code == 200:
        print(f"Getting facilities for {state}")
        result = json.loads(response.text)
        return result['RECDATA']
    print(f"Unable to get result for state {state}, got {response.reason}")
    return {}


def get_campsites(facility_id):
    campsite_details_url = f"ridb_facilities_url/{facility_id}/campsites"
    print(f"Getting result for {campsite_details_url}")
    response = RequestsMock.get(campsite_details_url, headers=headers)
    if response.status_code == 200:
        campsites = json.loads(response.text)
        if len(campsites['RECDATA']) > 0:
             return pd.DataFrame(campsites['RECDATA'])
    print(f"Unable to get result for facility_id {facility_id}, got {response.reason}")
    return pd.DataFrame()


def get_ridb_camping(state):
    campground_info = pd.DataFrame()
    facilities = get_facilities(state)
    for facility in facilities:
        if facility.get('FacilityID') is not None:
            print(f"processing facility {facility['FacilityName']}")
            df_facility = pd.DataFrame(facility)
            df_campsites = get_campsites(facility.get('FacilityID'))
            if not df_campsites.empty:
                campground_info = campground_info.append(df_campsites[['FacilityID', 'CampsiteID', 'CampsiteName', 'ATTRIBUTES']].merge(df_facility, on='FacilityID', how='left'))
    return campground_info
        
def get_campsites_for(facilities):
    for facility_id in facilities:
        get_campsites(facility_id)
    

In [48]:
df = get_ridb_camping('OR')

Getting facilities for OR
processing facility Glendale - Powers Bicycle Recreation Area
Getting result for ridb_facilities_url/265224/campsites
Unable to get result for facility_id 265224, got OK
processing facility Head O' Boulder Forest Camp
Getting result for ridb_facilities_url/243702/campsites
Unable to get result for facility_id 243702, got OK
processing facility EAST LEMOLO CAMPGROUND
Getting result for ridb_facilities_url/251894/campsites
processing facility Ochoco NF-Maury Mountains Area
Getting result for ridb_facilities_url/236934/campsites
Unable to get result for facility_id 236934, got OK
processing facility Blue Mountain Byway East Interpretive Site
Getting result for ridb_facilities_url/246009/campsites
Unable to get result for facility_id 246009, got OK
processing facility Oregon Caves National Monument and Preserve Tours
Getting result for ridb_facilities_url/251610/campsites
Unable to get result for facility_id 251610, got OK
processing facility Jackman Park Campgrou

In [50]:
df.to_json("/tmp/ridb-")

Unnamed: 0,FacilityID,CampsiteID,CampsiteName,ATTRIBUTES,LegacyFacilityID,OrgFacilityID,ParentOrgID,ParentRecAreaID,FacilityName,FacilityDescription,...,FacilityMapURL,FacilityAdaAccess,GEOJSON,FacilityLongitude,FacilityLatitude,Keywords,StayLimit,Reservable,Enabled,LastUpdatedDate
0,251894,98358,008,"[{'AttributeName': 'Location Rating', 'Attribu...",135642,AN435642,131,1112,EAST LEMOLO CAMPGROUND,<h2>Overview</h2>\nEast Lemolo is on the banks...,...,,N,"[-122.1980222, 43.3127806]",-122.198022,43.312781,East Lemola Campground,,True,True,2021-04-14
1,251894,98358,008,"[{'AttributeName': 'Location Rating', 'Attribu...",135642,AN435642,131,1112,EAST LEMOLO CAMPGROUND,<h2>Overview</h2>\nEast Lemolo is on the banks...,...,,N,Point,-122.198022,43.312781,East Lemola Campground,,True,True,2021-04-14
2,251894,98441,014,"[{'AttributeName': 'Picnic Table', 'AttributeV...",135642,AN435642,131,1112,EAST LEMOLO CAMPGROUND,<h2>Overview</h2>\nEast Lemolo is on the banks...,...,,N,"[-122.1980222, 43.3127806]",-122.198022,43.312781,East Lemola Campground,,True,True,2021-04-14
3,251894,98441,014,"[{'AttributeName': 'Picnic Table', 'AttributeV...",135642,AN435642,131,1112,EAST LEMOLO CAMPGROUND,<h2>Overview</h2>\nEast Lemolo is on the banks...,...,,N,Point,-122.198022,43.312781,East Lemola Campground,,True,True,2021-04-14
4,251894,98438,004,"[{'AttributeName': 'Picnic Table', 'AttributeV...",135642,AN435642,131,1112,EAST LEMOLO CAMPGROUND,<h2>Overview</h2>\nEast Lemolo is on the banks...,...,,N,"[-122.1980222, 43.3127806]",-122.198022,43.312781,East Lemola Campground,,True,True,2021-04-14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,251434,96013,B002,"[{'AttributeName': 'Driveway Length', 'Attribu...",125541,AN425541,131,1106,LOST LAKE RESORT AND CAMPGROUND,<h2>Overview</h2>\n<p>Lost Lake Campground is ...,...,,N,Point,-121.821944,45.488889,,,True,True,2021-04-13
96,251434,96009,D004,"[{'AttributeName': 'Grills/Fire Ring', 'Attrib...",125541,AN425541,131,1106,LOST LAKE RESORT AND CAMPGROUND,<h2>Overview</h2>\n<p>Lost Lake Campground is ...,...,,N,"[-121.8219444, 45.4888889]",-121.821944,45.488889,,,True,True,2021-04-13
97,251434,96009,D004,"[{'AttributeName': 'Grills/Fire Ring', 'Attrib...",125541,AN425541,131,1106,LOST LAKE RESORT AND CAMPGROUND,<h2>Overview</h2>\n<p>Lost Lake Campground is ...,...,,N,Point,-121.821944,45.488889,,,True,True,2021-04-13
98,251434,96085,D015,"[{'AttributeName': 'Grills/Fire Ring', 'Attrib...",125541,AN425541,131,1106,LOST LAKE RESORT AND CAMPGROUND,<h2>Overview</h2>\n<p>Lost Lake Campground is ...,...,,N,"[-121.8219444, 45.4888889]",-121.821944,45.488889,,,True,True,2021-04-13


In [15]:
tasks = []
for state in states:
    tasks.append(asyncio.create_task(get_facilities(state)))

In [16]:
res = await asyncio.gather(*tasks)

Getting facilities for OR
Getting facilities for WA
Getting facilities for CA
