What is a pipeline?
- Splits the work into discrete tasks (steps)
- Can increase throughput

**TODO:** insert picture of pipeline

Question: What are some good candidates for splitting into different tasks?


In [5]:
import asyncio
from csv import DictReader
import geopandas as gpd
import json
import pandas as pd
import itertools

from camping.mocks.request import RequestsMock
from camping.util.scraper import Scraper
from camping.util.distance import distance_merge

def max_col_width(w=100):
    """Set the pandas ``display.max_colwidth`` option.

    Parameters
    ----------
    w : int, optional
        New maximum column width used when pandas renders a frame
        (default 100).
    """
    pd.options.display.max_colwidth = w

# Base URL of the RIDB facilities endpoint; per-facility campsite URLs are built on top of it.
ridb_facilities_url = "https://ridb.recreation.gov/api/v1/facilities"

In [3]:
# Exploration code for reference
ridb_facilities_url = "https://ridb.recreation.gov/api/v1/facilities"
params = {"activity_id":9, "state":"OR"}
# "key" is a placeholder; a real API key should come from the environment, not the notebook.
headers = {"accept": "application/json", "apikey": "key"}


# Fetch the facilities matching `params` and decode the JSON payload.
response = RequestsMock.get(ridb_facilities_url, params, headers=headers)
camping_json  = json.loads(response.text)

# Do we really need the campgrounds in a dataframe?
df_ridb_camping = pd.DataFrame(camping_json['RECDATA'])

# Collect one merged frame per facility and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop copies all previous rows on every iteration.
campsite_frames = []
for facility in camping_json['RECDATA']:
    if facility.get('FacilityID') is None:
        continue

    campground_url = f"{ridb_facilities_url}/{facility['FacilityID']}/campsites"
    resp = RequestsMock.get(campground_url, headers=headers)
    if resp.status_code != 200:
        # Skip facilities whose campsite lookup failed.
        continue

    campsites = json.loads(resp.text)
    if len(campsites['RECDATA']) > 0:
        df_campsites = pd.DataFrame(campsites['RECDATA'])
        # Attach the facility-level columns to every campsite row.
        campsite_frames.append(
            df_campsites[['FacilityID', 'CampsiteID', 'CampsiteName', 'ATTRIBUTES']]
            .merge(df_ridb_camping, on='FacilityID', how='left')
        )

# pd.concat raises on an empty list, so fall back to the empty frame the
# original loop started from.
campground_info = pd.concat(campsite_frames) if campsite_frames else pd.DataFrame()
            
# Scrape every site listed in the Oregon site list, one record per CSV row.
with open('../data/NF_sites/OR_sitelist.csv') as sitelist:
    nf_data = [
        Scraper(entry['site_url'], entry['site_name']).scrape()
        for entry in DictReader(sitelist)
    ]

nf_df = pd.DataFrame(nf_data)
# NOTE(review): 2000 is passed positionally to distance_merge — presumably a
# distance threshold; confirm its meaning and units against the helper.
merged = distance_merge(nf_df, campground_info, 2000, 'ridb', 'nf')

  return _prepare_from_string(" ".join(pjargs))
  return _prepare_from_string(" ".join(pjargs))


What can we parallelize?
* By data source - NF can run independently of RIDB
* Within data source - Create batches

What makes sense as pipeline steps?
* Extracting the data from source
* Transforming campsite data
* Merging NF and RIDB

Configuration for scaling
* State
* NF URLs

**TODO:** add graphic of proposed pipeline

While Oregon is great, we might want to visit other states. How can we design an interface that will enable us to easily expand to other states?

In [4]:
# Start with what we want as parameterized inputs and build from there
# Each entry is a (site name, site URL) pair.
NF_sites = [
    ("East Lemolo Campground", "https://www.fs.usda.gov/recarea/umpqua/recarea/?recid=63492"),
    ("Magone Lake Campground", "https://www.fs.usda.gov/recarea/malheur/recarea/?recid=39964")
]
# State codes to query; each one is passed to get_facilities as its `state` argument.
states = ['OR', 'WA', 'CA']

In [59]:
# use async to illustrate parallelization - will work for NF sites but not others ?
def get_facilities(state):
    """Fetch the RIDB facility records for one state.

    Parameters
    ----------
    state : str
        State code sent as the ``state`` query parameter (e.g. ``'OR'``).

    Returns
    -------
    list
        The value stored under ``RECDATA`` in the response, or an empty list
        when the request does not come back with status 200.
    """
    # Build per-call query parameters instead of writing into the shared
    # module-level `params` dict, so one call cannot leak its state into the
    # next (or into a concurrently running call).
    query = {**params, "state": state}
    response = RequestsMock.get(ridb_facilities_url, query, headers=headers)
    if response.status_code == 200:
        print(f"Getting facilities for {state}")
        result = json.loads(response.text)
        return result['RECDATA']
    print(f"Unable to get result for state {state}, got {response.reason}")
    # Empty list (not dict) so the failure path has the same type as the
    # success path; callers that only iterate are unaffected.
    return []


def get_campsites(facility_id):
    """Fetch the campsites of one RIDB facility as a DataFrame.

    Parameters
    ----------
    facility_id
        Facility identifier inserted into the campsites URL.

    Returns
    -------
    pandas.DataFrame
        One row per entry under ``RECDATA`` in the response; an empty frame
        when there are no entries or the request does not return status 200.
    """
    # The base URL must be interpolated: without the braces the request went
    # to the literal text "ridb_facilities_url/<id>/campsites".
    campsite_details_url = f"{ridb_facilities_url}/{facility_id}/campsites"
    response = RequestsMock.get(campsite_details_url, headers=headers)
    if response.status_code != 200:
        # Report `status_code`, the same attribute that is checked above.
        print(f"Unable to get result for facility_id {facility_id}, got {response.status_code} {response.reason}")
        return pd.DataFrame()

    campsites = json.loads(response.text)
    if len(campsites['RECDATA']) > 0:
        return pd.DataFrame(campsites['RECDATA'])
    return pd.DataFrame()


def get_ridb_camping(state):
    """Build a campsite-level DataFrame for every RIDB facility in a state.

    Parameters
    ----------
    state : str
        State code forwarded to ``get_facilities``.

    Returns
    -------
    pandas.DataFrame
        The ``FacilityID``, ``CampsiteID``, ``CampsiteName`` and ``ATTRIBUTES``
        campsite columns left-joined with the columns of the owning facility.
        Empty when no facility yields any campsites.
    """
    merged_frames = []
    for facility in get_facilities(state):
        facility_id = facility.get('FacilityID')
        if facility_id is None:
            continue

        df_campsites = get_campsites(facility_id)
        if df_campsites.empty:
            continue

        # Wrap the record in a list so it becomes exactly one row; passing the
        # bare dict makes pandas treat each value as a whole column.
        df_facility = pd.DataFrame([facility])
        merged_frames.append(
            df_campsites[['FacilityID', 'CampsiteID', 'CampsiteName', 'ATTRIBUTES']]
            .merge(df_facility, on='FacilityID', how='left')
        )

    # Concatenate once: DataFrame.append was removed in pandas 2.0 and
    # re-copied every accumulated row on each iteration. pd.concat raises on
    # an empty list, hence the explicit empty-frame fallback.
    if not merged_frames:
        return pd.DataFrame()
    return pd.concat(merged_frames)
        
def get_campsites_for(facilities):
    """Fetch the campsites for a batch of facility ids.

    Parameters
    ----------
    facilities : iterable
        Facility ids; each one is passed to ``get_campsites``.

    Returns
    -------
    list
        The ``get_campsites`` result for each id, in input order. The results
        were previously discarded; callers that ignore the return value are
        unaffected.
    """
    return [get_campsites(facility_id) for facility_id in facilities]
    

In [60]:
# Run the RIDB extraction for Oregon.
df = get_ridb_camping('OR')

Getting facilities for OR


In [52]:
# Write the frame out as JSON using the 'records' orientation.
# NOTE(review): the output path is a hardcoded /tmp location — consider a configurable data directory.
df.to_json("/tmp/ridb_or.json", orient='records')

In [15]:
async def _facilities_task(state):
    """Coroutine wrapper that lets get_facilities be scheduled as an asyncio task."""
    result = get_facilities(state)
    # Also works if get_facilities is (re)defined as `async def`: await the
    # coroutine it returns in that case.
    if asyncio.iscoroutine(result):
        result = await result
    return result


# asyncio.create_task() only accepts a coroutine. Passing it the return value
# of the synchronous get_facilities raises TypeError, so each call is wrapped.
# While get_facilities stays a blocking function, the tasks still execute one
# after another on the event loop.
tasks = []
for state in states:
    tasks.append(asyncio.create_task(_facilities_task(state)))

In [16]:
# Wait for every scheduled task to finish and collect their results.
res = await asyncio.gather(*tasks)

Getting facilities for OR
Getting facilities for WA
Getting facilities for CA
