# Batch HydroLink 
### Developed By: Daniel Wieferich (USGS)
### 20171005
### Last updated: 20180824

#### Note: This code is a work in progress.  Any suggestions and comments are welcomed.


#### This code performs a hydrolink batch process on a text file (.csv) or shapefile (.shp) using user-defined latitude and longitude fields.  The code returns the reachcode and measure of the closest position on the High Resolution National Hydrography Dataset (NHD), the Medium Resolution NHDPlusV2.1, or both versions of NHD using web services (no downloading of NHD data is necessary).  The code also uses snap distance and stream name to help quantify a level of certainty.  Levels of certainty are a work in progress.

#### Next steps: 
1. Work with the NHD High Resolution team to see if additional information can be returned from services to help with assigning more levels of certainty. (in progress)
2. It would be great to get a measure of distance to closest confluence as a measure of certainty.  Through manual linkage in the past I've noticed that points within 50m of a confluence are more likely to link to the wrong reach.
3. The current code assumes lat/lon are in NAD83.  The code should be altered to accept various crs inputs.
4. Improve upon error checking and notifications
5. Allow for join of hydro-linked data to original dataset?

In [1]:
#The import_file function deals with importing csv or shps into a dataframe. It makes sure that all fields are 
#in the file and converts the file into a dataframe with a standard set of field names.
def import_file(input_file, latitude_field, longitude_field, stream_name_field, identifier_field):
    """Read a .csv or .shp file and return it with a standard set of field names.

    Parameters:
        input_file: path of the file to read; the extension (any letter case)
            decides whether it is read as a csv or as a shapefile.
        latitude_field, longitude_field, stream_name_field, identifier_field:
            names of the columns in the file, case sensitive.

    Returns:
        A dataframe holding only the columns 'id', 'lat', 'lon' and 'stream'
        (in that order), or None when the file type is not accepted, the file
        cannot be read, or one of the requested fields is missing. A message
        explaining the problem is printed before None is returned.
    """
    import pandas as pd

    file_name = input_file.lower()

    #Check to see if the file is a csv file
    if file_name.endswith('.csv'):
        print ('\n' + 'reading csv file' +'\n')
        try:
            df = pd.read_csv(input_file)
        except Exception as error:
            print ('file did not properly import, verify file name and rerun')
            print ('details: ' + str(error))
            return None

    #If input file is not a csv check to see if it is a shapefile
    elif file_name.endswith('.shp'):
        print('\n' + 'reading shapefile' + '\n')
        try:
            #geopandas is only needed for shapefiles, so csv input works without it
            import geopandas as gpd
            df = gpd.GeoDataFrame.from_file(input_file)
        except Exception as error:
            print ('file did not properly import, verify file name and rerun')
            print ('details: ' + str(error))
            return None

    #If input file is not a csv or shapefile tell the user that the file type is not accepted
    else:
        print('File type not currently accepted. Please try .csv or .shp')
        return None

    #Every requested field has to exist before the columns can be selected and renamed
    requested_fields = [identifier_field, latitude_field, longitude_field, stream_name_field]
    missing_fields = [field for field in requested_fields if field not in df]
    if missing_fields:
        print ('verify field names and rerun')
        print ('fields not found: ' + ', '.join(str(field) for field in missing_fields))
        return None

    df = df[requested_fields].copy()
    df = df.rename(columns={identifier_field: 'id', latitude_field: 'lat', longitude_field: 'lon', stream_name_field: 'stream'})
    return df
        
#Using initial lat,lon (in NAD83 ... crs 4269) to link to High Resolution NHD
def hydrolink_hr(lat,lon,input_id):
    """Link one point to the High Resolution NHD through the HEM web services.

    Parameters:
        lat, lon: coordinates of the point; they are sent to the service with
            spatial reference wkid 4269.
        input_id: identifier of the point, only used in the failure message.

    Returns:
        The tuple (reachcode_hr, meas_hr, smdate_hr, perm_id_hr, xy_hr,
        gnis_name_hr). Values that could not be determined keep their null
        placeholders ('NO REACHCODE', -999, ' ', 'NO PERM ID', '-999', ' ').
        The service is only queried when 24 < lat < 50 and -125 < lon < -66;
        missing, non-numeric or out-of-range coordinates return the
        placeholders without any request being made.
    """
    base_url_hr = 'https://edits.nationalmap.gov/arcgis/rest/services/HEM/NHDHigh/MapServer/'
    get_hr_reach = base_url_hr + 'exts/Vwe_HEM_Soe/HEMGetReachcodeFromXY'
    get_hr_xy = base_url_hr + 'exts/Vwe_HEM_Soe/HEMPointEvents'

    #Define variables, set initially as null values using similar denotation as the HydroLink Tool
    reachcode_hr = 'NO REACHCODE'
    meas_hr = -999
    smdate_hr = ' '
    perm_id_hr = 'NO PERM ID'
    xy_hr = '-999'
    gnis_name_hr = ' '

    #Coordinates that are missing or not numeric cannot be linked, return the null values
    try:
        lat_value = float(lat)
        lon_value = float(lon)
    except (TypeError, ValueError):
        return (reachcode_hr,meas_hr,smdate_hr,perm_id_hr,xy_hr,gnis_name_hr)

    #Only points inside this latitude/longitude window are sent to the service (NaN fails both tests)
    if not (24 < lat_value < 50 and -125 < lon_value < -66):
        return (reachcode_hr,meas_hr,smdate_hr,perm_id_hr,xy_hr,gnis_name_hr)

    #Structure original lat/lon into format needed for HEM SOE
    xy = '{"x":' + str(lon) + ',"y":' + str(lat) + ', "spatialReference": {"wkid":4269}}'

    payload = {
        "point": xy ,
        "selectionLayerName": "NHDFLOWLINE",
        "selectionType": "TOPDOWNSTREAM",
        "searchToleranceMeters": 1000,
        "outWKID": 4269,
        "f": "json"}

    #Both requests sit inside the try so that a network or parsing problem on one point
    #does not stop the whole batch. Values assigned before a failure are still returned.
    try:
        #Connects to web service
        #NOTE(review): verify=False disables certificate checking - confirm this is still required
        #The timeout keeps one unresponsive request from hanging the batch forever
        r = requests.post(get_hr_reach,params=payload,verify=False,timeout=60).json()

        if r['resultStatus'] == 'success' and r['features']:
            reachcode_hr = r['features'][0]['attributes']['REACHCODE']
            payload2 = {
              "point": xy ,
              "reachcode": reachcode_hr,
              "searchToleranceMeters": 1000,
              "outWKID": 4269,
              "f": "json"}

            #Second request returns the measure and attributes of the closest position on that reach
            r2 = requests.post(get_hr_xy,params=payload2,verify=False,timeout=60).json()
            hr_feature = r2['features'][0]
            meas_hr = hr_feature['attributes']['MEASURE']
            smdate_hr = hr_feature['attributes']['REACHSMDATE']
            perm_id_hr = hr_feature['attributes']['PERMANENT_IDENTIFIER']
            xy_hr = hr_feature['geometry']
            gnis_name_hr = hr_feature['attributes']['GNIS_NAME']

    #Exception (not a bare except) so that the user can still interrupt a long batch
    except Exception:
        print ('Failed to process HR for: ' + str(input_id))

    return (reachcode_hr,meas_hr,smdate_hr,perm_id_hr,xy_hr,gnis_name_hr)

        
def hydrolink_mr2(lat,lon,input_id):
    """Link one point to the Medium Resolution NHDPlusV2.1 through the EPA point indexing service.

    Parameters:
        lat, lon: coordinates of the point, sent to the service as 'POINT(lon lat)'.
        input_id: identifier of the point (not used by the request itself).

    Returns:
        The tuple (snap_dist_mr2, xy_mr2, comid_mr2, reach_mr2, meas_mr2,
        gnis_name_mr2). When the request fails or the response holds no
        flowline, values that were not read keep their null placeholders
        (-999, 'NO COMID', 'NO REACHCODE', -999, '') and xy_mr2 stays the
        'POINT(lon lat)' text that was sent.
    """
    #Define variables, set initially as null values using similar denotation as the HydroLink Tool
    snap_dist_mr2 = -999
    comid_mr2 = 'NO COMID'
    reach_mr2 = 'NO REACHCODE'
    meas_mr2= -999
    gnis_name_mr2 = ''

    #Additional Info about medium resolution service : https://www.epa.gov/waterdata/point-indexing-service
    xy_mr2 = 'POINT(' + str(lon) + ' ' + str(lat) + ')'

    payload_mr2 = {
        'optNHDPlusDataset': '2.1',
        'pGeometry': xy_mr2,
        'pGeometryMod': 'SRID=8265',
        'pOutputPathFlag': 'TRUE',
        'pPointIndexingMaxDist': '1', #Kilometers
        'pPointIndexingMethod':'DISTANCE',
        'pResolution': '3',
        'pPointIndexingFcodeDeny' : [56600],
        'pReturnFlowlineGeomFlag':'FALSE',
        'f':'json'}

    base_url_mr2= 'https://ofmpub.epa.gov/waters10/PointIndexing.Service'

    #The request sits inside the try so that a network or parsing problem on one point
    #does not stop the whole batch. Values assigned before a failure are still returned.
    try:
        #Connects to web service
        #NOTE(review): verify=False disables certificate checking - confirm this is still required
        #The timeout keeps one unresponsive request from hanging the batch forever
        r_mr2 = requests.post(base_url_mr2,params=payload_mr2, verify=False, timeout=60).json()

        snap_dist_mr2 = r_mr2['output']['total_distance']
        xy_mr2 = r_mr2['output']['end_point']['coordinates']
        closest_flowline = r_mr2['output']['ary_flowlines'][0]
        comid_mr2 = closest_flowline['comid']
        reach_mr2 = closest_flowline['reachcode']
        meas_mr2 = closest_flowline['fmeasure']
        gnis_name_mr2 = closest_flowline['gnis_name']

    #Exception (not a bare except) so that the user can still interrupt a long batch.
    #A response without a linked flowline also ends up here, so nothing is printed.
    except Exception:
        pass

    return (snap_dist_mr2,xy_mr2,comid_mr2,reach_mr2,meas_mr2,gnis_name_mr2)
    

def clean_stream_name(stream):
    """Lower-case a stream name and expand common abbreviations.

    The abbreviations st, rv, trib, ck and br (optionally followed by a
    period) are replaced by stream, river, tributary, creek and branch.
    An abbreviation is only expanded when it is a whole word that follows
    whitespace, so the first word of the name (for example 'St. Croix') and
    words that merely start with those letters ('Stone') are left alone.
    The spacing and punctuation around the abbreviation are kept, so
    'mill ck' becomes 'mill creek'.

    Parameters:
        stream: stream name supplied by the user; empty or None values are
            returned unchanged.

    Returns:
        The cleaned, lower-case stream name.
    """
    import re

    if not stream:
        return stream

    #replace common abbreviations, this needs improvement but be careful not to replace strings we dont want to
    #this code currently assumes GNIS_NAME never contains abbreviations... something to verify
    expansions = {'st': 'stream', 'rv': 'river', 'trib': 'tributary', 'ck': 'creek', 'br': 'branch'}

    #(?<=\s) requires whitespace before the abbreviation without consuming it,
    #\b stops partial-word matches and \.? drops the period of the abbreviation
    abbreviation = r'(?<=\s)(st|rv|trib|ck|br)\b\.?'

    #remove case for case sensitive operations, then expand each abbreviation found
    return re.sub(abbreviation, lambda found: expansions[found.group(1)], stream.lower())
    
def stream_name_match(stream_clean, gnis_name):
    """Score how well a supplied stream name agrees with a GNIS name.

    Parameters:
        stream_clean: stream name supplied by the user, compared as given.
        gnis_name: GNIS name of the linked reach; it is lower-cased before
            the comparison.

    Returns:
        1 for an exact match or a similarity ratio of at least 0.75,
        0.5 for a ratio of at least 0.6 but below 0.75, and 0 otherwise.
        0 is also returned when either name is empty, or when the names are
        not identical and the supplied name contains 'tributary' or 'branch'.
    """
    import difflib

    #Without a name from both datasets the stream name cannot help improve certainty
    if not (stream_clean and gnis_name):
        return 0

    nhd_name = gnis_name.lower()

    if nhd_name == stream_clean:
        return 1

    #Names of tributaries and branches are not fuzzy matched, otherwise they
    #could be matched to the name of the main stem
    if 'tributary' in stream_clean or 'branch' in stream_clean:
        return 0

    #Spaces are treated as junk characters by the matcher
    similarity = difflib.SequenceMatcher(lambda ch: ch == " ", stream_clean, nhd_name).ratio()

    # The Python documentation (https://docs.python.org/3/library/difflib.html) gives a ratio()
    # over 0.6 as a rule of thumb for close matches; that is used here as the lower limit
    # and still has to be validated
    if similarity >= 0.75:
        return 1

    #this is likely a match but less certain
    if similarity >= 0.6:
        return 0.5

    return 0
                       
def mr_certainty(stream_clean, gnis_name_mr2, snap_dist_mr2):
    """Return certainty scores for a Medium Resolution (NHDPlusV2) linkage.

    Parameters:
        stream_clean: cleaned stream name supplied by the user.
        gnis_name_mr2: GNIS name of the linked reach.
        snap_dist_mr2: distance in kilometers between the original point and
            the linked position; -999 or an empty value means no linkage.

    Returns:
        The tuple (gnis_cert_mr2, dist_cert_mr2). gnis_cert_mr2 is the result
        of stream_name_match. dist_cert_mr2 is 1 when the snap distance is at
        most 0.050 km, 0.5 when it is above 0.050 km and at most 0.200 km,
        and 0 for larger, missing or -999 distances.
    """
    gnis_cert_mr2 = stream_name_match(stream_clean, gnis_name_mr2)

    #--------------------------------------------------------------------
    #Distance between original lat/lon and reach/measure linkage
    #Note: snapDist is returned in Kilometers

    #An empty value (but not a distance of exactly 0) means no distance was returned
    if not snap_dist_mr2 and snap_dist_mr2 != 0:
        dist_cert_mr2 = 0

    elif snap_dist_mr2 == -999:
        dist_cert_mr2 = 0

    elif snap_dist_mr2 <= 0.050:
        dist_cert_mr2 = 1

    #Between 50 and 200 meters. The previous test had its bounds swapped
    #(0.200 <= distance < 0.050), so this level could never be assigned.
    elif snap_dist_mr2 <= 0.200:
        dist_cert_mr2 = 0.5

    else:
        dist_cert_mr2 = 0

    #ADD CODE TO CHECK DISTANCE TO CLOSEST CONFLUENCE, if less than ?50m? flag as needs visual confirmation
    return gnis_cert_mr2, dist_cert_mr2

#Gives a level of certainty for a High Resolution linkage. For now the only measure is
#the supplied stream name vs. the gnis name of the hydrolinked reach (this could use some work).
#The distance between the original lat/lon and the linked position isn't currently supplied by
#the HR service, although the HR team is aware of this and plans to add it to the service.
def hr_certainty(stream_clean, gnis_name_hr):
    """Return the stream name certainty (gnis_cert_hr) for a High Resolution linkage.

    Parameters:
        stream_clean: cleaned stream name supplied by the user.
        gnis_name_hr: GNIS name of the linked High Resolution reach.

    Returns:
        The value returned by stream_name_match for the two names.
    """
    return stream_name_match(stream_clean, gnis_name_hr)
        

In [2]:
#Import packages needed for code to run
import requests
import json
import geopandas as gp
import pandas as pd
import warnings

warnings.filterwarnings("ignore")

In [3]:
#Set variables needed to run the code


#User inputs variables needed for code to run
#Whitespace around the file name is removed so that a stray space does not defeat the extension check
input_file = input("Enter file name, including extension (only accepts .csv and .shp): ").strip()
latitude_field = input("Enter field name for latitude, note this is case sensitive: ")
longitude_field = input("Enter field name for longitude, note this is case sensitive: ")
stream_name_field = input("Enter field name for stream name, note this is case sensitive: ")
identifier_field = input("Enter field name for identifier, note this is case sensitive: ")

#The choice is stored lower-case and without surrounding whitespace so answers such as 'Both ' are accepted
run_on = input("Run batch hydrolink on: 'nhdplusv2', 'nhdhr', 'both': ").strip().lower()

#Alternative to user input, information can be hard coded here
#input_file = 'testData/test2.shp'
#latitude_field = 'DamLatitud'
#longitude_field = 'DamLongitu'
#stream_name_field = 'DamRiverNa'
#identifier_field = 'DamName'

#run_on = 'both'


Enter file name, including extension (only accepts .csv and .shp): testData/test2.shp
Enter field name for latitude, note this is case sensitive: DamLatitud
Enter field name for longitude, note this is case sensitive: DamLongitu
Enter field name for stream name, note this is case sensitive: DamRiverNa
Enter field name for identifier, note this is case sensitive: DamName
Run batch hydrolink on: 'nhdplusv2', 'nhdhr', 'both'Both


In [4]:
#Set variable list to store output data as it is processed
out_data = []

#Import the user's file into a dataframe with the standard fields id, lat, lon and stream
df = import_file(input_file, latitude_field, longitude_field, stream_name_field, identifier_field)

#Normalize the choice once so the tests below ignore letter case and surrounding whitespace
run_mode = run_on.strip().lower()

if run_mode not in ('nhdhr', 'nhdplusv2', 'both'):
    print ('''Process failed. Please restart process and ensure to specify which version(s) of nhd to run against.  Use only the three options provided: "nhdplusv2", "nhdhr", "both" ''')

#import_file returns None when the file could not be used, in that case there is nothing to loop over
elif df is None:
    print ('No input data to process. Verify the input file and field names and rerun.')

else:
    link_hr = run_mode in ('nhdhr', 'both')
    link_mr2 = run_mode in ('nhdplusv2', 'both')

    for row in df.itertuples():

        #Define variables based on row,field values
        lat = row.lat
        lon = row.lon
        input_id = row.id

        #A missing stream name is treated as empty text rather than the literal text 'nan'
        stream = '' if pd.isnull(row.stream) else str(row.stream)

        print ('working on : ' + str(input_id))

        stream_clean = clean_stream_name(stream)

        #record data for this row, the list of records will be used to create the dataframe
        record = {"id":input_id}

        if link_hr:
            reachcode_hr,meas_hr,smdate_hr,perm_id_hr,xy_hr,gnis_name_hr = hydrolink_hr(lat,lon,input_id)
            gnis_cert_hr = hr_certainty(stream_clean, gnis_name_hr)
            record.update({"reach_hr":reachcode_hr,"meas_hr":meas_hr,"smdate_hr":smdate_hr, "perm_id_hr": perm_id_hr, "xy_hr":xy_hr, "gnis_name_hr":gnis_name_hr, "gnis_cert_hr":gnis_cert_hr})

        if link_mr2:
            snap_dist_mr2,xy_mr2,comid_mr2,reach_mr2,meas_mr2,gnis_name_mr2 = hydrolink_mr2(lat,lon,input_id)
            gnis_cert_mr2, dist_cert_mr2 = mr_certainty(stream_clean, gnis_name_mr2, snap_dist_mr2)
            record.update({"reach_mr2":reach_mr2,"meas_mr2":meas_mr2,"comid_mr2":comid_mr2,"xy_mr2":xy_mr2,"gnis_cert_mr2":gnis_cert_mr2, "dist_cert_mr2":dist_cert_mr2, "gnis_name_mr2": gnis_name_mr2})

        out_data.append(record)


#Create Dataframe with hydro-link data (empty when nothing could be processed)
out_data_df = pd.DataFrame(out_data)


reading shapefile

working on : New River Light and Power Dam
working on : Fifth Ave Dam
working on : Homestead Dam
working on : Kentchurch Dam
working on : Main Street Dam
working on : Marvel Slab Dam
working on : Off-Billington Street Dam
working on : Shull's Mill Dam
working on : Lower Flume Dam
working on : Fell Spice Mill Dam
working on : Upper Flume Dam
working on : Great Works Dam
working on : Hemlock Dam
working on : Thompson Dam
working on : Mendaraz Dam
working on : Spruce Pine Dam
working on : Pelham Dam (Bartlett Fishrod Company Dam)
working on : Birch Run Dam
working on : San Clemente Dam
working on : Veazie Dam
working on : New Way Dam
working on : Robbins Dike Dam
working on : Inturia Dam


#### Note: Notice in the cell below that Homestead Dam links to two different stream names: 'California Brook' for NHD High Resolution (HR) and 'Ashuelot River' for NHDPlusV2.  This is a good indication that one of these is incorrect.  If you look at the certainty values you see that the NHDPlusV2 linkage is more likely to be correct (gnis_cert_mr2 = 1 and dist_cert_mr2 = 1), whereas the HR linkage has the lowest level of certainty (gnis_cert_hr = 0).  In this case you could bring up this record in The Hydrolink Tool (https://maps.usgs.gov/hydrolink), Google Earth, or other tools that let you visualize aerial imagery and geospatial information to help better understand which, if either, of the linkages is correct.

In [5]:
#Display the dataframe of hydro-link results (out_data_df) as the output of this cell
out_data_df

Unnamed: 0,comid_mr2,dist_cert_mr2,gnis_cert_hr,gnis_cert_mr2,gnis_name_hr,gnis_name_mr2,id,meas_hr,meas_mr2,perm_id_hr,reach_hr,reach_mr2,smdate_hr,xy_hr,xy_mr2
0,6892046,1,1.0,1.0,South Fork New River,South Fork New River,New River Light and Power Dam,90.9711,12.98281,38f8b2f8-3a4c-4fae-8818-ae654cc4d96d,05050001000438,05050001000438,1330560000000.0,"{'x': -81.65131131703453, 'y': 36.206204077134...","[-81.6461351020147, 36.2090409025584]"
1,5218153,1,0.5,0.5,Olentangy River,Olentangy River,Fifth Ave Dam,96.08017,43.66452,c39b58cb-5cb4-4150-8285-5921193b1986,05060001000211,05060001000211,987465600000.0,"{'x': -83.01631471491578, 'y': 40.016446271219...","[-83.0244554424495, 39.9887805191651]"
2,6778181,1,0.0,1.0,California Brook,Ashuelot River,Homestead Dam,11.92958,38.59254,ab205ba3-fcdc-4b4d-869d-ddd0727710ca,01080201001052,01080201000081,936576000000.0,"{'x': -72.33851193148985, 'y': 42.866041200129...","[-72.3280017934781, 42.8716649453981]"
3,NO COMID,0,0.0,0.0,,,Kentchurch Dam,-999.0,-999.0,NO PERM ID,NO REACHCODE,NO REACHCODE,,-999,POINT(-2.865927 51.92675)
4,5218161,1,1.0,1.0,Scioto River,Scioto River,Main Street Dam,19.76997,78.34369,02157b51-4d57-4faf-928b-5038bb3249ff,05060001005352,05060001002526,1482240294000.0,"{'x': -83.00187538160486, 'y': 39.895869804740...","[-83.0076984876497, 39.9556309896144]"
5,21658128,1,0.0,1.0,,Cahaba River,Marvel Slab Dam,46.63952,52.65591,1b65f7a6-ae8b-4092-8e27-45e8fe7ff128,03150202004290,03150202003380,1330646400000.0,"{'x': -87.01924877536908, 'y': 33.160842481861...","[-87.0293324538523, 33.1651026027221]"
6,5878571,1,0.0,1.0,,Town Brook,Off-Billington Street Dam,80.16844,53.61827,3ddbf8ec-068d-4a28-9684-79d2ae2f6ffe,01090002023089,01090002001624,1331078400000.0,"{'x': -70.66999320074638, 'y': 41.955515068209...","[-70.6737186376754, 41.9489482229312]"
7,19743560,1,0.0,1.0,,Watauga River,Shull's Mill Dam,46.731,73.13481,cd2d85b3-ee97-49ec-84a7-e3d07a57878c,06010103005975,06010103000155,1324252800000.0,"{'x': -81.76012638353228, 'y': 36.184226610501...","[-81.7470035031778, 36.1831559685481]"
8,NO COMID,0,1.0,0.0,Red Brook,,Lower Flume Dam,50.79394,-999.0,34cdfa47-9f0e-4e68-999d-08f6e62324bc,01090002025256,NO REACHCODE,1331078400000.0,"{'x': -70.63545906746668, 'y': 41.767723868501...",POINT(-70.634066 41.765246)
9,4651912,1,1.0,1.0,Red Clay Creek,Red Clay Creek,Fell Spice Mill Dam,68.01146,21.38402,fe097fb9-7b58-4506-ab35-e9637dbef053,02040205000108,02040205000108,1331164800000.0,"{'x': -75.64159159302949, 'y': 39.762548404947...","[-75.6327053822628, 39.7500695852434]"


In [6]:
#Ask for the name of the output file and write the hydro-link results to it as a csv
out_file = input("Name of output file. No extension:  ").strip()

#Only add the extension when it was not typed already, this avoids names such as final_data.csv.csv
if not out_file.lower().endswith(".csv"):
    out_file = out_file + ".csv"

out_data_df.to_csv(out_file)

Name of output file. No extension:  final_data
