# Batch HydroLink 
### Developed By: Daniel Wieferich (USGS)
### 20171005
### Last updated: 20191113

#### Note: This code is a work in progress.  Any suggestions and comments are welcomed.


#### This code performs a hydrolink batch process on a text file (.csv ) or shapefile (.shp) using user defined latitude and longitude fields.  The code returns the reachcode and measure of the closest position on the High Resolution National Hydrography Dataset, the Medium Resolution NHDPlusV2.1, or both versions of NHD using web services (no downloading of NHD data is necessary).  The code also uses snap distance and stream name to help quantify a level of certainty.  Levels of certainty are a work in progress.

#### Next steps: 
1. Work with the NHD High Resolution team to see if additional information can be returned from services to help with assigning more levels of certainty. (in progress)
2. It would be great to get a measure of distance to closest confluence as a measure of certainty.  Through manual linkage in the past I've noticed that points within 50m of a confluence are more likely to link to the wrong reach.
3. The current code assumes lat/lon are in NAD83.  The code should be altered to accept various crs inputs.
4. Improve upon error checking and notifications
5. Allow for join of hydro-linked data to original dataset?

In [23]:
import requests
import geopandas as gpd
from shapely.geometry import Point, LineString

class HrHydroLink:
    def __init__(self, feature_id, init_lat, init_lon, stream_name):
        '''
        Description: Initiates High Resolution variables
        
        Input: x,y coordinate in NAD83 (CRS 4269)
        '''
        self.id = feature_id
        self.init_lat = init_lat
        self.init_lon = init_lon
        self.stream_name = stream_name
        self.message = ''
        self.get_hr_reach = 'https://edits.nationalmap.gov/arcgis/rest/services/HEM/NHDHigh/MapServer/exts/Vwe_HEM_Soe/HEMGetReachcodeFromXY'
        self.get_hr_xy = 'https://edits.nationalmap.gov/arcgis/rest/services/HEM/NHDHigh/MapServer/exts/Vwe_HEM_Soe/HEMPointEvents'
        self.hl_reachcode = None
        self.hl_meas = None
        self.hl_smdate = None
        self.hl_perm_id = None
        self.hl_lat = None
        self.hl_lon = None
        self.hl_gnis_name = None
        self.hl_snap_meters = None
        self.meters_to_node = None
        self.reach_info = None
        self.hr = None
        
        
    def hydrolink_hr(self):
        '''
        Description: HydroLink point location to the NHD High Resolution using SOE services. Uses similar methods as The HydroLink Tool (USGS)

        Input: x,y coordinate in NAD83 (CRS 4269)
        '''

        #Structure original lat/lon into format needed for HEM SOE
        xy = '{"x":' + str(self.init_lon) + ',"y":' + str(self.init_lat) + ', "spatialReference": {"wkid":4269}}'

        if self.init_lat and (float(self.init_lat) > 24 and float(self.init_lat)<50) and self.init_lon and (float(self.init_lon) < -66 and float(self.init_lon)> -125):
            payload = {
                "point": xy ,
                "selectionLayerName": "NHDFLOWLINE",
                "selectionType": "TOPDOWNSTREAM",
                "searchToleranceMeters": 1500,
                "outWKID": 4269,
                "f": "json"}


            try:
                #Connects to web service
                r = requests.post(self.get_hr_reach,params=payload,verify=False).json()
            except:
                self.message='get_hr_reach service call failed'

            if r['resultStatus'] == 'success' and r['features']:
                self.reach_info = r['features'][0]
                self.hl_reachcode = self.reach_info['attributes']['REACHCODE']
                payload2 = {
                  "point": xy ,
                  "reachcode": self.hl_reachcode,
                  "searchToleranceMeters": 1500,
                  "outWKID": 4269,
                  "f": "json"}

                self.hr_xy = requests.post(self.get_hr_xy,params=payload2,verify=False).json()
                try:
                    self.hl_meas = self.hr_xy['features'][0]['attributes']['MEASURE']
                    self.hl_smdate = self.hr_xy['features'][0]['attributes']['REACHSMDATE']
                    self.hl_perm_id = self.hr_xy['features'][0]['attributes']['PERMANENT_IDENTIFIER']
                    self.hl_lat = self.hr_xy['features'][0]['geometry']['y']
                    self.hl_lon = self.hr_xy['features'][0]['geometry']['x']
                    self.hl_gnis_name = self.hr_xy['features'][0]['attributes']['GNIS_NAME']

                except:
                    self.message='get_hr_xy service call failed'
        
        else:
            self.message='initial coordinates outside of U.S.'
            

    def distance_measures(self):
        '''
        Description: Gets two measures (in meters)
        Note: Should likely break these out into 2 functions
        '''
        
        if hasattr(self, 'hr_xy') and self.hr_xy['resultStatus']=='success':
            snap_line_geom = LineString([Point(self.hl_lon,self.hl_lat), Point(self.init_lon, self.init_lat)])
            geos = gpd.GeoSeries(snap_line_geom)
            geos.crs = {'init':'epsg:4269'}
            geos=geos.to_crs({'init':'epsg:5070'})
            self.hl_snap_meters = geos.length[0]
            
            if self.reach_info['geometry']['hasM']==True:
                path = self.reach_info['geometry']['paths']
                for node in path[0]:
                    if node[3]==0 or node[3]==100:
                        line_geom = LineString([Point(node[0],node[1]), Point(self.init_lon, self.init_lat)])
                        geos = gpd.GeoSeries(line_geom)
                        geos.crs = {'init':'epsg:4269'}
                        geos=geos.to_crs({'init':'epsg:5070'})
                        meters_to = geos.length[0]
                        if self.meters_to_node is None:
                            self.meters_to_node = meters_to
                        else:
                            if self.meters_to_node > meters_to:
                                self.meters_to_node = meters_to
                                
    def clean_stream_name(self):
        '''
        Description: replace common abbreviations, this needs improvement but be careful not to replace 
        strings we dont want to this code currently assumes GNIS_NAME never contains abbreviations... 
        something to verify. If you have a better way to do this let me know!!!!
        '''
        stream = self.stream_name
        stream_lower = stream.lower()
        stream_lower = stream_lower.replace(' st ', ' stream')
        stream_lower = stream_lower.replace(' st.', ' stream')
        stream_lower = stream_lower.replace(' rv', ' river')
        stream_lower = stream_lower.replace(' rv.', ' river')
        stream_lower = stream_lower.replace(' trib.', ' tributary')
        stream_lower = stream_lower.replace(' trib)', ' tributary')
        stream_lower = stream_lower.replace(' trib ', ' tributary')
        stream_lower = stream_lower.replace(' ck', ' creek')
        stream_lower = stream_lower.replace(' ck.', ' creek')
        stream_lower = stream_lower.replace(' br ', ' branch')
        stream_lower = stream_lower.replace(' br.', ' branch')
        self.stream_name = stream_lower
        print (self.stream_name)                         

In [24]:
import pandas as pd
import geopandas as gpd
import requests


def import_file(input_file, latitude_field, longitude_field, identifier_field, stream_name_field):
    '''
    Description: Imports CSV or SHP files into a Pandas dataframe. Makes sure that all fields 
    are in the file and converts the file into a dataframe with a standard set of field names.
    
    Input: CSV or Shapefile
    
    Output: Pandas Dataframe
    '''
    #Check to see if the file is a CSV file
    if input_file.endswith('.csv'):
        print ('\n' + 'reading csv file' +'\n')
        try:
            df = pd.read_csv(input_file)
        except KeyError:
            print ('file did not properly import, verify file name and rerun')
            #It would be nice here to reask for inputFileName and then restart at try statement
    
    #If input file is not a CSV check to see if it is a shapefile
    elif input_file.endswith('.shp'):
        print('\n' + 'reading shapefile' + '\n')
        try:
            df = gp.GeoDataFrame.from_file(input_file)
        except KeyError:
            print ('file did not properly import, verify file name and rerun')
            #It would be nice here to reask for inputFileName and then restart at try statement
    
    #If input file is not a CSV or shapefile tell the user that the file type is not excepted
    else:
        print('File type not currently accepted. Please try .csv or .shp')
        
    if latitude_field in df and longitude_field in df and stream_name_field in df and identifier_field in df:
        df = df[[identifier_field, latitude_field, longitude_field, stream_name_field]].copy()
        df = df.rename(columns={identifier_field: 'id', latitude_field: 'lat', longitude_field: 'lon', stream_name_field: 'stream'})
        return df
    else: 
        print ('verify field names and rerun')
        
        


    
def stream_name_match(stream_clean, gnis_name):
    import difflib
    
    #Stream Name Match
    if stream_clean and gnis_name:
        gnis_name = gnis_name.lower()

        if gnis_name == stream_clean:
            gnis_cert = 1
            
        #do not want name of main stem to be fuzzy matched.  To avoid remove those names with tributary or branch in them
        elif 'tributary' in stream_clean or 'branch' in stream_clean:
            gnis_cert = 0
            
        else:
            match_ratio = difflib.SequenceMatcher(lambda x: x == " ",stream_clean, gnis_name).ratio()
            
            # From Python Documentation (https://docs.python.org/3/library/difflib.html):
            #"As a rule of thumb, a ratio() value over 0.6 means the sequences are close matches:"
            # At some point we should validate this rule of thumb but figured this is a good starting place for a lower limit
            if match_ratio >= 0.75:
                gnis_cert = 1
                
            #this is likely a match but less certain
            elif 0.75 > match_ratio >= 0.6:
                gnis_cert = 0.5
              
            else:
                gnis_cert = 0
                
                
     #else no stream name is supplied for one or both datasets, therefor stream name does not help improve certainty
    else:
        gnis_cert = 0
    return gnis_cert
                       

#Attempts to give a level of certainty based on:
#1. supplied stream name vs. gnis name of hydrolinked reach  (this could use some work)
#2-3 coming soon, see notes below
def hr_name_certainty(stream_clean, gnis_name_hr):
    
    gnis_cert_hr = stream_name_match(stream_clean, gnis_name_hr)

    #--------------------------------------------------------------------   
    #Distance between original lat/lon and reach/measure linkage isn't currently supplied by HR service although HR
    #team is aware of this and plan to add that to the service.

    return gnis_cert_hr
        

In [25]:
#Import packages needed for code to run
import warnings

warnings.filterwarnings("ignore")


#Set variables needed to run the code

#User inputs variables needed for code to run
#input_file = input("Enter file name, including extension (only accepts .csv and .shp): ")
#latitude_field = input("Enter field name for latitude, note this is case sensitive: ")
#longitude_field = input("Enter field name for longitude, note this is case sensitive: ")
#stream_name_field = input("Enter field name for stream name, note this is case sensitive: ")
#identifier_field = input("Enter field name for identifier, note this is case sensitive: ")

#Alternative to user input, information can be hard coded here
input_file = 'test_co.csv'
latitude_field = 'y'
longitude_field = 'x'
stream_name_field = 'stream'
identifier_field = 'id'



In [26]:
#Set variable list to store output data as it is processed
out_data = []

df = import_file(input_file, latitude_field, longitude_field, identifier_field,stream_name_field)   

for row in df.itertuples():
    #Define variables based on row,field values
    lat = float(row.lat)
    lon = float(row.lon)
    input_id = row.id
    stream = str(row.stream)

    print ('working on : ' + str(input_id))

    item = HrHydroLink(input_id,lat,lon,stream)
    item.hydrolink_hr()
    item.distance_measures()
    item.clean_stream_name()

    #record data in outData, this will be used to create dataframe
    out_data.append({"id":input_id,"reach":item.hl_reachcode,"meas":item.hl_meas,"smdate":item.hl_smdate, "perm_id": item.hl_perm_id, "hl_lat":item.hl_lat, "hl_lon":item.hl_lon, "gnis_name":item.hl_gnis_name, "snap_meters":item.hl_snap_meters, "meters_to_node":item.meters_to_node,  "init_stream_nm":stream, "init_lat":lat ,"init_lon":lon,"init_stream":item.stream_name })



reading csv file

working on : 1
nan
working on : 2
clear creek
working on : 3
michigan


In [27]:
out_data

[{'id': 1,
  'reach': '10190004002236',
  'meas': 100,
  'smdate': 1329523200000,
  'perm_id': '3ba78374-0427-4fea-9151-71b30ab3ddde',
  'hl_lat': 39.73669080498735,
  'hl_lon': -105.17110408052713,
  'gnis_name': '',
  'snap_meters': 1166.4157467170157,
  'meters_to_node': 1166.4157467170157,
  'init_stream_nm': 'nan',
  'init_lat': 39.736459,
  'init_lon': -105.18483400000001,
  'init_stream': 'nan'},
 {'id': 2,
  'reach': '10190004002091',
  'meas': 0,
  'smdate': 1329523200000,
  'perm_id': 'f54e5db2-d0df-4c83-aad5-4937dde366a1',
  'hl_lat': 39.732607471660344,
  'hl_lon': -105.53436487996328,
  'gnis_name': 'Chicago Creek',
  'snap_meters': 1680.6801905211519,
  'meters_to_node': 1680.6801905211519,
  'init_stream_nm': 'clear ck',
  'init_lat': 39.740534000000004,
  'init_lon': -105.51756599999999,
  'init_stream': 'clear creek'},
 {'id': 3,
  'reach': '10190001000066',
  'meas': 63.20026,
  'smdate': 1451822201000,
  'perm_id': '25c23a8e-f064-4a50-91b0-57aa562e1c0f',
  'hl_lat': 

In [28]:
item.__dict__

{'id': 3,
 'init_lat': 39.381318,
 'init_lon': -105.85035400000001,
 'stream_name': 'michigan',
 'message': '',
 'get_hr_reach': 'https://edits.nationalmap.gov/arcgis/rest/services/HEM/NHDHigh/MapServer/exts/Vwe_HEM_Soe/HEMGetReachcodeFromXY',
 'get_hr_xy': 'https://edits.nationalmap.gov/arcgis/rest/services/HEM/NHDHigh/MapServer/exts/Vwe_HEM_Soe/HEMPointEvents',
 'hl_reachcode': '10190001000066',
 'hl_meas': 63.20026,
 'hl_smdate': 1451822201000,
 'hl_perm_id': '25c23a8e-f064-4a50-91b0-57aa562e1c0f',
 'hl_lat': 39.39855887217885,
 'hl_lon': -105.86901447944382,
 'hl_gnis_name': 'Michigan Creek',
 'hl_snap_meters': 2503.9255190823346,
 'meters_to_node': 586.3481636576274,
 'reach_info': {'geometry': {'hasZ': True,
   'hasM': True,
   'paths': [[[-105.85114301280487, 39.37960547220831, 0, 8.98344],
     [-105.8510156128051, 39.37956160554171, 0, 8.77281],
     [-105.85078201280544, 39.37948480554178, 0, 8.38942],
     [-105.8506758128056, 39.37936940554198, 0, 8.1133],
     [-105.850604

'clear creek'