###################################################################################################
# Module: core.py
# Description: Retrieve and construct spatial geometries and street networks from OpenStreetMap
# License: MIT, see full license in LICENSE.txt
# Web: https://github.com/gboeing/osmnx
###################################################################################################
import math
import re
import time
import json
import io
import os
import hashlib
import datetime as dt
import logging as lg
import requests
import numpy as np
import pandas as pd
import geopandas as gpd
import networkx as nx
from collections import OrderedDict
from itertools import groupby
from dateutil import parser as date_parser
from shapely.geometry import Point, LineString, Polygon, MultiPolygon
from shapely.ops import unary_union
from . import globals
from .utils import log, make_str, get_largest_component, great_circle_vec, get_nearest_node, geocode
from .simplify import simplify_graph
from .projection import project_geometry, project_gdf
from .stats import count_streets_per_node
def save_to_cache(url, response_json):
"""
Save an HTTP response json object to the cache.
If the request was sent to server via POST instead of GET,
then URL should be a GET-style representation of request. Users should always pass OrderedDicts instead of dicts
of parameters into request functions, so that the parameters stay in the same order each time, producing the same URL string,
and thus the same hash. Otherwise the cache will eventually contain multiple saved responses for the same
request because the URL's parameters appeared in a different order each time.
Parameters
----------
url : string
the url of the request
response_json : dict
the json response
Returns
-------
None
"""
if globals.use_cache:
if response_json is None:
log('Saved nothing to cache because response_json is None')
else:
# create the folder on the disk if it doesn't already exist
if not os.path.exists(globals.cache_folder):
os.makedirs(globals.cache_folder)
# hash the url (to make filename shorter than the often extremely long url)
filename = hashlib.md5(url.encode('utf-8')).hexdigest()
cache_path_filename = '{}/{}.json'.format(globals.cache_folder, filename)
# dump to json, and save to file
json_str = make_str(json.dumps(response_json))
with io.open(cache_path_filename, 'w', encoding='utf-8') as cache_file:
cache_file.write(json_str)
log('Saved response to cache file "{}"'.format(cache_path_filename))
def get_from_cache(url):
"""
    Retrieve an HTTP response JSON object from the cache.
Parameters
----------
url : string
the url of the request
Returns
-------
response_json : dict
"""
# if the tool is configured to use the cache
if globals.use_cache:
# determine the filename by hashing the url
filename = hashlib.md5(url.encode('utf-8')).hexdigest()
cache_path_filename = '{}/{}.json'.format(globals.cache_folder, filename)
# open the cache file for this url hash if it already exists, otherwise return None
if os.path.isfile(cache_path_filename):
response_json = json.load(io.open(cache_path_filename, encoding='utf-8'))
log('Retrieved response from cache file "{}" for URL "{}"'.format(cache_path_filename, url))
return response_json
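# Illustrative sketch of the cache round-trip (the URL is hypothetical; assumes
# globals.use_cache is True and globals.cache_folder is writable):
#
#   url = 'https://nominatim.openstreetmap.org/search?format=json&q=Berkeley'
#   save_to_cache(url, {'status': 'ok'})
#   get_from_cache(url)  # -> {'status': 'ok'}, loaded from the md5-named cache file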
def get_pause_duration(recursive_delay=5, default_duration=10):
"""
    Check the Overpass API status endpoint to determine how long to wait until the next slot is available.
Parameters
----------
recursive_delay : int
how long to wait between recursive calls if server is currently running a query
default_duration : int
        if a fatal error occurs, fall back on returning this value
Returns
-------
int
"""
try:
response = requests.get('http://overpass-api.de/api/status')
status = response.text.split('\n')[3]
status_first_token = status.split(' ')[0]
    except Exception:
# if we cannot reach the status endpoint or parse its output, log an error and return default duration
log('Unable to query http://overpass-api.de/api/status', level=lg.ERROR)
return default_duration
try:
# if first token is numeric, it's how many slots you have available - no wait required
available_slots = int(status_first_token)
pause_duration = 0
    except ValueError:
# if first token is 'Slot', it tells you when your slot will be free
if status_first_token == 'Slot':
utc_time_str = status.split(' ')[3]
utc_time = date_parser.parse(utc_time_str).replace(tzinfo=None)
pause_duration = math.ceil((utc_time - dt.datetime.utcnow()).total_seconds())
pause_duration = max(pause_duration, 1)
# if first token is 'Currently', it is currently running a query so check back in recursive_delay seconds
elif status_first_token == 'Currently':
time.sleep(recursive_delay)
pause_duration = get_pause_duration()
else:
# any other status is unrecognized - log an error and return default duration
log('Unrecognized server status: "{}"'.format(status), level=lg.ERROR)
return default_duration
return pause_duration
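# For reference, the status endpoint returns plain text whose fourth line is what
# gets parsed above. Hedged examples of what that line may look like (the exact
# wording can vary by Overpass server version):
#
#   '3 slots available now.'                                      -> pause 0 seconds
#   'Slot available after: 2017-06-08T20:22:04Z, in 57 seconds.'  -> pause until that time
#   'Currently running 2 queries ...'                             -> wait, then re-check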
def nominatim_request(params, pause_duration=1, timeout=30, error_pause_duration=180):
"""
Send a request to the Nominatim API via HTTP GET and return the JSON response.
Parameters
----------
params : dict or OrderedDict
key-value pairs of parameters
pause_duration : int
how long to pause before requests, in seconds
timeout : int
the timeout interval for the requests library
error_pause_duration : int
        how long to pause, in seconds, before re-trying the request after an error
Returns
-------
response_json : dict
"""
# prepare the Nominatim API URL and see if request already exists in the cache
url = 'https://nominatim.openstreetmap.org/search'
prepared_url = requests.Request('GET', url, params=params).prepare().url
cached_response_json = get_from_cache(prepared_url)
    if cached_response_json is not None:
# found this request in the cache, just return it instead of making a new HTTP call
return cached_response_json
else:
# if this URL is not already in the cache, pause, then request it
log('Pausing {:,.2f} seconds before making API GET request'.format(pause_duration))
time.sleep(pause_duration)
start_time = time.time()
log('Requesting {} with timeout={}'.format(prepared_url, timeout))
response = requests.get(url, params=params, timeout=timeout)
# get the response size and the domain, log result
size_kb = len(response.content) / 1000.
domain = re.findall(r'//(?s)(.*?)/', url)[0]
log('Downloaded {:,.1f}KB from {} in {:,.2f} seconds'.format(size_kb, domain, time.time()-start_time))
try:
response_json = response.json()
save_to_cache(prepared_url, response_json)
        except Exception:
            # 429 is 'too many requests' and 504 is 'gateway timeout' from server overload.
            # handle these errors by recursively calling nominatim_request until we get a valid response
if response.status_code in [429, 504]:
# pause for error_pause_duration seconds before re-trying request
log('Server at {} returned status code {} and no JSON data. Re-trying request in {:.2f} seconds.'.format(domain, response.status_code, error_pause_duration), level=lg.WARNING)
time.sleep(error_pause_duration)
response_json = nominatim_request(params=params, pause_duration=pause_duration, timeout=timeout)
# else, this was an unhandled status_code, throw an exception
else:
log('Server at {} returned status code {} and no JSON data'.format(domain, response.status_code), level=lg.ERROR)
raise Exception('Server returned no JSON data.\n{} {}\n{}'.format(response, response.reason, response.text))
return response_json
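# Illustrative usage sketch (the place name is hypothetical): build the params as
# an OrderedDict so the prepared URL, and hence the cache key, is stable:
#
#   params = OrderedDict()
#   params['format'] = 'json'
#   params['limit'] = 1
#   params['q'] = 'Piedmont, California, USA'
#   response_json = nominatim_request(params=params)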
def overpass_request(data, pause_duration=None, timeout=180, error_pause_duration=None):
"""
Send a request to the Overpass API via HTTP POST and return the JSON response.
Parameters
----------
data : dict or OrderedDict
key-value pairs of parameters to post to the API
pause_duration : int
        how long to pause in seconds before requests; if None, will query the API status endpoint to find when the next slot is available
timeout : int
the timeout interval for the requests library
error_pause_duration : int
        how long to pause, in seconds, before re-trying the request after an error
Returns
-------
dict
"""
# define the Overpass API URL, then construct a GET-style URL as a string to hash to look up/save to cache
url = 'http://www.overpass-api.de/api/interpreter'
prepared_url = requests.Request('GET', url, params=data).prepare().url
cached_response_json = get_from_cache(prepared_url)
    if cached_response_json is not None:
# found this request in the cache, just return it instead of making a new HTTP call
return cached_response_json
else:
# if this URL is not already in the cache, pause, then request it
        if pause_duration is None:
            this_pause_duration = get_pause_duration()
        else:
            this_pause_duration = pause_duration
log('Pausing {:,.2f} seconds before making API POST request'.format(this_pause_duration))
time.sleep(this_pause_duration)
start_time = time.time()
log('Posting to {} with timeout={}, "{}"'.format(url, timeout, data))
response = requests.post(url, data=data, timeout=timeout)
# get the response size and the domain, log result
size_kb = len(response.content) / 1000.
domain = re.findall(r'//(?s)(.*?)/', url)[0]
log('Downloaded {:,.1f}KB from {} in {:,.2f} seconds'.format(size_kb, domain, time.time()-start_time))
try:
response_json = response.json()
if 'remark' in response_json:
                log('Server remark: "{}"'.format(response_json['remark']), level=lg.WARNING)
save_to_cache(prepared_url, response_json)
        except Exception:
            # 429 is 'too many requests' and 504 is 'gateway timeout' from server overload.
            # handle these errors by recursively calling overpass_request until we get a valid response
if response.status_code in [429, 504]:
# pause for error_pause_duration seconds before re-trying request
if error_pause_duration is None:
error_pause_duration = get_pause_duration()
log('Server at {} returned status code {} and no JSON data. Re-trying request in {:.2f} seconds.'.format(domain, response.status_code, error_pause_duration), level=lg.WARNING)
time.sleep(error_pause_duration)
response_json = overpass_request(data=data, pause_duration=pause_duration, timeout=timeout)
# else, this was an unhandled status_code, throw an exception
else:
log('Server at {} returned status code {} and no JSON data'.format(domain, response.status_code), level=lg.ERROR)
raise Exception('Server returned no JSON data.\n{} {}\n{}'.format(response, response.reason, response.text))
return response_json
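# Illustrative usage sketch (bbox values are hypothetical): the Overpass QL string
# is passed as the 'data' form field, mirroring how osm_net_download calls this below:
#
#   query = '[out:json][timeout:180];(way["highway"](37.78,-122.30,37.81,-122.25);>;);out;'
#   response_json = overpass_request(data={'data': query}, timeout=180)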
def osm_polygon_download(query, limit=1, polygon_geojson=1, pause_duration=1):
"""
Geocode a place and download its boundary geometry from OSM's Nominatim API.
Parameters
----------
query : string or dict
query string or structured query dict to geocode/download
limit : int
max number of results to return
polygon_geojson : int
request the boundary geometry polygon from the API, 0=no, 1=yes
pause_duration : int
time in seconds to pause before API requests
Returns
-------
dict
"""
# define the parameters
params = OrderedDict()
params['format'] = 'json'
params['limit'] = limit
params['dedupe'] = 0 #this prevents OSM from de-duping results so we're guaranteed to get precisely 'limit' number of results
params['polygon_geojson'] = polygon_geojson
# add the structured query dict (if provided) to params, otherwise query with place name string
if isinstance(query, str):
params['q'] = query
elif isinstance(query, dict):
# add the query keys in alphabetical order so the URL is the same string each time, for caching purposes
for key in sorted(list(query.keys())):
params[key] = query[key]
else:
raise ValueError('query must be a dict or a string')
# request the URL, return the JSON
response_json = nominatim_request(params=params, timeout=30)
return response_json
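# Illustrative usage sketch (queries are hypothetical): both forms hit the same
# Nominatim endpoint, with dict keys sorted so the URL is cache-stable:
#
#   response_json = osm_polygon_download('Berkeley, California, USA')
#   response_json = osm_polygon_download({'city': 'Berkeley',
#                                         'state': 'California',
#                                         'country': 'USA'})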
def gdf_from_place(query, gdf_name=None, which_result=1, buffer_dist=None):
"""
Create a GeoDataFrame from a single place name query.
Parameters
----------
query : string or dict
query string or structured query dict to geocode/download
gdf_name : string
name attribute metadata for GeoDataFrame (this is used to save shapefile later)
which_result : int
        max number of geocoding results to request, and which result (1-indexed) to process
buffer_dist : float
distance to buffer around the place geometry, in meters
Returns
-------
GeoDataFrame
"""
# if no gdf_name is passed, just use the query
assert (isinstance(query, dict) or isinstance(query, str)), 'query must be a dict or a string'
if (gdf_name is None) and isinstance(query, dict):
gdf_name = ', '.join(list(query.values()))
elif (gdf_name is None) and isinstance(query, str):
gdf_name = query
# get the data from OSM
data = osm_polygon_download(query, limit=which_result)
if len(data) >= which_result:
# extract data elements from the JSON response
result = data[which_result - 1]
bbox_south, bbox_north, bbox_west, bbox_east = [float(x) for x in result['boundingbox']]
geometry = result['geojson']
place = result['display_name']
features = [{'type': 'Feature',
'geometry': geometry,
'properties': {'place_name': place,
'bbox_north': bbox_north,
'bbox_south': bbox_south,
'bbox_east': bbox_east,
'bbox_west': bbox_west}}]
# if we got an unexpected geometry type (like a point), log a warning
if geometry['type'] not in ['Polygon', 'MultiPolygon']:
log('OSM returned a {} as the geometry.'.format(geometry['type']), level=lg.WARNING)
# create the GeoDataFrame, name it, and set its original CRS to lat-long (EPSG 4326)
gdf = gpd.GeoDataFrame.from_features(features)
gdf.gdf_name = gdf_name
gdf.crs = {'init':'epsg:4326'}
# if buffer_dist was passed in, project the geometry to UTM, buffer it in meters, then project it back to lat-long
        if buffer_dist is not None:
gdf_utm = project_gdf(gdf)
gdf_utm['geometry'] = gdf_utm['geometry'].buffer(buffer_dist)
gdf = project_gdf(gdf_utm, to_latlong=True)
log('Buffered the GeoDataFrame "{}" to {} meters'.format(gdf.gdf_name, buffer_dist))
# return the gdf
log('Created GeoDataFrame with {} row for query "{}"'.format(len(gdf), query))
return gdf
else:
# if there was no data returned (or fewer results than which_result specified)
log('OSM returned no results (or fewer than which_result) for query "{}"'.format(query), level=lg.WARNING)
gdf = gpd.GeoDataFrame()
gdf.gdf_name = gdf_name
return gdf
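# Illustrative usage sketch (the place name is hypothetical): geocode a place,
# buffer its boundary by 500 meters, and grab the resulting (Multi)Polygon:
#
#   gdf = gdf_from_place('Berkeley, California, USA', buffer_dist=500)
#   polygon = gdf['geometry'].iloc[0]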
def gdf_from_places(queries, gdf_name='unnamed', buffer_dist=None):
"""
Create a GeoDataFrame from a list of place names to query.
Parameters
----------
queries : list
list of query strings or structured query dicts to geocode/download, one at a time
gdf_name : string
name attribute metadata for GeoDataFrame (this is used to save shapefile later)
buffer_dist : float
distance to buffer around the place geometry, in meters
Returns
-------
GeoDataFrame
"""
# create an empty GeoDataFrame then append each result as a new row
gdf = gpd.GeoDataFrame()
for query in queries:
gdf = gdf.append(gdf_from_place(query, buffer_dist=buffer_dist))
# reset the index, name the GeoDataFrame
gdf = gdf.reset_index().drop(labels='index', axis=1)
gdf.gdf_name = gdf_name
# set the original CRS of the GeoDataFrame to lat-long, and return it
gdf.crs = {'init':'epsg:4326'}
log('Finished creating GeoDataFrame with {} rows from {} queries'.format(len(gdf), len(queries)))
return gdf
def get_osm_filter(network_type):
"""
Create a filter to query OSM for the specified network type.
Parameters
----------
network_type : string
{'walk', 'bike', 'drive', 'drive_service', 'all', 'all_private'} what type of street network to get
Returns
-------
string
"""
filters = {}
# driving: filter out un-drivable roads, service roads, private ways, and anything specifying motor=no
# also filter out any non-service roads that are tagged as providing parking, driveway, private, or emergency-access services
filters['drive'] = ('["area"!~"yes"]["highway"!~"cycleway|footway|path|pedestrian|steps|track|'
'proposed|construction|bridleway|abandoned|platform|raceway|service"]'
'["motor_vehicle"!~"no"]["motorcar"!~"no"]["access"!~"private"]'
'["service"!~"parking|parking_aisle|driveway|private|emergency_access"]')
# drive+service: allow ways tagged 'service' but filter out certain types of service ways
filters['drive_service'] = ('["area"!~"yes"]["highway"!~"cycleway|footway|path|pedestrian|steps|track|'
'proposed|construction|bridleway|abandoned|platform|raceway"]'
'["motor_vehicle"!~"no"]["motorcar"!~"no"]["access"!~"private"]'
'["service"!~"parking|parking_aisle|private|emergency_access"]')
    # walking: filter out cycle ways, motor ways, private ways, and anything specifying foot=no
    # allow service roads, permitting things like parking lot lanes, alleys, etc. that you *can* walk on even if they're not exactly nice walks
filters['walk'] = ('["area"!~"yes"]["highway"!~"cycleway|motor|proposed|construction|abandoned|platform|raceway"]'
'["foot"!~"no"]["service"!~"private"]["access"!~"private"]')
# biking: filter out foot ways, motor ways, private ways, and anything specifying biking=no
filters['bike'] = ('["area"!~"yes"]["highway"!~"footway|motor|proposed|construction|abandoned|platform|raceway"]'
'["bicycle"!~"no"]["service"!~"private"]["access"!~"private"]')
# to download all ways, just filter out everything not currently in use or that is private-access only
filters['all'] = ('["area"!~"yes"]["highway"!~"proposed|construction|abandoned|platform|raceway"]'
'["service"!~"private"]["access"!~"private"]')
# to download all ways, including private-access ones, just filter out everything not currently in use
filters['all_private'] = '["area"!~"yes"]["highway"!~"proposed|construction|abandoned|platform|raceway"]'
if network_type in filters:
osm_filter = filters[network_type]
else:
raise ValueError('unknown network_type "{}"'.format(network_type))
return osm_filter
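# Illustrative sketch: the returned filter is a string of Overpass tag tests that
# osm_net_download interpolates directly after way["highway"], e.g.:
#
#   osm_filter = get_osm_filter('drive')
#   # -> '["area"!~"yes"]["highway"!~"cycleway|footway|...|service"]...'
#   # so the final query contains: way["highway"]["area"!~"yes"]...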
def osm_net_download(polygon=None, north=None, south=None, east=None, west=None, network_type='all_private', timeout=180, memory=None, max_query_area_size=50*1000*50*1000):
"""
Download OSM ways and nodes within some bounding box from the Overpass API.
Parameters
----------
polygon : shapely Polygon or MultiPolygon
geographic shape to fetch the street network within
north : float
northern latitude of bounding box
south : float
southern latitude of bounding box
east : float
eastern longitude of bounding box
west : float
western longitude of bounding box
network_type : string
{'walk', 'bike', 'drive', 'drive_service', 'all', 'all_private'} what type of street network to get
timeout : int
the timeout interval for requests and to pass to API
memory : int
        server memory allocation size for the query, in bytes. If None, server will use its default allocation size
max_query_area_size : float
max area for any part of the geometry, in the units the geometry is in: any polygon bigger will get divided up
for multiple queries to API (default is 50,000 * 50,000 units (ie, 50km x 50km in area, if units are meters))
Returns
-------
    list
"""
    # check if we're querying by polygon or by bounding box based on which argument(s) were passed into this function
    by_poly = polygon is not None
by_bbox = not (north is None or south is None or east is None or west is None)
if not (by_poly or by_bbox):
raise ValueError('You must pass a polygon or north, south, east, and west')
# create a filter to exclude certain kinds of ways based on the requested network_type
osm_filter = get_osm_filter(network_type)
response_jsons = []
# pass server memory allocation in bytes for the query to the API
# if None, pass nothing so the server will use its default allocation size
# otherwise, define the query's maxsize parameter value as whatever the caller passed in
if memory is None:
maxsize = ''
else:
maxsize = '[maxsize:{}]'.format(memory)
# define the query to send the API
# specifying way["highway"] means that all ways returned must have a highway key. the {filters} then remove ways by key/value.
# the '>' makes it recurse so we get ways and way nodes. maxsize is in bytes.
if by_bbox:
# turn bbox into a polygon and project to local UTM
polygon = Polygon([(west, south), (east, south), (east, north), (west, north)])
geometry_proj, crs_proj = project_geometry(polygon)
# subdivide it if it exceeds the max area size (in meters), then project back to lat-long
geometry_proj_consolidated_subdivided = consolidate_subdivide_geometry(geometry_proj, max_query_area_size=max_query_area_size)
geometry, crs = project_geometry(geometry_proj_consolidated_subdivided, crs=crs_proj, to_latlong=True)
log('Requesting network data within bounding box from API in {:,} request(s)'.format(len(geometry)))
start_time = time.time()
# loop through each polygon rectangle in the geometry (there will only be one if original bbox didn't exceed max area size)
for poly in geometry:
# represent bbox as south,west,north,east and round lat-longs to 8 decimal places (ie, within 1 mm) so URL strings aren't different due to float rounding issues (for consistent caching)
west, south, east, north = poly.bounds
query_template = '[out:json][timeout:{timeout}]{maxsize};(way["highway"]{filters}({south:.8f},{west:.8f},{north:.8f},{east:.8f});>;);out;'
query_str = query_template.format(north=north, south=south, east=east, west=west, filters=osm_filter, timeout=timeout, maxsize=maxsize)
response_json = overpass_request(data={'data':query_str}, timeout=timeout)
response_jsons.append(response_json)
log('Got all network data within bounding box from API in {:,} request(s) and {:,.2f} seconds'.format(len(geometry), time.time()-start_time))
elif by_poly:
# project to utm, divide polygon up into sub-polygons if area exceeds a max size (in meters), project back to lat-long, then get a list of polygon(s) exterior coordinates
geometry_proj, crs_proj = project_geometry(polygon)
geometry_proj_consolidated_subdivided = consolidate_subdivide_geometry(geometry_proj, max_query_area_size=max_query_area_size)
geometry, crs = project_geometry(geometry_proj_consolidated_subdivided, crs=crs_proj, to_latlong=True)
polygon_coord_strs = get_polygons_coordinates(geometry)
log('Requesting network data within polygon from API in {:,} request(s)'.format(len(polygon_coord_strs)))
start_time = time.time()
# pass each polygon exterior coordinates in the list to the API, one at a time
for polygon_coord_str in polygon_coord_strs:
query_template = '[out:json][timeout:{timeout}]{maxsize};(way["highway"]{filters}(poly:"{polygon}");>;);out;'
query_str = query_template.format(polygon=polygon_coord_str, filters=osm_filter, timeout=timeout, maxsize=maxsize)
response_json = overpass_request(data={'data':query_str}, timeout=timeout)
response_jsons.append(response_json)
log('Got all network data within polygon from API in {:,} request(s) and {:,.2f} seconds'.format(len(polygon_coord_strs), time.time()-start_time))
return response_jsons
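# Illustrative usage sketch (bbox values are hypothetical): fetch the drivable
# network within a bounding box; the result is a list of JSON responses, one per
# sub-polygon the bbox was divided into:
#
#   response_jsons = osm_net_download(north=37.81, south=37.78,
#                                     east=-122.25, west=-122.29,
#                                     network_type='drive')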
def consolidate_subdivide_geometry(geometry, max_query_area_size):
"""
Consolidate a geometry into a convex hull, then subdivide it into smaller sub-polygons if its area exceeds max size (in geometry's units).
Parameters
----------
geometry : shapely Polygon or MultiPolygon
the geometry to consolidate and subdivide
max_query_area_size : float
max area for any part of the geometry, in the units the geometry is in.
any polygon bigger will get divided up for multiple queries to API (
default is 50,000 * 50,000 units (ie, 50km x 50km in area, if units are meters))
Returns
-------
geometry : Polygon or MultiPolygon
"""
# let the linear length of the quadrats (with which to subdivide the geometry) be the square root of max area size
quadrat_width = math.sqrt(max_query_area_size)
if not isinstance(geometry, (Polygon, MultiPolygon)):
raise ValueError('Geometry must be a shapely Polygon or MultiPolygon')
# if geometry is a MultiPolygon OR a single Polygon whose area exceeds the max size, get the convex hull around the geometry
if isinstance(geometry, MultiPolygon) or (isinstance(geometry, Polygon) and geometry.area > max_query_area_size):
geometry = geometry.convex_hull
# if geometry area exceeds max size, subdivide it into smaller sub-polygons
if geometry.area > max_query_area_size:
geometry = quadrat_cut_geometry(geometry, quadrat_width=quadrat_width)
if isinstance(geometry, Polygon):
geometry = MultiPolygon([geometry])
return geometry
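# Illustrative sketch (projected coordinates in meters; values hypothetical):
# a 200km x 200km square exceeds the default 50km x 50km max area size, so it
# gets cut into roughly a 4 x 4 grid of sub-polygons:
#
#   big = Polygon([(0, 0), (200000, 0), (200000, 200000), (0, 200000)])
#   pieces = consolidate_subdivide_geometry(big, max_query_area_size=50000*50000)
#   # pieces is a MultiPolygon of ~16 sub-squares, each no bigger than 50km x 50km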
def get_polygons_coordinates(geometry):
"""
Extract exterior coordinates from polygon(s) to pass to OSM in a query by polygon.
Parameters
----------
geometry : shapely Polygon or MultiPolygon
the geometry to extract exterior coordinates from
Returns
-------
polygon_coord_strs : list
"""
# extract the exterior coordinates of the geometry to pass to the API later
polygons_coords = []
if isinstance(geometry, Polygon):
x, y = geometry.exterior.xy
polygons_coords.append(list(zip(x, y)))
elif isinstance(geometry, MultiPolygon):
for polygon in geometry:
x, y = polygon.exterior.xy
polygons_coords.append(list(zip(x, y)))
else:
raise ValueError('Geometry must be a shapely Polygon or MultiPolygon')
# convert the exterior coordinates of the polygon(s) to the string format the API expects
polygon_coord_strs = []
for coords in polygons_coords:
s = ''
separator = ' '
for coord in list(coords):
# round floating point lats and longs to 14 places, so we can hash and cache strings consistently
s = '{}{}{:.14f}{}{:.14f}'.format(s, separator, coord[1], separator, coord[0])
polygon_coord_strs.append(s.strip(separator))
return polygon_coord_strs
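# Illustrative sketch (coordinates hypothetical): note the output swaps x/y into
# the 'lat lon lat lon ...' ordering that Overpass's poly filter expects:
#
#   triangle = Polygon([(-122.30, 37.78), (-122.25, 37.78), (-122.25, 37.81)])
#   coord_strs = get_polygons_coordinates(triangle)
#   # coord_strs[0] begins '37.78000000000000 -122.30000000000000 ...'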
def get_node(element):
"""
Convert an OSM node element into the format for a networkx node.
Parameters
----------
element : dict
an OSM node element
Returns
-------
dict
"""
node = {}
node['y'] = element['lat']
node['x'] = element['lon']
node['osmid'] = element['id']
if 'tags' in element:
for useful_tag in globals.useful_tags_node:
if useful_tag in element['tags']:
node[useful_tag] = element['tags'][useful_tag]
return node
def get_path(element):
"""
Convert an OSM way element into the format for a networkx graph path.
Parameters
----------
element : dict
an OSM way element
Returns
-------
dict
"""
path = {}
path['osmid'] = element['id']
# remove any consecutive duplicate elements in the list of nodes
grouped_list = groupby(element['nodes'])
path['nodes'] = [group[0] for group in grouped_list]
if 'tags' in element:
for useful_tag in globals.useful_tags_path:
if useful_tag in element['tags']:
path[useful_tag] = element['tags'][useful_tag]
return path
def parse_osm_nodes_paths(osm_data):
"""
Construct dicts of nodes and paths with key=osmid and value=dict of attributes.
Parameters
----------
osm_data : dict
JSON response from from the Overpass API
Returns
-------
nodes, paths : tuple
"""
nodes = {}
paths = {}
for element in osm_data['elements']:
if element['type'] == 'node':
key = element['id']
nodes[key] = get_node(element)
elif element['type'] == 'way': #osm calls network paths 'ways'
key = element['id']
paths[key] = get_path(element)
return nodes, paths
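# Illustrative sketch of the expected element structure (IDs are hypothetical):
#
#   osm_data = {'elements': [{'type': 'node', 'id': 1, 'lat': 51.5, 'lon': -0.1},
#                            {'type': 'node', 'id': 2, 'lat': 51.6, 'lon': -0.1},
#                            {'type': 'way', 'id': 10, 'nodes': [1, 2]}]}
#   nodes, paths = parse_osm_nodes_paths(osm_data)
#   # nodes[1] == {'y': 51.5, 'x': -0.1, 'osmid': 1}
#   # paths[10] == {'osmid': 10, 'nodes': [1, 2]}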
def remove_isolated_nodes(G):
"""
Remove from a graph all the nodes that have no incident edges (ie, node degree = 0).
Parameters
----------
G : networkx multidigraph
the graph from which to remove nodes
Returns
-------
networkx multidigraph
"""
isolated_nodes = [node for node, degree in dict(G.degree()).items() if degree < 1]
G.remove_nodes_from(isolated_nodes)
log('Removed {:,} isolated nodes'.format(len(isolated_nodes)))
return G
def truncate_graph_dist(G, source_node, max_distance=1000, weight='length', retain_all=False):
"""
    Remove every node farther than some network distance from a specified node in the graph.
Parameters
----------
G : networkx multidigraph
source_node : int
the node in the graph from which to measure network distances to other nodes
max_distance : int
remove every node in the graph greater than this distance from the source_node
weight : string
how to weight the graph when measuring distance (default 'length' is how many meters long the edge is)
retain_all : bool
if True, return the entire graph even if it is not connected
Returns
-------
networkx multidigraph
"""
# get the shortest distance between the node and every other node, then remove every node further than max_distance away
start_time = time.time()
G = G.copy()
distances = nx.shortest_path_length(G, source=source_node, weight=weight)
distant_nodes = {key:value for key, value in dict(distances).items() if value > max_distance}
G.remove_nodes_from(distant_nodes.keys())
log('Truncated graph by weighted network distance in {:,.2f} seconds'.format(time.time()-start_time))
# remove any isolated nodes and retain only the largest component (if retain_all is True)
if not retain_all:
G = remove_isolated_nodes(G)
G = get_largest_component(G)
return G
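# Illustrative sketch (toy graph, edge lengths in meters): node 3 lies 1,200m
# from node 1 along the network, beyond max_distance, so it is removed:
#
#   G = nx.MultiDiGraph()
#   G.add_edge(1, 2, length=400)
#   G.add_edge(2, 3, length=800)
#   G2 = truncate_graph_dist(G, source_node=1, max_distance=1000, retain_all=True)
#   # G2 retains nodes 1 and 2 only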
def truncate_graph_bbox(G, north, south, east, west, truncate_by_edge=False, retain_all=False):
"""
Remove every node in graph that falls outside a bounding box.
    This is needed because Overpass returns entire ways (each with a single OSM ID), so a way
    with any node inside the bbox is returned whole, including its nodes that fall outside the bbox.
Parameters
----------
G : networkx multidigraph
north : float
northern latitude of bounding box
south : float
southern latitude of bounding box
east : float
eastern longitude of bounding box
west : float
western longitude of bounding box
truncate_by_edge : bool
        if True, retain a node even if it is outside the bbox, as long as at least one of its neighbors is within the bbox
retain_all : bool
if True, return the entire graph even if it is not connected
Returns
-------
networkx multidigraph
"""
start_time = time.time()
G = G.copy()
nodes_outside_bbox = []
for node, data in G.nodes(data=True):
if data['y'] > north or data['y'] < south or data['x'] > east or data['x'] < west:
# this node is outside the bounding box
if not truncate_by_edge:
# if we're not truncating by edge, add node to list of nodes outside the bounding box
nodes_outside_bbox.append(node)
else:
# if we're truncating by edge, see if any of node's neighbors are within bounding box
any_neighbors_in_bbox = False
neighbors = list(G.successors(node)) + list(G.predecessors(node))
for neighbor in neighbors:
x = G.node[neighbor]['x']
y = G.node[neighbor]['y']
if y < north and y > south and x < east and x > west:
any_neighbors_in_bbox = True
# if none of its neighbors are within the bounding box, add node to list of nodes outside the bounding box
if not any_neighbors_in_bbox:
nodes_outside_bbox.append(node)
G.remove_nodes_from(nodes_outside_bbox)
log('Truncated graph by bounding box in {:,.2f} seconds'.format(time.time()-start_time))
# remove any isolated nodes and retain only the largest component (if retain_all is True)
if not retain_all:
G = remove_isolated_nodes(G)
G = get_largest_component(G)
return G
def quadrat_cut_geometry(geometry, quadrat_width, min_num=3, buffer_amount=1e-9):
"""
Split a Polygon or MultiPolygon up into sub-polygons of a specified size, using quadrats.
Parameters
----------
geometry : shapely Polygon or MultiPolygon
the geometry to split up into smaller sub-polygons
quadrat_width : numeric
the linear width of the quadrats with which to cut up the geometry (in the units the geometry is in)
min_num : int
the minimum number of linear quadrat lines (e.g., min_num=3 would produce a quadrat grid of 4 squares)
buffer_amount : numeric
buffer the quadrat grid lines by quadrat_width times buffer_amount
Returns
-------
shapely MultiPolygon
"""
# create n evenly spaced points between the min and max x and y bounds
west, south, east, north = geometry.bounds
x_num = math.ceil((east-west) / quadrat_width) + 1
y_num = math.ceil((north-south) / quadrat_width) + 1
x_points = np.linspace(west, east, num=max(x_num, min_num))
y_points = np.linspace(south, north, num=max(y_num, min_num))
# create a quadrat grid of lines at each of the evenly spaced points
vertical_lines = [LineString([(x, y_points[0]), (x, y_points[-1])]) for x in x_points]
    horizontal_lines = [LineString([(x_points[0], y), (x_points[-1], y)]) for y in y_points]
    lines = vertical_lines + horizontal_lines
# buffer each line to distance of the quadrat width divided by 1 billion, take their union, then cut geometry into pieces by these quadrats
buffer_size = quadrat_width * buffer_amount
lines_buffered = [line.buffer(buffer_size) for line in lines]
quadrats = unary_union(lines_buffered)
multipoly = geometry.difference(quadrats)
return multipoly
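# Illustrative sketch (unit square; the width is hypothetical): three grid lines
# in each direction cut the square into a 2 x 2 grid of sub-polygons:
#
#   square = Polygon([(0, 0), (1, 0), (1, 1), (0, 1)])
#   multipoly = quadrat_cut_geometry(square, quadrat_width=0.5)
#   # multipoly contains 4 sub-squares, each ~0.5 x 0.5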
def intersect_index_quadrats(gdf, geometry, quadrat_width=0.025, min_num=3, buffer_amount=1e-9):
"""
Intersect points with a polygon, using an r-tree spatial index and cutting the polygon up into
smaller sub-polygons for r-tree acceleration.
Parameters
----------
gdf : GeoDataFrame
the set of points to intersect
geometry : shapely Polygon or MultiPolygon
the geometry to intersect with the points
quadrat_width : numeric
the linear length (in degrees) of the quadrats with which to cut up the geometry (default = 0.025, approx 2km at NYC's latitude)
min_num : int
the minimum number of linear quadrat lines (e.g., min_num=3 would produce a quadrat grid of 4 squares)
buffer_amount : numeric
buffer the quadrat grid lines by quadrat_width times buffer_amount
Returns
-------
GeoDataFrame
"""
# create an empty dataframe to append matches to
points_within_geometry = pd.DataFrame()
# cut the geometry into chunks for r-tree spatial index intersecting
multipoly = quadrat_cut_geometry(geometry, quadrat_width=quadrat_width, buffer_amount=buffer_amount)
# create an r-tree spatial index for the nodes (ie, points)
start_time = time.time()
sindex = gdf['geometry'].sindex
log('Created r-tree spatial index for {:,} points in {:,.2f} seconds'.format(len(gdf), time.time()-start_time))
# loop through each chunk of the geometry to find approximate and then precisely intersecting points
start_time = time.time()
for poly in multipoly:
        # buffer by the tiny distance to account for any space lost in the quadrat cutting, otherwise we may miss point(s) that lie directly on a quadrat line
buffer_size = quadrat_width * buffer_amount
poly = poly.buffer(buffer_size).buffer(0)
# find approximate matches with r-tree, then precise matches from those approximate ones
if poly.is_valid and poly.area > 0:
possible_matches_index = list(sindex.intersection(poly.bounds))
possible_matches = gdf.iloc[possible_matches_index]
precise_matches = possible_matches[possible_matches.intersects(poly)]
points_within_geometry = points_within_geometry.append(precise_matches)
if len(points_within_geometry) > 0:
        # drop duplicate points, in case the buffered poly caused an overlap on point(s) that lie directly on a quadrat line
points_within_geometry = points_within_geometry.drop_duplicates(subset='node')
else:
        # after simplifying the graph, and given the requested network type, there are no nodes inside the polygon - cannot create a graph from that, so raise an error
raise Exception('There are no nodes within the requested geometry')
log('Identified {:,} nodes inside polygon in {:,.2f} seconds'.format(len(points_within_geometry), time.time()-start_time))
return points_within_geometry
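# Illustrative sketch (toy data; requires the rtree package for the .sindex call):
# only the point inside the polygon survives the r-tree + precise matching:
#
#   gdf = gpd.GeoDataFrame({'node': [1, 2],
#                           'geometry': [Point(0.4, 0.4), Point(5.0, 5.0)]})
#   poly = Polygon([(0, 0), (1, 0), (1, 1), (0, 1)])
#   matches = intersect_index_quadrats(gdf, poly)  # one row, for node 1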
def truncate_graph_polygon(G, polygon, retain_all=False, truncate_by_edge=False, quadrat_width=0.025, min_num=3, buffer_amount=1e-9):
"""
Remove every node in graph that falls outside some shapely Polygon or MultiPolygon.
Parameters
----------
G : networkx multidigraph
polygon : Polygon or MultiPolygon
only retain nodes in graph that lie within this geometry
retain_all : bool
if True, return the entire graph even if it is not connected
truncate_by_edge : bool
        if True, retain a node even if it is outside the polygon, as long as at least one of
        its neighbors is within the polygon (NOT CURRENTLY IMPLEMENTED)
quadrat_width : numeric
passed on to intersect_index_quadrats: the linear length (in degrees) of the quadrats
with which to cut up the geometry (default = 0.025, approx 2km at NYC's latitude)
min_num : int
passed on to intersect_index_quadrats: the minimum number of linear quadrat lines
(e.g., min_num=3 would produce a quadrat grid of 4 squares)
buffer_amount : numeric
passed on to intersect_index_quadrats: buffer the quadrat grid lines by
quadrat_width times buffer_amount
Returns
-------
networkx multidigraph
"""
start_time = time.time()
G = G.copy()
log('Identifying all nodes that lie outside the polygon...')
# get a GeoDataFrame of all the nodes, for spatial analysis
node_geom = [Point(data['x'], data['y']) for _, data in G.nodes(data=True)]
gdf_nodes = gpd.GeoDataFrame({'node':pd.Series(G.nodes()), 'geometry':node_geom})
gdf_nodes.crs = G.graph['crs']
# find all the nodes in the graph that lie outside the polygon
points_within_geometry = intersect_index_quadrats(gdf_nodes, polygon, quadrat_width=quadrat_width, min_num=min_num, buffer_amount=buffer_amount)
nodes_outside_polygon = gdf_nodes[~gdf_nodes.index.isin(points_within_geometry.index)]
# now remove from the graph all those nodes that lie outside the place polygon
start_time = time.time()
G.remove_nodes_from(nodes_outside_polygon['node'])
log('Removed {:,} nodes outside polygon in {:,.2f} seconds'.format(len(nodes_outside_polygon), time.time()-start_time))
# remove any isolated nodes and retain only the largest component (if retain_all is True)
if not retain_all:
G = remove_isolated_nodes(G)
G = get_largest_component(G)
return G
def add_edge_lengths(G):