This notebook pulls all historic sites from NYC on openstreetmaps

In [1]:
from dotenv import load_dotenv
import requests
import os
import pandas as pd
import overpy

In [2]:
api = overpy.Overpass()

In [11]:
query = """
[out:json];
area[name="New York"]->.searchArea;
(
  node[historic](area.searchArea);

);
out center;
"""

In [12]:
result = api.query(query)

In [13]:
for p in dir(result.nodes[0]):
    print(p)

__annotations__
__class__
__delattr__
__dict__
__dir__
__doc__
__eq__
__format__
__ge__
__getattribute__
__getstate__
__gt__
__hash__
__init__
__init_subclass__
__le__
__lt__
__module__
__ne__
__new__
__reduce__
__reduce_ex__
__repr__
__setattr__
__sizeof__
__str__
__subclasshook__
__weakref__
_result
_type_value
attributes
from_json
from_xml
get_center_from_json
get_center_from_xml_dom
id
lat
lon
tags


In [14]:
fnode = result.nodes[0]

In [15]:
print(set(result.nodes[0].tags.keys()))

{'highway', 'historic'}


In [16]:
print(len(result.nodes))

2629


In [18]:
rows = []
for node in result.nodes:
    row = {"id": node.id,
           "lat": node.lat.to_eng_string(),
           "lon": node.lon.to_eng_string()} | node.tags
    rows.append(row)


In [19]:
df = pd.DataFrame(rows)

In [20]:
df.head()

Unnamed: 0,id,lat,lon,highway,historic,ref,ele,name,tourism,abandoned,...,operator:type,inscription_1,museum,seats,resource,image:0,noname,source:url,covered,flickr
0,42435844,40.8165065,-73.9465427,traffic_signals,yes,,,,,,...,,,,,,,,,,
1,60645024,41.3909515,-73.5976769,motorway_junction,motorway_junction,9W,,,,,...,,,,,,,,,,
2,75387391,41.9983419,-79.5109256,,boundary_stone,212,,,,,...,,,,,,,,,,
3,75387393,41.9984091,-79.4913463,,boundary_stone,211,,,,,...,,,,,,,,,,
4,75387399,41.9985819,-79.3940195,,boundary_stone,206,,,,,...,,,,,,,,,,


In [21]:
print(df.columns.values)

['id' 'lat' 'lon' 'highway' 'historic' 'ref' 'ele' 'name' 'tourism'
 'abandoned' 'abandoned:place' 'gnis:feature_id' 'place' 'wikidata'
 'wikipedia' 'ref:nrhp' 'start_date' 'artist_name' 'material' 'memorial'
 'source' 'addr:city' 'addr:state' 'website' 'leisure' 'addr:housenumber'
 'addr:street' 'note' 'amenity' 'cemetery' 'tomb' 'building' 'old_name'
 'heritage' 'heritage:operator' 'heritage:website' 'nrhp:criteria'
 'nrhp:inscription_date' 'source_ref' 'addr:postcode' 'end_date'
 'religion' 'official_name' 'description' 'inscription' 'architect'
 'artwork_type' 'website:alternate' 'opening_hours' 'air_conditioning'
 'image' 'level' 'operator' 'phone' 'maintainer' 'man_made'
 'seamark:light:character' 'seamark:light:colour' 'seamark:light:height'
 'seamark:light:period' 'seamark:type' 'landuse' 'aeroway' 'alt_name'
 'artist:wikidata' 'artist:wikipedia' 'name:en' 'name:ja' 'name:ru'
 'subject:wikidata' 'subject:wikipedia' 'height' 'railway:historic'
 'wheelchair' 'wikimedia_commons' '

In [22]:
object_columns = df.select_dtypes(['object']).columns

In [24]:
# Find duplicate rows
print("Number of duplicate rows:", df[df.duplicated()==True].shape[0])

Number of duplicate rows: 0


In [25]:
print(df.shape)

(2629, 218)


In [26]:
threshold = int(len(df)*0.01)
df_cleaned = df.dropna(axis=1, thresh=threshold)

In [27]:
df_cleaned.shape

(2629, 36)

In [29]:
df_cleaned.to_csv("osm_places_historic.csv")

In [30]:
nullrows = df.isna().sum()

In [32]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,2629.0,6158944000.0,3545231000.0,42435844.0,3348896000.0,5897337000.0,9509392000.0,11928950000.0


In [33]:
decimal = fnode.lon
decimal.to_eng_string()

'-73.9465427'