In [2]:
from dotenv import load_dotenv
import requests
import os
import pandas as pd
import overpy

In [3]:
api = overpy.Overpass()

In [4]:
features = pd.read_csv("all_map_features.csv")
features

Unnamed: 0,key,value
0,amenity,bar
1,amenity,biergarten
2,amenity,cafe
3,amenity,fast_food
4,amenity,food_court
...,...,...
141,shop,gift
142,shop,stationery
143,shop,party
144,shop,pawnbroker


In [13]:
query_start = """
[out:json];
area(3600175905)->.searchArea;
("""
# Area 175905 = NYC, add 3600000000 to get to work on OSM turbo
query_end = """
);
out center;
"""

for key in features['key'].unique():
    query = query_start
    for value in features.loc[features['key'] == key, 'value']:
        query += f'\n   node["{key}"="{value}"](area.searchArea);'
    
    query += query_end
    print(query)
    result = api.query(query)

    rows = []
    for node in result.nodes:
        row = {"id": node.id,
            "lat": node.lat.to_eng_string(),
            "lon": node.lon.to_eng_string()} | node.tags
        rows.append(row)

    df = pd.DataFrame(rows)
    threshold = int(len(df)*0.005)
    df_cleaned = df.dropna(axis=1, thresh=threshold)
    df_cleaned.to_csv(f"osm_places_{key}.csv", index=False)
    
    


[out:json];
area(3600175905)->.searchArea;
(
   node["amenity"="bar"](area.searchArea);
   node["amenity"="biergarten"](area.searchArea);
   node["amenity"="cafe"](area.searchArea);
   node["amenity"="fast_food"](area.searchArea);
   node["amenity"="food_court"](area.searchArea);
   node["amenity"="ice_cream"](area.searchArea);
   node["amenity"="pub"](area.searchArea);
   node["amenity"="restaurant"](area.searchArea);
   node["amenity"="college"](area.searchArea);
   node["amenity"="dancing_school"](area.searchArea);
   node["amenity"="library"](area.searchArea);
   node["amenity"="surf_school"](area.searchArea);
   node["amenity"="toy_library"](area.searchArea);
   node["amenity"="research_institute"](area.searchArea);
   node["amenity"="music_school"](area.searchArea);
   node["amenity"="university"](area.searchArea);
   node["amenity"="arts_centre"](area.searchArea);
   node["amenity"="cinema"](area.searchArea);
   node["amenity"="community_centre"](area.searchArea);
   node["amen

In [60]:
query = """
[out:json];
area[name="New York"]->.searchArea;
(
  node["amenity"="pub"](area.searchArea);
  node["amenity"="bar"](area.searchArea);
  node["amenity"="biergarten"](area.searchArea);
  node["amenity"="cafe"](area.searchArea);
  node["amenity"="fast_food"](area.searchArea);
  node["amenity"="food_court"](area.searchArea);
  node["amenity"="ice_cream"](area.searchArea);
  node["amenity"="pub"](area.searchArea);
  node["amenity"="restaurant"](area.searchArea);
);
out center;
"""

In [61]:
result = api.query(query)

In [62]:
for p in dir(result.nodes[0]):
    print(p)

__annotations__
__class__
__delattr__
__dict__
__dir__
__doc__
__eq__
__format__
__ge__
__getattribute__
__getstate__
__gt__
__hash__
__init__
__init_subclass__
__le__
__lt__
__module__
__ne__
__new__
__reduce__
__reduce_ex__
__repr__
__setattr__
__sizeof__
__str__
__subclasshook__
__weakref__
_result
_type_value
attributes
from_json
from_xml
get_center_from_json
get_center_from_xml_dom
id
lat
lon
tags


In [63]:
fnode = result.nodes[0]

In [64]:
print(set(result.nodes[0].tags.keys()))

{'gnis:feature_id', 'amenity', 'name', 'ele'}


In [66]:
print(len(result.nodes))

21607


In [67]:
cols = ["id", "lat", "lon"] + list(params)
df = pd.DataFrame(columns=cols)

In [68]:
rows = []
for node in result.nodes:
    row = {"id": node.id,
           "lat": node.lat.to_eng_string(),
           "lon": node.lon.to_eng_string()} | node.tags
    rows.append(row)


In [69]:
df = pd.DataFrame(rows)

In [70]:
df.head()

Unnamed: 0,id,lat,lon,amenity,ele,gnis:feature_id,name,addr:housenumber,addr:postcode,addr:street,...,payment:cashapp,natural,payment:alipay,payment:wechat,payment:account_cards,operational_status,comedy,service:bicycle:stand,diet:macrobiotic,spoaddr:street
0,158604258,42.3958183,-75.8084643,bar,312.0,965536.0,Seebers Tavern,,,,...,,,,,,,,,,
1,175181066,42.7680859,-78.6113159,restaurant,,,Griffon Gastropub,634.0,14052.0,Main St.,...,,,,,,,,,,
2,175181076,42.767621,-78.6083001,restaurant,,,"Tony Rome's ""The Globe""",,,,...,,,,,,,,,,
3,175181087,42.7675639,-78.6092902,restaurant,,,Rick's on Main,687.0,14052.0,Main St.,...,,,,,,,,,,
4,175181195,42.7674792,-78.6086259,restaurant,,,East End Tap Room,,,,...,,,,,,,,,,


In [71]:
print(df.columns.values)

['id' 'lat' 'lon' 'amenity' 'ele' 'gnis:feature_id' 'name'
 'addr:housenumber' 'addr:postcode' 'addr:street' 'cuisine' 'website'
 'addr:city' 'addr:state' 'phone' 'created_by'
 'nysgissam:nysaddresspointid' 'air_conditioning' 'indoor_seating'
 'outdoor_seating' 'smoking' 'wheelchair' 'addr:unit' 'branch' 'brand'
 'brand:wikidata' 'official_name' 'ref' 'takeaway' 'addr:country'
 'addr:county' 'drive_through' 'internet_access' 'opening_hours'
 'alt_name' 'delivery' 'allotments' 'cocktails' 'drink:beer'
 'drink:liquor' 'drink:wine' 'lgbtq' 'source' 'sport' 'tourism' 'wikidata'
 'wikipedia' 'internet_access:fee' 'building' 'email' 'old_name' 'comment'
 'url' 'amenity_1' 'shop' 'atm' 'opening_hours:kitchen' 'contact:website'
 'contact:email' 'contact:facebook' 'contact:instagram' 'contact:phone'
 'contact:twitter' 'diet:gluten_free' 'diet:vegan' 'opening_hours:dinner'
 'opening_hours:lunch' 'reservation' 'brewery' 'diet:vegetarian'
 'payment:cash' 'payment:credit_cards' 'payment:debit_cards

In [72]:
object_columns = df.select_dtypes(['object']).columns

In [73]:
second_val = []
second_freq = []
for category in object_columns:
    # print(df[category].value_counts().index.tolist())
    second_val.append(df[category].value_counts().index.tolist()[1])
    second_freq.append(df[category].value_counts().iloc[1])

IndexError: list index out of range

In [74]:
# Find duplicate rows
print("Number of duplicate rows:", df[df.duplicated()==True].shape[0])

Number of duplicate rows: 0


In [75]:
print(df.shape)

(21607, 492)


In [95]:
threshold = int(len(df)*0.01)
df_cleaned = df.dropna(axis=1, thresh=threshold)

In [96]:
df_cleaned.shape

(21607, 67)

In [97]:
df_cleaned.to_csv("osm_places.csv")

In [91]:
nullrows = df.isna().sum()

In [92]:
nulldf

id                           0
lat                          0
lon                          0
amenity                      0
ele                      21587
                         ...  
operational_status       21606
comedy                   21606
service:bicycle:stand    21606
diet:macrobiotic         21606
spoaddr:street           21606
Length: 492, dtype: int64

In [77]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,21607.0,6334429000.0,3161955000.0,158604258.0,3577779000.0,6006320000.0,9163169000.0,11943760000.0


In [78]:
decimal = fnode.lon
decimal.to_eng_string()

'-75.8084643'