In [13]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import sys
import numpy as np

In [14]:
# Load the raw OSM leisure export.
df = pd.read_csv('osm_places_leisure.csv')

# Keep only the columns that hold a non-null value in at least 3% of the rows
# (the row count is scaled by 0.03 and truncated to an integer).
threshold = int(len(df) * 0.03)
df = df.dropna(axis='columns', thresh=threshold)

In [15]:
# Inspect which columns survived the sparse-column filter.
df.columns

Index(['id', 'lat', 'lon', 'ele', 'gnis:feature_id', 'leisure', 'name',
       'wikidata', 'addr:city', 'addr:postcode', 'addr:state', 'operator',
       'opening_hours', 'website', 'addr:street', 'addr:housenumber',
       'dance:teaching', 'amenity', 'drink:beer', 'phone', 'toilets',
       'wheelchair', 'tourism', 'cuisine', 'email', 'access', 'garden:type'],
      dtype='object')

In [16]:
# Lift pandas' display truncation so frames are rendered with every column and row.
for option_name in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option_name, None)

In [17]:
# Narrow the frame to the identity, location, address and contact fields.
# 'access' is kept here because the private-place filter further down reads it.
kept_columns = [
    'id',
    'lat',
    'lon',
    'leisure',
    'name',
    'addr:city',
    'addr:postcode',
    'addr:state',
    'opening_hours',
    'website',
    'addr:street',
    'addr:housenumber',
    'phone',
    'wheelchair',
    'email',
    'access',
]
df = df[kept_columns]

In [18]:
# Discard every place that has no name (rows are removed in place).
unnamed_rows = df.index[df['name'].isna()]
df.drop(index=unnamed_rows, inplace=True)

In [19]:
# Remove places tagged access == 'private' and discard the 'access' column
# itself, both in a single in-place drop.
private_rows = df.index[df['access'] == 'private']
df.drop(index=private_rows, columns='access', inplace=True)

In [20]:
# Label places with no wheelchair information as 'unknown'.
# NOTE(review): saved outputs elsewhere in this notebook still show 'no' for
# these rows, which suggests they predate this fill value - re-run to refresh.
df['wheelchair'] = df['wheelchair'].fillna('unknown')

In [21]:
# Confirm the remaining columns now that 'access' has been dropped.
df.columns

Index(['id', 'lat', 'lon', 'leisure', 'name', 'addr:city', 'addr:postcode',
       'addr:state', 'opening_hours', 'website', 'addr:street',
       'addr:housenumber', 'phone', 'wheelchair', 'email'],
      dtype='object')

In [22]:
# Preview the first rows of the frame after the cleaning steps above.
df.head()

Unnamed: 0,id,lat,lon,leisure,name,addr:city,addr:postcode,addr:state,opening_hours,website,addr:street,addr:housenumber,phone,wheelchair,email
0,357545993,40.873155,-73.914025,park,Baker Field,,,,,,,,,no,
1,357562455,40.548161,-74.126253,park,Great Kills Park,,,,,,,,,no,
2,357570943,40.705658,-73.936249,park,Ramirez Playground,,,,,,,,,no,
3,357573219,40.596493,-74.171532,park,New Springville Park,,,,,,,,,no,
4,357594521,40.701492,-73.852913,park,Victory Field,,,,,,,,,no,


In [23]:
# Store postcodes with pandas' nullable integer dtype so that missing values
# stay <NA> instead of forcing the column to float.
# NOTE(review): integer postcodes cannot carry leading zeros (e.g. '07030') -
# confirm the data contains none, or keep the column as text instead.
df = df.astype({'addr:postcode': 'Int64'})

In [27]:
# List the places that have no postcode.
missing_postcode = df['addr:postcode'].isna()
df[missing_postcode]

Unnamed: 0,id,lat,lon,leisure,name,addr:city,addr:postcode,addr:state,opening_hours,website,addr:street,addr:housenumber,phone,wheelchair,email
0,357545993,40.873155,-73.914025,park,Baker Field,,,,,,,,,no,
1,357562455,40.548161,-74.126253,park,Great Kills Park,,,,,,,,,no,
2,357570943,40.705658,-73.936249,park,Ramirez Playground,,,,,,,,,no,
3,357573219,40.596493,-74.171532,park,New Springville Park,,,,,,,,,no,
4,357594521,40.701492,-73.852913,park,Victory Field,,,,,,,,,no,
5,357599530,40.849108,-73.822901,park,Rice Memorial Stadium,,,,,,,,,no,
6,357607564,40.728436,-73.830412,park,Meadow Park,,,,,,,,,no,
7,357607736,40.889266,-73.902358,park,Lower Forecourt,,,,,,,,,no,
8,357607739,40.889544,-73.90208,park,Upper Forecourt,,,,,,,,,no,
9,357607904,40.706511,-74.003373,park,South Street Seaport Historic District,,,,,,,,,no,


In [24]:
# Write the cleaned table to disk; index=False leaves the row index out of the file.
df.to_csv("osm_places_leisure_cleaned.csv", index=False)