In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import sys
import numpy as np

In [2]:
# Load the raw OSM shop export.
df = pd.read_csv('osm_places_shop.csv')

# `thresh` is the minimum number of non-missing values a column must have to
# be kept; here that is 3% of the row count, truncated to an integer.
threshold = int(len(df) * 0.03)
df = df.dropna(axis='columns', thresh=threshold)

In [3]:
# Show the column labels that remain after the sparse-column filter.
df.columns

Index(['id', 'lat', 'lon', 'name', 'opening_hours', 'shop', 'website',
       'addr:city', 'addr:housenumber', 'addr:postcode', 'addr:state',
       'addr:street', 'branch', 'brand', 'brand:wikidata', 'phone', 'amenity',
       'outdoor_seating', 'wheelchair', 'email', 'takeaway', 'drink:espresso',
       'cuisine', 'delivery', 'drink:coffee', 'payment:cash', 'check_date',
       'payment:credit_cards', 'level'],
      dtype='object')

In [4]:
# Show the dtype pandas inferred for each remaining column.
df.dtypes

id                        int64
lat                     float64
lon                     float64
name                     object
opening_hours            object
shop                     object
website                  object
addr:city                object
addr:housenumber         object
addr:postcode           float64
addr:state               object
addr:street              object
branch                   object
brand                    object
brand:wikidata           object
phone                    object
amenity                  object
outdoor_seating          object
wheelchair               object
email                    object
takeaway                 object
drink:espresso           object
cuisine                  object
delivery                 object
drink:coffee             object
payment:cash             object
check_date               object
payment:credit_cards     object
level                    object
dtype: object

In [5]:
# Lift pandas' display truncation for both columns and rows (None = no limit).
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [6]:
# Scratch list of the columns that are present after the sparse-column filter
# but are left out of the column selection in the next cell. The expression is
# only displayed; it is not assigned to a name or used by later cells.
['branch', 'brand', 'brand:wikidata', 'amenity',
        'takeaway', 'drink:espresso',
       'cuisine', 'delivery','payment:cash', 'check_date',
       'payment:credit_cards', 'level']

['branch',
 'brand',
 'brand:wikidata',
 'amenity',
 'takeaway',
 'drink:espresso',
 'cuisine',
 'delivery',
 'payment:cash',
 'check_date',
 'payment:credit_cards',
 'level']

In [7]:
# Narrow the frame to the columns carried through to the cleaned output.
columns_to_keep = [
    'id', 'lat', 'lon', 'name', 'opening_hours', 'shop', 'website',
    'addr:city', 'addr:housenumber', 'addr:postcode', 'addr:state',
    'addr:street', 'phone', 'outdoor_seating', 'wheelchair', 'email',
    'drink:coffee',
]
df = df[columns_to_keep]

In [8]:
# Keep only the rows that have a name.
# dropna(subset=...) checks just the 'name' column and returns the filtered
# frame directly. This replaces the earlier approach of building a filtered
# frame only to take its index labels and then dropping by label in place,
# which would also remove extra rows if index labels were ever non-unique.
df = df.dropna(subset=['name'])

In [9]:
# Store postcodes with pandas' nullable integer dtype, so that missing values
# stay missing instead of forcing the column to float.
# NOTE(review): integers cannot keep leading zeros — confirm that no postcode
# in this data starts with 0.
df['addr:postcode'] = df['addr:postcode'].astype(pd.Int64Dtype())

In [10]:
# Missing wheelchair information is recorded explicitly as 'unknown'.
wheelchair_fill = {'wheelchair': 'unknown'}
df.fillna(value=wheelchair_fill, inplace=True)

In [11]:
# Missing values in both outdoor_seating and drink:coffee are recorded
# explicitly as 'unknown'.
seating_coffee_fill = dict.fromkeys(['outdoor_seating', 'drink:coffee'], 'unknown')
df.fillna(value=seating_coffee_fill, inplace=True)

In [12]:
# Collapse every outdoor_seating value other than 'no' or 'unknown' into 'yes'.
seating_is_other = ~df['outdoor_seating'].isin(['no', 'unknown'])
df.loc[seating_is_other, 'outdoor_seating'] = 'yes'

In [13]:
# Show the column labels of the frame after the cleaning steps.
df.columns

Index(['id', 'lat', 'lon', 'name', 'opening_hours', 'shop', 'website',
       'addr:city', 'addr:housenumber', 'addr:postcode', 'addr:state',
       'addr:street', 'phone', 'outdoor_seating', 'wheelchair', 'email',
       'drink:coffee'],
      dtype='object')

In [14]:
# Preview the first rows of the cleaned frame.
df.head()

Unnamed: 0,id,lat,lon,name,opening_hours,shop,website,addr:city,addr:housenumber,addr:postcode,addr:state,addr:street,phone,outdoor_seating,wheelchair,email,drink:coffee
0,357623896,40.661101,-73.953359,Hee-Space Thrift Shop,Th-Su 12:00-18:00,charity,https://www.hee-space.com,,,,,,,unknown,unknown,,unknown
1,368053310,40.736926,-73.989601,Barnes & Noble,"Su 10:00-21:00; Mo-Th 09:00-21:00; Fr, Sa 09:0...",books,https://stores.barnesandnoble.com/store/2675,New York,33.0,10003.0,NY,East 17th Street,+1 212-253-0810,unknown,unknown,,unknown
2,368061395,40.878611,-73.917222,Knolls Shopping Center,,mall,,,,,NY,,,unknown,unknown,,unknown
3,418520887,40.636934,-74.076656,Everything Goes Book Cafe,,books,,Staten Island,208.0,,NY,Bay Street,,yes,limited,,unknown
4,419362653,40.727268,-73.990374,The Hidden Rose,Tu-Sa 11:30-20:00,tattoo,,,,,,,,unknown,unknown,,unknown


In [15]:
# Write the cleaned frame to disk; index=False leaves the row index out of the file.
cleaned_csv_path = "osm_places_shop_cleaned.csv"
df.to_csv(cleaned_csv_path, index=False)