# NEW YORK DATA

In [2]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

..........
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    geographiclib: 1.49-py_0   conda-forge
    geopy:         1.18.1-py_0 conda-forge

geographiclib- 100% |################################| Time: 0:00:00  21.78 MB/s
geopy-1.18.1-p 100% |################################| Time: 0:00:00  36.21 MB/s
Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    altair:  2.2.2-py35_1 conda-forge
    branca:  0.3.1-py_0   conda-forge
    folium:  0.5.0-py_0   conda-forge
    vincent: 0.4.4-py_1   conda-forge

altair-2.2.2-p 100% |################################| Time: 0:00:00  48.14 MB/s
branca-0.3.1-p 100% |################################| Time: 0:00:00  35.93 MB/s
vincent-0.4.4- 100% |################################| Time: 0:00:00 

In [3]:
!wget -q -O 'newyork_data.json' https://cocl.us/new_york_dataset
print('Data downloaded!')

Data downloaded!


In [4]:
with open('newyork_data.json') as json_data:
    newyork_data = json.load(json_data)

In [5]:
neighborhoods_data = newyork_data['features']

In [6]:
# define the dataframe columns
column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude'] 

# instantiate the dataframe
neighborhoods = pd.DataFrame(columns=column_names)

In [7]:
for data in neighborhoods_data:
    borough = neighborhood_name = data['properties']['borough'] 
    neighborhood_name = data['properties']['name']
        
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_lat = neighborhood_latlon[1]
    neighborhood_lon = neighborhood_latlon[0]
    
    neighborhoods = neighborhoods.append({'Borough': borough,
                                          'Neighborhood': neighborhood_name,
                                          'Latitude': neighborhood_lat,
                                          'Longitude': neighborhood_lon}, ignore_index=True)

In [8]:
address = 'New York City, NY'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of New York City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of New York City are 40.7308619, -73.9871558.


In [78]:
# CLIENT_ID = 'your-client-ID' # your Foursquare ID
# CLIENT_SECRET = 'your-client-secret' # your Foursquare Secret
# VERSION = '20180605' # Foursquare API version

In [11]:
neighborhoods.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Bronx,Wakefield,40.894705,-73.847201
1,Bronx,Co-op City,40.874294,-73.829939
2,Bronx,Eastchester,40.887556,-73.827806
3,Bronx,Fieldston,40.895437,-73.905643
4,Bronx,Riverdale,40.890834,-73.912585


In [17]:
radius = 250
LIMIT = 50
n = len(neighborhoods)
filtered_columns = ['Neighborhood', 'Borough', 'venue.id', 'venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
df_newyork_venues = pd.DataFrame([], columns = filtered_columns)
for i in range(0, n):
    lati = neighborhoods.loc[i, 'Latitude']
    long = neighborhoods.loc[i, 'Longitude']
    url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, lati, long, VERSION, radius, LIMIT)
    results = requests.get(url).json()
    
    try:
        items = results['response']['groups'][0]['items']
        dataframe = json_normalize(items)
        m = len(dataframe)
        dataframe['Neighborhood'] = pd.DataFrame({'Neighborhood': [neighborhoods.loc[i, 'Neighborhood']] * m})
        dataframe['Borough'] = pd.DataFrame({'Borough': [neighborhoods.loc[i, 'Borough']] * m})
        dataframe['venue.categories'] = dataframe['venue.categories'].apply(lambda x: x[0]['name'])
        dataframe_fil = dataframe.loc[:, filtered_columns]
        df_newyork_venues = df_newyork_venues.append(dataframe_fil)
    except KeyError:
        pass

In [22]:
df_newyork_venues = df_newyork_venues.reset_index(drop=True)
# len(df_newyork_venues) = 3603
df_newyork_venues.head()

Unnamed: 0,Neighborhood,Borough,venue.id,venue.name,venue.categories,venue.location.lat,venue.location.lng
0,Wakefield,Bronx,4c537892fd2ea593cb077a28,Lollipops Gelato,Dessert Shop,40.894123,-73.845892
1,Wakefield,Bronx,4e440828a809d4ed1bb7059f,Pitman Deli,Food,40.894149,-73.845748
2,Co-op City,Bronx,4d2cfa5cad25224bbbc5fb8f,Capri II Pizza,Pizza Place,40.876374,-73.82994
3,Co-op City,Bronx,4be2b79d660ec9284d04ca3b,Townhouse Restaurant,Restaurant,40.876086,-73.828868
4,Co-op City,Bronx,4c59b7b72fa89c744fc81323,MTA MaBSTOA Bus Bx28 / Bx30 / Bx38 / MTA Bus B...,Bus Station,40.875943,-73.829404


In [28]:
nyc_count_borough = df_newyork_venues.groupby(['Borough']).count()['venue.categories']
nyc_count.head()

Borough
Bronx             387
Brooklyn          933
Manhattan        1306
Queens            682
Staten Island     295
Name: venue.categories, dtype: int64

In [71]:
countCategoriesPerNeighborhood_nyc = df_newyork_venues.groupby(['Neighborhood','venue.categories']).count()['venue.name'].reset_index()

list_Neighborhood_nyc = countCategoriesPerNeighborhood_nyc['Neighborhood'].unique()
y_nyc = pd.DataFrame(list_Neighborhood_nyc, columns = ['Neighborhood'])

list_venueCategories_nyc = countCategoriesPerNeighborhood_nyc['venue.categories'].unique()
df_allVenueCategories_nyc = pd.DataFrame(list_venueCategories_nyc, columns = ['venue.categories'])

X_nyc = pd.DataFrame([])

for Neighborhood in list_Neighborhood_nyc:
    df_CategoriesPerNeighborhood_nyc = countCategoriesPerNeighborhood_nyc[countCategoriesPerNeighborhood_nyc['Neighborhood'] == Neighborhood][['venue.categories','venue.name']]
    df_CategoriesPerNeighborhood_nyc = df_CategoriesPerNeighborhood_nyc.append(pd.DataFrame({'venue.categories': list(set(df_allVenueCategories_nyc['venue.categories'])-set(df_CategoriesPerNeighborhood_nyc['venue.categories'])), 'venue.name': [0] * len(list(set(df_allVenueCategories_nyc['venue.categories'])-set(df_CategoriesPerNeighborhood_nyc['venue.categories'])))})).reset_index(drop=True)
    df_mapping = pd.merge(df_allVenueCategories_nyc, df_CategoriesPerNeighborhood_nyc, on = 'venue.categories', how = 'left')
    X_nyc = X_nyc.append(df_mapping.T.iloc[1])
    
X_nyc = X_nyc.reset_index(drop = True)
X_nyc.columns = list_venueCategories_nyc
X_nyc

Unnamed: 0,Bus Station,Chinese Restaurant,Deli / Bodega,Discount Store,Donut Shop,Electronics Store,Fried Chicken Joint,Martial Arts Dojo,Pharmacy,Pizza Place,Spa,Supermarket,Bus Stop,Liquor Store,Beach,Bed & Breakfast,Playground,Brazilian Restaurant,Coffee Shop,Fast Food Restaurant,Gourmet Shop,Italian Restaurant,Lounge,Middle Eastern Restaurant,Plaza,Sculpture Garden,Bakery,Food,Laundromat,Shopping Mall,Diner,Halal Restaurant,Hookah Bar,Ice Cream Shop,Rental Car Location,Restaurant,American Restaurant,BBQ Joint,Boutique,Burger Joint,Burrito Place,Cupcake Shop,Department Store,Dog Run,Food Court,Food Truck,Gastropub,Gym,Men's Store,Park,Performing Arts Venue,Salad Place,Sandwich Place,Smoke Shop,Sushi Restaurant,Wine Bar,Women's Store,Bookstore,Caucasian Restaurant,Greek Restaurant,Grocery Store,Juice Bar,Mediterranean Restaurant,Optical Shop,Pool Hall,Seafood Restaurant,Clothing Store,Mobile Phone Shop,Residential Building (Apartment / Condo),Steakhouse,Tennis Court,Bank,Breakfast Spot,Spanish Restaurant,Asian Restaurant,Café,Health & Beauty Service,Indian Restaurant,Shipping Store,Vietnamese Restaurant,Convenience Store,Dessert Shop,Garden Center,Gym / Fitness Center,Lawyer,Yoga Studio,Bar,Piano Bar,Dumpling Restaurant,Moving Target,Russian Restaurant,Hotel,Movie Theater,Antique Shop,Cocktail Bar,French Restaurant,Furniture / Home Store,Japanese Restaurant,Kids Store,Paper / Office Supplies Store,Thrift / Vintage Store,Monument / Landmark,Pet Store,Cosmetics Shop,Food & Drink Shop,Korean Restaurant,Other Great Outdoors,Varenyky restaurant,Harbor / Marina,Hobby Shop,Other Nightlife,Intersection,Metro Station,Bagel Shop,Chocolate Shop,Eastern European Restaurant,Flower Shop,History Museum,Mattress Store,Mexican Restaurant,Pilates Studio,Salon / Barbershop,Thai Restaurant,Wine Shop,Recording Studio,Trail,Tex-Mex Restaurant,Dive Bar,Latin American Restaurant,New American Restaurant,Baseball Field,Pool,Caribbean Restaurant,Health Food Store,Nightclub,Community Center,Concert Hall,Dance Studio,Karaoke Bar,Ramen Restaurant,Shoe Store,Sports Bar,Arts & Crafts Store,Beer Garden,Butcher,Farmers Market,Fish Market,Gift Shop,Vegetarian / Vegan Restaurant,Arcade,Beer Bar,Cycle Studio,Ethiopian Restaurant,Music Venue,Southern / Soul Food Restaurant,Big Box Store,Bus Line,Irish Pub,Event Space,Hotel Pool,Speakeasy,Tapas Restaurant,Theater,Bike Shop,Bubble Tea Shop,Cantonese Restaurant,Dim Sum Restaurant,English Restaurant,General Entertainment,Historic Site,Hotpot Restaurant,Malay Restaurant,Museum,Noodle House,Record Shop,Roof Deck,Szechuan Restaurant,Tea Room,Check Cashing Service,Sporting Goods Shop,Video Game Store,Cuban Restaurant,Falafel Restaurant,Medical Center,Molecular Gastronomy Restaurant,Home Service,Building,Comedy Club,Hotel Bar,Indie Theater,Peruvian Restaurant,Pie Shop,Pub,Rock Climbing Spot,Market,Massage Studio,Argentinian Restaurant,Lingerie Store,Taco Place,Video Store,Track,Office,South American Restaurant,Train Station,Cheese Shop,Creperie,Food Stand,Hawaiian Restaurant,Hot Dog Joint,Polish Restaurant,Shanghai Restaurant,Art Gallery,Board Shop,Boxing Gym,Candy Store,Climbing Gym,Scenic Lookout,Science Museum,Print Shop,Street Art,Event Service,Paella Restaurant,Beer Store,Moroccan Restaurant,Pet Café,Swiss Restaurant,Gay Bar,Automotive Shop,Campground,Farm,Accessories Store,Jewelry Store,Israeli Restaurant,Miscellaneous Shop,Sports Club,Dry Cleaner,Church,Fruit & Vegetable Store,Supplement Shop,Boat or Ferry,Leather Goods Store,Pier,Doctor's Office,Garden,Rock Club,Bike Rental / Bike Share,Filipino Restaurant,German Restaurant,Wings Joint,Gymnastics Gym,Laundry Service,Vape Store,Indie Movie Theater,Jazz Club,Snack Place,Exhibit,Auto Workshop,Construction & Landscaping,College Academic Building,IT Services,Bistro,Turkish Restaurant,Music School,Bridal Shop,Comfort Food Restaurant,Cultural Center,Nail Salon,Frozen Yogurt Shop,Cajun / Creole Restaurant,Outdoors & Recreation,Racetrack,Afghan Restaurant,Czech Restaurant,Non-Profit,Toy / Game Store,Art Museum,Circus,College Bookstore,Field,Fountain,High School,Library,Opera House,School,Animal Shelter,Design Studio,Newsstand,Athletics & Sports,Hostel,Post Office,Japanese Curry Restaurant,Indoor Play Area,Basketball Court,General College & University,Theme Park Ride / Attraction,Golf Course,Outdoor Sculpture,Fish & Chips Shop,Jewish Restaurant,Scandinavian Restaurant,Bowling Alley,Skating Rink,Himalayan Restaurant,Sake Bar,Arepa Restaurant,Piercing Parlor,Poke Place,Chiropractor,African Restaurant,Shop & Service,Hardware Store,South Indian Restaurant,Other Repair Shop,Brewery,Distillery,Photography Studio,Tanning Salon,North Indian Restaurant,Taiwanese Restaurant,Smoothie Shop,Neighborhood,Music Store,Theme Park,Business Service,Baseball Stadium,Gym Pool,Airport Terminal,Romanian Restaurant,Train,Adult Boutique,Spiritual Center,Strip Club,Sri Lankan Restaurant,Rafting,River,Duty-free Shop,Tourist Information Center,College Gym,Factory,Laser Tag,Austrian Restaurant,Perfume Shop,Soup Place
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,2.0,1.0,1.0,1.0,1.0,4.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


summary New York City:
number of neighborhoods = 276
number of veune categories = 343
number of venues per Borough:
    Bronx             387
    Brooklyn          933
    Manhattan        1306
    Queens            682
    Staten Island     295

# TORONTO DATA

In [34]:
import pandas as pd

wikipedia_link = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
df = pd.read_html(wikipedia_link)[0]
df.columns = ['PostalCode', 'Borough', 'Neighborhood']
df = df.iloc[1:]

df = df[df['Borough']!='Not assigned']

index = df.loc[df['Neighborhood']=='Not assigned'].index.values
df.loc[index, 'Neighborhood'] = df.loc[index, 'Borough']

series = df.groupby(['PostalCode']).apply(lambda x: ', '.join(x['Neighborhood']))
df2 = series.to_frame().reset_index()
df2.columns = ['PostalCode', 'Neighborhood']
df = pd.merge(df[['PostalCode','Borough']], df2, on='PostalCode', how='right')
df = df.drop_duplicates(['PostalCode'])  

df = df.reset_index(drop=True)
df.shape

(103, 3)

In [35]:
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


In [36]:
!pip install --user geocoder

Collecting geocoder
  Downloading https://files.pythonhosted.org/packages/4f/6b/13166c909ad2f2d76b929a4227c952630ebaf0d729f6317eb09cbceccbab/geocoder-1.38.1-py2.py3-none-any.whl (98kB)
[K    100% |████████████████████████████████| 102kB 7.2MB/s ta 0:00:01
[?25hRequirement not upgraded as not directly required: six in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from geocoder)
Requirement not upgraded as not directly required: click in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from geocoder)
Collecting ratelim (from geocoder)
  Downloading https://files.pythonhosted.org/packages/f2/98/7e6d147fd16a10a5f821db6e25f192265d6ecca3d82957a4fdd592cad49c/ratelim-0.1.6-py2.py3-none-any.whl
Requirement not upgraded as not directly required: requests in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from geocoder)
Requirement not upgraded as not directly required: future in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from geocoder)
Requirement 

In [52]:
import geocoder # import geocoder

latitude = []
longitude = []

#for borough in df['Borough']:
    #g = geocoder.osm('{}, Toronto, Ontario'.format(borough))
    #latitude.append(g.osm['y'])
    #longitude.append(g.osm['x'])

for pc in df['PostalCode']:
    # initialize your variable to None
    lat_lng_coords = None

    # loop until you get the coordinates
    while(lat_lng_coords is None):
        g = geocoder.bing('{}, Toronto, Ontario'.format(pc), key='AqawTOeswMmRDXPFbRfMorbLJ0QBIN0TvBe7uwqgiT7Tko9WvIoNUI0LMho82LwE')        
        lat_lng_coords = g.latlng

    latitude.append(g.latlng[0])
    longitude.append(g.latlng[1])

df3 = pd.DataFrame({'PostalCode': df['PostalCode'], 'Latitude': latitude}) 
df4 = pd.DataFrame({'PostalCode': df['PostalCode'], 'Longitude': longitude})
df_new = pd.merge(df, df3, on = 'PostalCode')
df_new = pd.merge(df_new, df4, on = 'PostalCode')
df_new.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.751255,-79.329895
1,M4A,North York,Victoria Village,43.729958,-79.314201
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65522,-79.361969
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.722801,-79.450691
4,M7A,Queen's Park,Queen's Park,43.664486,-79.393021
5,M9A,Etobicoke,Islington Avenue,43.662743,-79.528427
6,M1B,Scarborough,"Rouge, Malvern",43.810154,-79.194603
7,M3B,North York,Don Mills North,43.749134,-79.362007
8,M4B,East York,"Woodbine Gardens, Parkview Hill",43.707577,-79.310913
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.657467,-79.377708


In [54]:
import numpy as np # library to handle data in a vectorized manner
import json # library to handle JSON files
import requests # library to handle requests

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
# import k-means from clustering stage
from sklearn.cluster import KMeans

In [55]:
radius = 250
n = len(latitude)
filtered_columns = ['PostalCode', 'Borough', 'venue.id', 'venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
df_new2 = pd.DataFrame([], columns = filtered_columns)
for i in range(0, n):
    lati = latitude[i]
    long = longitude[i]
    url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, lati, long, VERSION, radius, LIMIT)
    results = requests.get(url).json()
    
    try:
        items = results['response']['groups'][0]['items']
        dataframe = json_normalize(items)
        m = len(dataframe)
        dataframe['PostalCode'] = pd.DataFrame({'PostalCode': [df_new.loc[i, 'PostalCode']] * m})
        dataframe['Borough'] = pd.DataFrame({'Borough': [df_new.loc[i, 'Borough']] * m})
        dataframe['venue.categories'] = dataframe['venue.categories'].apply(lambda x: x[0]['name'])
        dataframe_fil = dataframe.loc[:, filtered_columns]
        df_new2 = df_new2.append(dataframe_fil)
    except KeyError:
        pass


df_new2.head(10)

Unnamed: 0,PostalCode,Borough,venue.id,venue.name,venue.categories,venue.location.lat,venue.location.lng
0,M3A,North York,4e8d9dcdd5fbbbb6b3003c7b,Brookbanks Park,Park,43.751976,-79.33214
0,M5A,Downtown Toronto,54ea41ad498e9a11e9e13308,Roselle Desserts,Bakery,43.653447,-79.362017
1,M5A,Downtown Toronto,53b8466a498e83df908c3f21,Tandem Coffee,Coffee Shop,43.653559,-79.361809
2,M5A,Downtown Toronto,4af59046f964a520e0f921e3,Figs Breakfast & Lunch,Breakfast Spot,43.655675,-79.364503
3,M5A,Downtown Toronto,50760559e4b0e8c7babe2497,Body Blitz Spa East,Spa,43.654735,-79.359874
4,M5A,Downtown Toronto,4ae5b91ff964a520a6a121e3,Morning Glory Cafe,Breakfast Spot,43.653947,-79.361149
5,M5A,Downtown Toronto,4b58dd55f964a5208f6f28e3,The Yoga Lounge,Yoga Studio,43.655515,-79.364955
6,M5A,Downtown Toronto,4b3cfbb5f964a5201e8b25e3,Redline Coffee and Espresso,Café,43.655692,-79.364095
7,M5A,Downtown Toronto,4bc39c914cdfc9b6f29c9721,Souvlaki Express,Greek Restaurant,43.655584,-79.364438
8,M5A,Downtown Toronto,4bd8630535aad13a4bb890f3,Subway,Sandwich Place,43.655624,-79.364554


In [65]:
countCategoriesPerNeighborhood = df_new2.groupby(['PostalCode','venue.categories']).count()['venue.name'].reset_index()

list_Neighborhood = countCategoriesPerNeighborhood['PostalCode'].unique()
y = pd.DataFrame(list_Neighborhood, columns = ['PostalCode'])

list_venueCategories = countCategoriesPerNeighborhood['venue.categories'].unique()
df_allVenueCategories = pd.DataFrame(list_venueCategories, columns = ['venue.categories'])

X = pd.DataFrame([])

for pc in list_Neighborhood:
    df_CategoriesPerNeighborhood = countCategoriesPerNeighborhood[countCategoriesPerNeighborhood['PostalCode'] == pc][['venue.categories','venue.name']]
    df_CategoriesPerNeighborhood = df_CategoriesPerNeighborhood.append(pd.DataFrame({'venue.categories': list(set(df_allVenueCategories['venue.categories'])-set(df_CategoriesPerNeighborhood['venue.categories'])), 'venue.name': [0] * len(list(set(df_allVenueCategories['venue.categories'])-set(df_CategoriesPerNeighborhood['venue.categories'])))})).reset_index(drop=True)
    df_mapping = pd.merge(df_allVenueCategories, df_CategoriesPerNeighborhood, on = 'venue.categories', how = 'left')
    X = X.append(df_mapping.T.iloc[1])
    
X = X.reset_index(drop = True)
X.columns = list_venueCategories
X

Unnamed: 0,Soccer Field,Fast Food Restaurant,Pizza Place,Gym,Gym Pool,Bakery,Brewery,Construction & Landscaping,Wine Shop,Auto Garage,Intersection,Chinese Restaurant,Discount Store,Grocery Store,Shanghai Restaurant,Sushi Restaurant,Vietnamese Restaurant,Golf Course,Pharmacy,Bus Stop,Residential Building (Apartment / Condo),Baseball Field,Bank,Electronics Store,Park,Asian Restaurant,Bike Shop,Clothing Store,Dim Sum Restaurant,Italian Restaurant,Sporting Goods Shop,Bar,Coffee Shop,Japanese Restaurant,Massage Studio,Metro Station,Shopping Mall,Gym / Fitness Center,Sandwich Place,Breakfast Spot,Café,Pet Store,Playground,Health Food Store,Pub,Trail,Convenience Store,Restaurant,Indian Restaurant,Diner,Photography Studio,Dessert Shop,Thai Restaurant,Athletics & Sports,Light Rail Station,Supermarket,Tennis Court,American Restaurant,Beer Store,Bistro,General Entertainment,Bookstore,Burger Joint,Dog Run,Flower Shop,Gay Bar,Ice Cream Shop,Juice Bar,Men's Store,Salon / Barbershop,Sports Bar,Steakhouse,Strip Club,Tea Room,Theme Restaurant,Furniture / Home Store,Greek Restaurant,Spa,Yoga Studio,Art Gallery,Burrito Place,College Rec Center,Ethiopian Restaurant,Hookah Bar,Lake,Middle Eastern Restaurant,Music Venue,Other Great Outdoors,Ramen Restaurant,Taco Place,BBQ Joint,Camera Store,Church,Cosmetics Shop,Creperie,Food Truck,Gastropub,Hostel,Hotel,Korean Restaurant,Performing Arts Venue,Poke Place,Speakeasy,Liquor Store,Art Museum,Bubble Tea Shop,Miscellaneous Shop,Seafood Restaurant,Building,Cupcake Shop,Deli / Bodega,Food Court,General Travel,Gluten-free Restaurant,Monument / Landmark,Office,Plaza,Salad Place,Vegetarian / Vegan Restaurant,Wine Bar,Beach,Beer Garden,Scenic Lookout,Soup Place,Beer Bar,Museum,Tailor Shop,Butcher,Comfort Food Restaurant,Hardware Store,Health & Beauty Service,French Restaurant,Mexican Restaurant,College Gym,Noodle House,Video Game Store,Cocktail Bar,Dumpling Restaurant,Gaming Cafe,Hotpot Restaurant,Snack Place,Toy / Game Store,Caribbean Restaurant,Train Station,Brazilian Restaurant,Colombian Restaurant,Concert Hall,Dance Studio,Lounge,Opera House,Poutine Place,Theater,Women's Store,Field,Hockey Arena,Bus Line,Mac & Cheese Joint,Record Shop,Food,Nightclub,Smoothie Shop,Stadium,Eastern European Restaurant,Fish Market,Gourmet Shop,Falafel Restaurant,Frozen Yogurt Shop,Optical Shop,Shoe Store,Pool,Other Repair Shop,Filipino Restaurant,Home Service,Fried Chicken Joint,Video Store
0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


summary Toronto:
number of neighborhoods = 73
number of venue categories = 175
number of venues per Borough:
    Central Toronto      27
    Downtown Toronto    419
    East Toronto         56
    East York            12
    Etobicoke            23
    Mississauga          42
    North York           59
    Queen's Park          5
    Scarborough          23
    West Toronto         98
    York                 10

In [79]:
toronto_count_borough = df_new2.groupby(['Borough']).count()['venue.categories']
toronto_count_borough

Borough
Central Toronto      27
Downtown Toronto    419
East Toronto         56
East York            12
Etobicoke            23
Mississauga          42
North York           59
Queen's Park          5
Scarborough          23
West Toronto         98
York                 10
Name: venue.categories, dtype: int64

In [83]:
list_venueCategories_in_both_lists = list(set(list_venueCategories).intersection(set(list_venueCategories_nyc)))
len(list_venueCategories_in_both_lists)

159

There are 159 venue categories in both cities.

In [86]:
list_all_venue_categories = list(set(list_venueCategories) | set(list_venueCategories_nyc))
len(list_all_venue_categories)

359

There are 359 venue categories / features in total.

In [98]:
X_in_total = pd.concat([X_nyc, X], axis = 0, ignore_index=True, keys=['NYC', 'TRNT'])
X_in_total = X_in_total.replace(np.NaN, 0)
# X_in_total.apply(lambda x: sum(x), axis = 0)
# len(X_in_total.apply(lambda x: sum(x), axis = 0)[X_in_total.apply(lambda x: sum(x), axis = 0) == 0])

X_in_total.shape = 349 rows x 359 columns
In total we have 349 Neighborhoods (276 in New York, 73 in Toronto) and 359 venue categories.

In [108]:
n_clusters = 10

k_means = KMeans(init="k-means++", n_clusters=10, n_init=12)
k_means.fit(X_in_total)

k_means_labels = k_means.labels_
k_means_cluster_centers = k_means.cluster_centers_

y_in_total = pd.concat([y_nyc, y])
yWithClusterNumber = pd.concat([y_in_total.reset_index(drop=True), pd.DataFrame(k_means_labels.tolist(), columns = ["Cluster number"])], axis=1)
yWithClusterNumber.columns = ['NYC', 'TRNT', 'Cluster number']
#yWithClusterNumber = pd.merge(df_new[['Borough', 'Latitude', 'Longitude']], yWithClusterNumber, on='Borough', how='right').groupby(['Borough']).mean().reset_index()


In [109]:
yWithClusterNumber.groupby(['Cluster number']).count()

Unnamed: 0_level_0,NYC,TRNT
Cluster number,Unnamed: 1_level_1,Unnamed: 2_level_1
0,6,0
1,206,58
2,0,3
3,2,0
4,1,0
5,0,1
6,31,3
7,2,0
8,27,5
9,1,3


# number of elements per cluster and per city (10 cluster):

| Cluster  | NYC  | TRNT  |
|:---------|:----:|:-----:|
| 0        | 6    | 0     |
| 1        | 206  | 58    |
| 2        | 0    | 3     |
| 3        | 2    | 0     |
| 4        | 1    | 0     |
| 5        | 0    | 1     |
| 6        | 31   | 3     |
| 7        | 2    | 0     |
| 8        | 27   | 5     |
| 9        | 1    | 3     |

In [114]:
k_means_cluster_centers.shape

(10, 359)

# Cluster analysis regarding cluster features: What are the characteristics of each cluster?

In [158]:
merge_analysis_biggest_diff = pd.DataFrame([], columns = ['cluster','venue.categories','biggest diff'])

df_all_venue_categories = pd.DataFrame(list_all_venue_categories, columns = ['venue.categories'])
n_biggest_diff = 4 # the four most important characteristics of each cluster

for i in range(0, n_clusters):
    array_biggest_diff = np.sort(((np.absolute(k_means_cluster_centers[i] - k_means_cluster_centers[list(set(list(range(0, n_clusters))) - set([i])), :]))).min(axis=0), axis=None)[list(range(len(list_all_venue_categories)-n_biggest_diff,len(list_all_venue_categories))),]
    df_biggest_diff = pd.DataFrame(array_biggest_diff.tolist(), columns = ['biggest diff']).reset_index(drop=True)
    
    list_rows_biggest_diff = (np.argsort(((np.absolute(k_means_cluster_centers[i] - k_means_cluster_centers[list(set(list(range(0, n_clusters))) - set([i])), :]))).min(axis=0), axis=None)[list(range(len(list_all_venue_categories)-n_biggest_diff,len(list_all_venue_categories))),]).tolist()
    df_venue_categories = df_all_venue_categories.loc[list_rows_biggest_diff, 'venue.categories'].reset_index(drop=True)
    
    df_cluster = pd.DataFrame(n_biggest_diff*[i], columns = ['cluster']).reset_index(drop=True)
    
    merge_analysis_biggest_diff = merge_analysis_biggest_diff.append(pd.concat([df_cluster, df_venue_categories, df_biggest_diff], axis = 1))
    

merge_analysis_biggest_diff = merge_analysis_biggest_diff.reset_index(drop=True)
merge_analysis_biggest_diff.head(10)

Unnamed: 0,cluster,venue.categories,biggest diff
0,0,Gaming Cafe,0.75
1,0,Cocktail Bar,1.177083
2,0,Stadium,1.5
3,0,Poke Place,1.627451
4,1,Hotel Pool,0.118316
5,1,Poke Place,0.121212
6,1,Bistro,0.276292
7,1,Gift Shop,0.397727
8,2,Event Space,0.996212
9,2,Boutique,1.0


In [159]:
# Cluster 0
i = 0
begin = 4*i
end = 4*i+4
merge_analysis_biggest_diff.loc[list(range(begin, end)), ['cluster','venue.categories','biggest diff']]

Unnamed: 0,cluster,venue.categories,biggest diff
0,0,Gaming Cafe,0.75
1,0,Cocktail Bar,1.177083
2,0,Stadium,1.5
3,0,Poke Place,1.627451


The cluster 0 is characterized by the number of venue categories "Poke Place" and "Stadium".

| Cluster  | venue.categories  | biggest diff |
|:---------|:-----------------:|:------------:|
| 0        | Gaming Cafe       | 0.8          |
| 0        | Cocktail Bar      | 1.2          |
| 0        | Stadium           | 1.5          |
| 0        | Poke Place        | 1.6          |

In [160]:
# Cluster 1
i = 1
begin = 4*i
end = 4*i+4
merge_analysis_biggest_diff.loc[list(range(begin, end)), ['cluster','venue.categories','biggest diff']]

Unnamed: 0,cluster,venue.categories,biggest diff
4,1,Hotel Pool,0.118316
5,1,Poke Place,0.121212
6,1,Bistro,0.276292
7,1,Gift Shop,0.397727


The cluster 1 does not distinguish itself by any special venue categories.

| Cluster  | venue.categories  | biggest diff |
|:---------|:-----------------:|:------------:|
| 1        | Hotel Pool        | 0.1          |
| 1        | Poke Place        | 0.1          |
| 1        | Bistro            | 0.3          |
| 1        | Gift Shop         | 0.4          |

In [161]:
# Cluster 2
i = 2
begin = 4*i
end = 4*i+4
merge_analysis_biggest_diff.loc[list(range(begin, end)), ['cluster','venue.categories','biggest diff']]

Unnamed: 0,cluster,venue.categories,biggest diff
8,2,Event Space,0.996212
9,2,Boutique,1.0
10,2,Massage Studio,1.0
11,2,Laundry Service,2.5


The cluster 2 is characterized by the number of venue categories "Laundry Service" and "Massage Studio".

| Cluster  | venue.categories  | biggest diff |
|:---------|:-----------------:|:------------:|
| 2        | Event Space       | 1.0          |
| 2        | Boutique          | 1.0          |
| 2        | Massage Studio    | 1.0          |
| 2        | Laundry Service   | 2.5          |

In [162]:
# Cluster 3
i = 3
begin = 4*i
end = 4*i+4
merge_analysis_biggest_diff.loc[list(range(begin, end)), ['cluster','venue.categories','biggest diff']]

Unnamed: 0,cluster,venue.categories,biggest diff
12,3,Bistro,1.0
13,3,Business Service,1.352941
14,3,Juice Bar,1.40625
15,3,Climbing Gym,7.34375


The cluster 3 is characterized by the number of venue category "Climbing Gym".

| Cluster  | venue.categories  | biggest diff |
|:---------|:-----------------:|:------------:|
| 3        | Bistro            | 1.0          |
| 3        | Business Service  | 1.4          |
| 3        | Juice Bar         | 1.4          |
| 3        | Climbing Gym      | 7.3          |

In [163]:
# Cluster 4
i = 4
begin = 4*i
end = 4*i+4
merge_analysis_biggest_diff.loc[list(range(begin, end)), ['cluster','venue.categories','biggest diff']]

Unnamed: 0,cluster,venue.categories,biggest diff
16,4,Antique Shop,1.0
17,4,Flower Shop,2.0
18,4,Thrift / Vintage Store,3.0
19,4,Stadium,4.0


The cluster 4 is characterized by the number of venue categories "Stadium", "Thrift / Vintage Store" and "Flower Shop".

| Cluster  | venue.categories  | biggest diff |
|:---------|:-----------------:|:------------:|
| 4        | Antique Shop      | 1.0          |
| 4        | Flower Shop       | 2.0          |
| 4        | Vintage Store     | 3.0          |
| 4        | Stadium           | 4.0          |

In [164]:
# Cluster 5
i = 5
begin = 4*i
end = 4*i+4
merge_analysis_biggest_diff.loc[list(range(begin, end)), ['cluster','venue.categories','biggest diff']]

Unnamed: 0,cluster,venue.categories,biggest diff
20,5,Movie Theater,1.25
21,5,High School,1.823529
22,5,Gift Shop,2.0
23,5,Gym Pool,2.25


The cluster 5 is characterized by the number of venue categories "Gym Pool" and "Gift Shop".

| Cluster  | venue.categories  | biggest diff |
|:---------|:-----------------:|:------------:|
| 5        | Movie Theater     | 1.3          |
| 5        | High School       | 1.8          |
| 5        | Gift Shop         | 2.0          |
| 5        | Gym pool          | 2.3          |

In [165]:
# Cluster 6
i = 6
begin = 4*i
end = 4*i+4
merge_analysis_biggest_diff.loc[list(range(begin, end)), ['cluster','venue.categories','biggest diff']]

Unnamed: 0,cluster,venue.categories,biggest diff
24,6,College Gym,0.227941
25,6,Dessert Shop,0.235294
26,6,Gay Bar,0.235294
27,6,Shanghai Restaurant,0.283088


The cluster 6 does not distinguish itself by any special venue categories.

| Cluster  | venue.categories  | biggest diff |
|:---------|:-----------------:|:------------:|
| 6        | College Gym       | 0.2          |
| 6        | Dessert Shop      | 0.2          |
| 6        | Gay Bar           | 0.2          |
| 6        | Shanghai Restau.  | 0.3          |

In [166]:
# Cluster 7
i = 7
begin = 4*i
end = 4*i+4
merge_analysis_biggest_diff.loc[list(range(begin, end)), ['cluster','venue.categories','biggest diff']]

Unnamed: 0,cluster,venue.categories,biggest diff
28,7,Neighborhood,1.333333
29,7,Malay Restaurant,1.46875
30,7,Hookah Bar,1.84375
31,7,American Restaurant,7.0


The cluster 7 is characterized by the number of venue category "American Restaurant".

| Cluster  | venue.categories  | biggest diff |
|:---------|:-----------------:|:------------:|
| 7        | Neighborhood      | 1.3          |
| 7        | Malay Restau.     | 1.5          |
| 7        | Hookah Bar        | 1.8          |
| 7        | American Restau.  | 7.0          |

In [169]:
# Cluster 8
i = 8
begin = 4*i
end = 4*i+4
merge_analysis_biggest_diff.loc[list(range(begin, end)), ['cluster','venue.categories','biggest diff']]

Unnamed: 0,cluster,venue.categories,biggest diff
32,8,Miscellaneous Shop,0.269886
33,8,Burrito Place,0.288603
34,8,Middle Eastern Restaurant,0.678309
35,8,Bistro,1.125


The cluster 8 is characterized by the number of venue category "Bistro".

| Cluster  | venue.categories  | biggest diff |
|:---------|:-----------------:|:------------:|
| 8        | Miscellaneous Shop| 0.3          |
| 8        | Burrito Place     | 0.3          |
| 8        | Middle Eastern Res| 0.7          |
| 8        | Bistro            | 1.1          |

In [170]:
# Cluster 9
i = 9
begin = 4*i
end = 4*i+4
merge_analysis_biggest_diff.loc[list(range(begin, end)), ['cluster','venue.categories','biggest diff']]

Unnamed: 0,cluster,venue.categories,biggest diff
36,9,Gym Pool,0.75
37,9,Gift Shop,0.75
38,9,Movie Theater,0.75
39,9,Massage Studio,1.0


The cluster 9 is characterized by the number of venue category "Massage Studio".

| Cluster  | venue.categories  | biggest diff |
|:---------|:-----------------:|:------------:|
| 9        | Gym Pool          | 0.8          |
| 9        | Gift Shop         | 0.8          |
| 9        | Movie Theater     | 0.8          |
| 9        | Massage Studio    | 1.0          |

# Which Neighborhood has most of offer? 

In [185]:
series_most_offer = X_in_total.apply(lambda x: sum(x), axis = 1)
max_value = series_most_offer.max()
max_value_neighborhood = y_in_total.loc[series_most_offer.values.argmax(), :]
print(max_value," ",max_value_neighborhood)

53.0   Neighborhood    Murray Hill
PostalCode              NaN
Name: 173, dtype: object


The Neighborhood Murray Hill has most of offer with 53 venues. Under the assumption that foursquare provides complete information regarding the venues per Neighborhood. (unrealistic)