In [47]:
!pip install folium
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import geopy
from geopy.geocoders import Nominatim
import folium
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
import json
from sklearn.cluster import KMeans



##### Retriving Data from Wikipedia using BeautifulSoup libraries 

In [48]:
req = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
soup = BeautifulSoup(req.content,'lxml')
table = soup.find_all('table')[0]
df1 = pd.read_html(str(table))
df=pd.DataFrame(df1[0])  #Transforming it to df DATAFRAME and t 

This data from Wikipedia is converted into a panda dataframe, and is now being processed with the help of various pandas features.

In [50]:
df.dropna(inplace=True)    #Removing the Nan data or errors!!
df['Neighborhood'].replace(to_replace="/",value=",",regex=True,inplace=True)  # Replacing slashes with commas.
df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park , Harbourfront"
5,M6A,North York,"Lawrence Manor , Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government"


In [6]:
df2=pd.read_csv('http://cocl.us/Geospatial_data')
df2.head()   #  Geospatial Data with postal codes.

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


#### Merging the geospatial data and the processed data from wikipedia as a dataframe.

In [7]:
df3= pd.merge(df, df2,left_on='Postal code',right_on='Postal Code')
df3.drop(labels=['Postal Code'],axis=1,inplace=True)
df3.head()

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor , Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government",43.662301,-79.389494


#### Using geolactor to find the Latitude and Longitude, of Toronto,Ontario

In [8]:
address='Toronto,Ontario'    # This location is used to find the Latitude and longitude.
geoloc=Nominatim(user_agent='explorer')  # Nominaim library helps to find the location
location = geoloc.geocode(address)
longitude=location.longitude
latitude=location.latitude
print("Latitude:",latitude)
print("Longitude",longitude)

Latitude: 43.6534817
Longitude -79.3839347


#### Lets show a small map of toronto

In [9]:
maptoro = folium.Map(location=[latitude, longitude], zoom_start=10)

for lat, lng, borough, neighborhood in zip(df3['Latitude'],df3['Longitude'], df3['Borough'], df3['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(maptoro)  
    
maptoro

#### Now lets do the same for data with new york, but here the data is available from the given URL

In [10]:
!wget -q -O 'newyork_data.json' https://cocl.us/new_york_dataset
print('Data downloaded!')

Data downloaded!


In [11]:
with open('newyork_data.json') as json_data:
    newyork_data = json.load(json_data)

In [12]:
neigh_data=newyork_data['features']

 #### Here we use the data from the net to convert it into a dataframe through an iteration by taking a piece of information from json file and adding it to the dataframe created below line by line.

In [13]:
# define the dataframe columns
column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude'] 

# instantiate the dataframe
df4 = pd.DataFrame(columns=column_names)


#### By this method easily a dataframe of new york(df4) is created having the Neighborhood with their respective longitudes and latitudes.

In [14]:
for data in neigh_data:
    borough = neighborhood_name = data['properties']['borough'] 
    neighborhood_name = data['properties']['name']
        
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_lat = neighborhood_latlon[1]
    neighborhood_lon = neighborhood_latlon[0]
    
    df4= df4.append({'Borough': borough,
                                          'Neighborhood': neighborhood_name,
                                          'Latitude': neighborhood_lat,
                                          'Longitude': neighborhood_lon}, ignore_index=True)

In [15]:
df4.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Bronx,Wakefield,40.894705,-73.847201
1,Bronx,Co-op City,40.874294,-73.829939
2,Bronx,Eastchester,40.887556,-73.827806
3,Bronx,Fieldston,40.895437,-73.905643
4,Bronx,Riverdale,40.890834,-73.912585


In [16]:
address = 'New York City, NY'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude1 = location.latitude
longitude1 = location.longitude
print('Latotude:',latitude)
print("Longitude:", longitude)

Latotude: 43.6534817
Longitude: -79.3839347


#### This is a small map of New York, just like the one in Toronto.

In [17]:
mapnyc = folium.Map(location=[latitude1, longitude1], zoom_start=10)

for lat, lng, borough, neighborhood in zip(df4['Latitude'],df4['Longitude'], df4['Borough'], df4['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(mapnyc)  
    
mapnyc

#### Now with the help of the FourSquare account we would take the info for both the cities i.e New York and Toronto and try to analyse any simlarities based on their cluster formation and venues.

In [18]:
CLIENT_ID = 'B2KBZZZ1RTP5TJSWK124PV3SMUPIQZFLL4BHZ0X4UFKEVS2C' 
CLIENT_SECRET = 'J4ZZ5W2O2MC3WXRDY32N1L5HMXBVNCENQBSLEIFWBOEKYYHV' 
VERSION = '20200422'
limit=100
radius=500
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

CLIENT_ID: B2KBZZZ1RTP5TJSWK124PV3SMUPIQZFLL4BHZ0X4UFKEVS2C
CLIENT_SECRET:J4ZZ5W2O2MC3WXRDY32N1L5HMXBVNCENQBSLEIFWBOEKYYHV


#### Testing the URL

In [58]:
latitude = df3.loc[0, 'Latitude']  # Toronto
longitude =df3.loc[0, 'Longitude']

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    latitude, 
    longitude, 
    radius, 
    limit)
url

'https://api.foursquare.com/v2/venues/explore?&client_id=B2KBZZZ1RTP5TJSWK124PV3SMUPIQZFLL4BHZ0X4UFKEVS2C&client_secret=J4ZZ5W2O2MC3WXRDY32N1L5HMXBVNCENQBSLEIFWBOEKYYHV&v=20200422&ll=43.7532586,-79.3296565&radius=500&limit=100'

#### Getting the nearby venues for all neighborhoods in using the CLIENT ID and CLIOENT SECRET for each neighborhood.

In [20]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            limit)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

toro_venues = getNearbyVenues(names=df3['Neighborhood'],
                                   latitudes=df3['Latitude'],
                                   longitudes=df3['Longitude']
                                  )
newyork_venues=getNearbyVenues(names=df4['Neighborhood'],
                               latitudes=df4['Latitude'],
                               longitudes=df4['Longitude']
                                  )
print("")

Parkwoods
Victoria Village
Regent Park , Harbourfront
Lawrence Manor , Lawrence Heights
Queen's Park , Ontario Provincial Government
Islington Avenue
Malvern , Rouge
Don Mills
Parkview Hill , Woodbine Gardens
Garden District, Ryerson
Glencairn
West Deane Park , Princess Gardens , Martin Grove , Islington , Cloverdale
Rouge Hill , Port Union , Highland Creek
Don Mills
Woodbine Heights
St. James Town
Humewood-Cedarvale
Eringate , Bloordale Gardens , Old Burnhamthorpe , Markland Wood
Guildwood , Morningside , West Hill
The Beaches
Berczy Park
Caledonia-Fairbanks
Woburn
Leaside
Central Bay Street
Christie
Cedarbrae
Hillcrest Village
Bathurst Manor , Wilson Heights , Downsview North
Thorncliffe Park
Richmond , Adelaide , King
Dufferin , Dovercourt Village
Scarborough Village
Fairview , Henry Farm , Oriole
Northwood Park , York University
East Toronto
Harbourfront East , Union Station , Toronto Islands
Little Portugal , Trinity
Kennedy Park , Ionview , East Birchmount Park
Bayview Village
Do

In [21]:
toro_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,Parkwoods,43.753259,-79.329656,TTC stop - 44 Valley Woods,43.755402,-79.333741,Bus Stop
3,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
4,Victoria Village,43.725882,-79.315572,Tim Hortons,43.725517,-79.313103,Coffee Shop


In [22]:
newyork_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Wakefield,40.894705,-73.847201,Lollipops Gelato,40.894123,-73.845892,Dessert Shop
1,Wakefield,40.894705,-73.847201,Carvel Ice Cream,40.890487,-73.848568,Ice Cream Shop
2,Wakefield,40.894705,-73.847201,Walgreens,40.896528,-73.8447,Pharmacy
3,Wakefield,40.894705,-73.847201,Rite Aid,40.896649,-73.844846,Pharmacy
4,Wakefield,40.894705,-73.847201,Dunkin',40.890459,-73.849089,Donut Shop


### A testing of further work

In [23]:
toro_onehot = pd.get_dummies(toro_venues[['Venue Category']], prefix="", prefix_sep="")
toro_onehot['Neighborhood'] = toro_venues['Neighborhood'] 
fixed_columns = [toro_onehot.columns[-1]] + list(toro_onehot.columns[:-1])
toro_onehot =toro_onehot[fixed_columns]
toro_grouped = toro_onehot.groupby('Neighborhood').mean().reset_index()

In [24]:
num_top_venues = 5

for hood in toro_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toro_grouped[toro_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Agincourt----
                       venue  freq
0  Latin American Restaurant  0.25
1                     Lounge  0.25
2             Breakfast Spot  0.25
3         Chinese Restaurant  0.25
4                     Market  0.00


----Alderwood , Long Branch----
            venue  freq
0     Pizza Place   0.2
1    Skating Rink   0.1
2  Sandwich Place   0.1
3             Gym   0.1
4    Dance Studio   0.1


----Bathurst Manor , Wilson Heights , Downsview North----
              venue  freq
0              Bank  0.11
1       Coffee Shop  0.11
2  Sushi Restaurant  0.05
3       Gas Station  0.05
4    Sandwich Place  0.05


----Bayview Village----
                      venue  freq
0        Chinese Restaurant  0.25
1                      Café  0.25
2                      Bank  0.25
3       Japanese Restaurant  0.25
4  Mediterranean Restaurant  0.00


----Bedford Park , Lawrence Manor East----
                venue  freq
0      Sandwich Place  0.09
1  Italian Restaurant  0.09
2          Restaura

In [25]:
nyc_onehot = pd.get_dummies(newyork_venues[['Venue Category']], prefix="", prefix_sep="")
nyc_onehot['Neighborhood'] = newyork_venues['Neighborhood'] 
fixed_columns = [nyc_onehot.columns[-1]] + list(nyc_onehot.columns[:-1])
nyc_onehot =nyc_onehot[fixed_columns]
nyc_grouped = nyc_onehot.groupby('Neighborhood').mean().reset_index()

In [26]:
num_top_venues = 5

for hood in nyc_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = nyc_grouped[nyc_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Allerton----
              venue  freq
0       Pizza Place  0.17
1     Deli / Bodega  0.13
2       Supermarket  0.07
3  Department Store  0.07
4      Dessert Shop  0.03


----Annadale----
         venue  freq
0  Pizza Place  0.14
1     Pharmacy  0.07
2   Restaurant  0.07
3         Park  0.07
4       Bakery  0.07


----Arden Heights----
           venue  freq
0  Deli / Bodega   0.2
1       Pharmacy   0.2
2       Bus Stop   0.2
3    Coffee Shop   0.2
4    Pizza Place   0.2


----Arlington----
                 venue  freq
0             Bus Stop  0.29
1        Deli / Bodega  0.14
2  American Restaurant  0.14
3         Intersection  0.14
4          Coffee Shop  0.14


----Arrochar----
                venue  freq
0            Bus Stop  0.17
1       Deli / Bodega  0.09
2  Italian Restaurant  0.09
3   Polish Restaurant  0.04
4            Pharmacy  0.04


----Arverne----
            venue  freq
0       Surf Spot  0.24
1  Sandwich Place  0.12
2   Metro Station  0.12
3           Beach  0.06
4

#### Making a function to get the most common venues

In [27]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

### Getting top 10 venues for each neighborhood for both the cities

In [28]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
venues_sorted_toro = pd.DataFrame(columns=columns)
venues_sorted_toro['Neighborhood'] = toro_grouped['Neighborhood']

for ind in np.arange(toro_grouped.shape[0]):
    venues_sorted_toro.iloc[ind, 1:] = return_most_common_venues(toro_grouped.iloc[ind, :], num_top_venues)

venues_sorted_toro.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,Lounge,Latin American Restaurant,Breakfast Spot,Chinese Restaurant,Discount Store,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Diner
1,"Alderwood , Long Branch",Pizza Place,Skating Rink,Pharmacy,Sandwich Place,Dance Studio,Pub,Athletics & Sports,Coffee Shop,Gym,Comfort Food Restaurant
2,"Bathurst Manor , Wilson Heights , Downsview North",Bank,Coffee Shop,Fried Chicken Joint,Ice Cream Shop,Pizza Place,Pharmacy,Restaurant,Middle Eastern Restaurant,Bridal Shop,Supermarket
3,Bayview Village,Japanese Restaurant,Café,Bank,Chinese Restaurant,Women's Store,Discount Store,Department Store,Dessert Shop,Dim Sum Restaurant,Diner
4,"Bedford Park , Lawrence Manor East",Pizza Place,Coffee Shop,Sandwich Place,Italian Restaurant,Restaurant,Thai Restaurant,Indian Restaurant,Liquor Store,Pub,Butcher


In [29]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
venues_sorted_nyc = pd.DataFrame(columns=columns)
venues_sorted_nyc['Neighborhood'] = nyc_grouped['Neighborhood']

for ind in np.arange(nyc_grouped.shape[0]):
    venues_sorted_nyc.iloc[ind, 1:] = return_most_common_venues(nyc_grouped.iloc[ind, :], num_top_venues)

venues_sorted_nyc.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Allerton,Pizza Place,Deli / Bodega,Department Store,Supermarket,Chinese Restaurant,Gas Station,Fast Food Restaurant,Bus Station,Pharmacy,Donut Shop
1,Annadale,Pizza Place,Bakery,Sports Bar,Sushi Restaurant,Restaurant,Train Station,Liquor Store,Diner,Food,Deli / Bodega
2,Arden Heights,Pizza Place,Deli / Bodega,Pharmacy,Bus Stop,Coffee Shop,Field,Event Service,Event Space,Exhibit,Eye Doctor
3,Arlington,Bus Stop,Intersection,Deli / Bodega,American Restaurant,Grocery Store,Coffee Shop,Filipino Restaurant,Event Space,Exhibit,Eye Doctor
4,Arrochar,Bus Stop,Deli / Bodega,Italian Restaurant,Liquor Store,Athletics & Sports,Sporting Goods Shop,Supermarket,Middle Eastern Restaurant,Mediterranean Restaurant,Outdoors & Recreation


#### Forming clusters and merging the venue data

In [30]:
k = 5
torocluster = toro_grouped.drop('Neighborhood', 1)
kmeans = KMeans(n_clusters=k, random_state=0).fit(torocluster)
kmeans.labels_[0:10] 
venues_sorted_toro.insert(0, 'Cluster Labels', kmeans.labels_.astype(int))
df3.head()

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor , Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government",43.662301,-79.389494


In [31]:
toro_merged = df3
toro_merged = toro_merged.join(venues_sorted_toro.set_index('Neighborhood'), on='Neighborhood')
toro_merged.head()

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,0.0,Park,Food & Drink Shop,Bus Stop,Women's Store,Dim Sum Restaurant,Dance Studio,Deli / Bodega,Department Store,Dessert Shop,Discount Store
1,M4A,North York,Victoria Village,43.725882,-79.315572,1.0,Coffee Shop,Hockey Arena,Grocery Store,Portuguese Restaurant,Dim Sum Restaurant,Dance Studio,Deli / Bodega,Department Store,Dessert Shop,Diner
2,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.65426,-79.360636,1.0,Coffee Shop,Bakery,Pub,Park,Breakfast Spot,Restaurant,Café,Theater,Yoga Studio,Cosmetics Shop
3,M6A,North York,"Lawrence Manor , Lawrence Heights",43.718518,-79.464763,1.0,Clothing Store,Furniture / Home Store,Accessories Store,Coffee Shop,Vietnamese Restaurant,Boutique,Miscellaneous Shop,Event Space,Carpet Store,Women's Store
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government",43.662301,-79.389494,1.0,Coffee Shop,Sushi Restaurant,Diner,Yoga Studio,Beer Bar,Japanese Restaurant,Café,Hobby Shop,Bank,Bar


#### Map for Toronto with Clusters

In [32]:
# create map
maptoro_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

colorss=['red', 'blue', 'green', 'purple', 'orange', 'darkred','lightred', 'beige', 'darkblue', 'darkgreen','lightgray']
i=0

markers_colors = []
for lat, lon, poi, cluster in zip(toro_merged['Latitude'], toro_merged['Longitude'], toro_merged['Neighborhood'],toro_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    i=i+1
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=colorss[i%k],
        fill=True,
        fill_color=colorss[i%k],
        fill_opacity=0.7).add_to(maptoro_clusters)
        
       
maptoro_clusters

#### Forming clusters and merging the venue data

In [33]:
k = 5
nyccluster = nyc_grouped.drop('Neighborhood', 1)
kmeans = KMeans(n_clusters=k, random_state=0).fit(nyccluster)
kmeans.labels_[0:10] 
venues_sorted_nyc.insert(0, 'Cluster Labels', kmeans.labels_.astype(int))
df4.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Bronx,Wakefield,40.894705,-73.847201
1,Bronx,Co-op City,40.874294,-73.829939
2,Bronx,Eastchester,40.887556,-73.827806
3,Bronx,Fieldston,40.895437,-73.905643
4,Bronx,Riverdale,40.890834,-73.912585


In [34]:
nyc_merged = df4
nyc_merged = nyc_merged.join(venues_sorted_nyc.set_index('Neighborhood'), on='Neighborhood')
nyc_merged.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Bronx,Wakefield,40.894705,-73.847201,0.0,Pharmacy,Donut Shop,Ice Cream Shop,Dessert Shop,Gas Station,Sandwich Place,Laundromat,Women's Store,Event Service,Event Space
1,Bronx,Co-op City,40.874294,-73.829939,3.0,Baseball Field,Bus Station,Restaurant,Park,Basketball Court,Liquor Store,Bagel Shop,Gift Shop,Salon / Barbershop,Pharmacy
2,Bronx,Eastchester,40.887556,-73.827806,3.0,Caribbean Restaurant,Deli / Bodega,Diner,Bus Station,Intersection,Bowling Alley,Seafood Restaurant,Fast Food Restaurant,Donut Shop,Pizza Place
3,Bronx,Fieldston,40.895437,-73.905643,3.0,Bus Station,River,Plaza,Women's Store,Field,Ethiopian Restaurant,Event Service,Event Space,Exhibit,Eye Doctor
4,Bronx,Riverdale,40.890834,-73.912585,3.0,Bus Station,Park,Bank,Plaza,Gym,Medical Supply Store,Baseball Field,Home Service,Playground,Food Truck


#### Map for New York City with Clusters

In [36]:
# create map
mapnyc_clusters = folium.Map(location=[latitude1, longitude1], zoom_start=11)

colorss=['red', 'blue', 'green', 'purple', 'orange', 'darkred','lightred', 'beige', 'darkblue', 'darkgreen','lightgray']
i=0

markers_colors = []
for lat, lon, poi, cluster in zip(nyc_merged['Latitude'], nyc_merged['Longitude'], nyc_merged['Neighborhood'],nyc_merged['Cluster Labels']):
    i=i+1
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=colorss[i%k],
        fill=True,
        fill_color=colorss[i%k],
        fill_opacity=0.7).add_to(mapnyc_clusters)      


   
mapnyc_clusters

#### The size of the five clusters of Toronto

In [39]:
(toro_merged.loc[toro_merged['Cluster Labels'] == 0,toro_merged.columns[[1] + list(range(5, toro_merged.shape[1]))]]).shape

(11, 12)

In [45]:
(toro_merged.loc[toro_merged['Cluster Labels'] == 1,toro_merged.columns[[1] + list(range(5, toro_merged.shape[1]))]]).shape

(84, 12)

In [46]:
(toro_merged.loc[toro_merged['Cluster Labels'] == 2,toro_merged.columns[[1] + list(range(5, toro_merged.shape[1]))]]).shape

(2, 12)

In [51]:
(toro_merged.loc[toro_merged['Cluster Labels'] == 3,toro_merged.columns[[1] + list(range(5, toro_merged.shape[1]))]]).shape

(1, 12)

In [52]:
(toro_merged.loc[toro_merged['Cluster Labels'] == 4,toro_merged.columns[[1] + list(range(5, toro_merged.shape[1]))]]).shape

(2, 12)

#### The size of the five clusters of New York City

In [53]:
(nyc_merged.loc[nyc_merged['Cluster Labels'] == 0,nyc_merged.columns[[1] + list(range(5, nyc_merged.shape[1]))]]).shape

(93, 11)

In [54]:
(nyc_merged.loc[nyc_merged['Cluster Labels'] == 1,nyc_merged.columns[[1] + list(range(5, nyc_merged.shape[1]))]]).shape

(4, 11)

In [55]:
(nyc_merged.loc[nyc_merged['Cluster Labels'] == 2,nyc_merged.columns[[1] + list(range(5, nyc_merged.shape[1]))]]).shape

(15, 11)

In [56]:
(nyc_merged.loc[nyc_merged['Cluster Labels'] == 3,nyc_merged.columns[[1] + list(range(5, nyc_merged.shape[1]))]]).shape

(67, 11)

In [57]:
(nyc_merged.loc[nyc_merged['Cluster Labels'] == 4,nyc_merged.columns[[1] + list(range(5, nyc_merged.shape[1]))]]).shape

(125, 11)