# Segmenting and Clustering Neighborhoods in Toronto
#### This is a notebook for the project "Segmenting and Clustering Neighborhoods in Toronto" for coursera course: Applied Data Science Capstone

## 1. Generating the DataFrame of Neighborhoods in Toronto

In [2]:
import pandas as pd
import numpy as np
import requests

In [3]:
# the url of the wiki page
url = 'http://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# use pd.read_html to scrap the table in the html page, and use match='Borough' to return the table that has information Borough'
table = pd.read_html(url,match='Borough')


#### The first dataframe in the list is the one we want!

In [4]:
df = table[0]
df.head()

Unnamed: 0,0,1,2
0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


#### Use the first row as the column name

In [4]:
df.columns= df.iloc[0,:]

# use reset_index(drop=True) to reset the index and drop the old index
df.drop(0).reset_index(drop=True).head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [347]:
# count the number of unique postcode
print ('There are {} Postcodes.'.format(df.Postcode.unique().shape[0]))
# count the number of Borough
print ('There are {} Boroughs.'.format(df.Borough.unique().shape[0]))

There are 181 Postcodes
There are 13 Boroughs


#### Create a new dataframe that excludes borough with 'Not assigned'

In [348]:
df_Tor = pd.DataFrame(columns = {'Postcode','Borough','Neighbourhood'})

for post,bor,nb in zip(df['Postcode'],df['Borough'],df['Neighbourhood']):
 
 if post!='Postcode':
 if bor!='Not assigned'and nb=='Not assigned':
 df_Tor = df_Tor.append({'Postcode': post, 'Borough': bor,
 'Neighbourhood': bor}, ignore_index=True)
 elif bor!='Not assigned'and nb!='Not assigned':
 df_Tor = df_Tor.append({'Postcode': post,'Borough': bor,
 'Neighbourhood': nb}, ignore_index=True)

#number of unique postcode after removing 'Not assigned' values
print ('There are {} Postcodes after cleaning unavailabel ones.'.format(df_Tor['Postcode'].unique().shape[0]))

There are 103 Postcodes after cleaning unavailabel ones.


#### Concatenate Neighbourhoods sharing the same postcode


In [278]:
# combine neighbourhood sharing the same postcode
df_new = df_Tor.groupby(['Postcode','Borough'])['Neighbourhood'].apply(lambda x: ','.join(x)).reset_index()
df_new.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


#### The shape of the new dataframe after concatenating


In [279]:
df_new.shape

(103, 3)

In [None]:
''' example code using beautifulsoup
import pandas as pd
import numpy as np
import requests

from bs4 import BeautifulSoup


source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text

soup = BeautifulSoup(source, 'html5lib')

postal_codes_dict = {} # initialize an empty dictionary to save the data in
for table_cell in soup.find_all('td'):
 try:
 postal_code = table_cell.p.b.text # get the postal code
 postal_code_investigate = table_cell.span.text
 neighborhoods_data = table_cell.span.text # get the rest of the data in the cell
 borough = neighborhoods_data.split('(')[0] # get the borough in the cell
 
 # if the cell is not assigned then ignore it
 if neighborhoods_data == 'Not assigned':
 neighborhoods = []
 # else process the data and add it to the dictionary
 else:
 postal_codes_dict[postal_code] = {}
 
 try:
 neighborhoods = neighborhoods_data.split('(')[1]
 
 # remove parantheses from neighborhoods string
 neighborhoods = neighborhoods.replace('(', ' ')
 neighborhoods = neighborhoods.replace(')', ' ')

 neighborhoods_names = neighborhoods.split('/')
 neighborhoods_clean = ', '.join([name.strip() for name in neighborhoods_names])
 except:
 borough = borough.strip('\n')
 neighborhoods_clean = borough
 
 # add borough and neighborhood to dictionary
 postal_codes_dict[postal_code]['borough'] = borough
 postal_codes_dict[postal_code]['neighborhoods'] = neighborhoods_clean
 except:
 pass
 
# create an empty dataframe
columns = ['PostalCode', 'Borough', 'Neighborhood']
toronto_data = pd.DataFrame(columns=columns)
toronto_data

# populate dataframe with data from dictionary
for ind, postal_code in enumerate(postal_codes_dict):
 borough = postal_codes_dict[postal_code]['borough']
 neighborhood = postal_codes_dict[postal_code]['neighborhoods']
 toronto_data = toronto_data.append({"PostalCode": postal_code, 
 "Borough": borough, 
 "Neighborhood": neighborhood},
 ignore_index=True)

# print number of rows of dataframe
toronto_data.shape[0]

'''

## 2. Get the geo-location for each Postcode

In [303]:
import geocoder # import geocoder

# initialize your variable to None
lat_lng_coords = None

postal_code = 'M1C'

for i, postal_code in enumerate(df_new['Postcode']):
 
 g = geocoder.arcgis('{}, Toronto, Ontario'.format(postal_code))
 lat_lng_coords = g.latlng
 # loop until you get the coordinates
 while(lat_lng_coords is None):
 g = geocoder.arcgis('{}, Toronto, Ontario'.format(postal_code))
 lat_lng_coords = g.latlng
 
 latitude = lat_lng_coords[0]
 longitude = lat_lng_coords[1]
 df_new.loc[i,'Latitude']= latitude
 df_new.loc[i,'Longitude']= longitude
 

df_new.head(20)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.811525,-79.195517
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.785665,-79.158725
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.765815,-79.175193
3,M1G,Scarborough,Woburn,43.768369,-79.21759
4,M1H,Scarborough,Cedarbrae,43.769688,-79.23944
5,M1J,Scarborough,Scarborough Village,43.743125,-79.23175
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park",43.726276,-79.263625
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",43.713054,-79.285055
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West",43.724235,-79.227925
9,M1N,Scarborough,"Birch Cliff,Cliffside West",43.69677,-79.259967


## 3. Explore and cluster the neighborhoods in Toronto

#### To visulise the result, import the required libraries

In [27]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!pip install folium
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


You are using pip version 18.1, however version 19.3.1 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


#### Create map of Toronto using latitude and longitude values

The red circles highlight boroughs names with 'Toronto'

In [46]:
g = geocoder.arcgis('Davisville,Toronto, Ontario')
lat_lng_coords = g.latlng
latitude = lat_lng_coords[0]
longitude = lat_lng_coords[1]

# a scale bar is also added to the map
map_tt = folium.Map(location=[latitude, longitude], zoom_start=11,control_scale=True)

# add markers to map
for lat, lng, borough, neighborhood,postcode in zip(df_new['Latitude'], df_new['Longitude'], df_new['Borough'], df_new['Neighbourhood'],df_new['Postcode']):
 label = '{}, <{} {}>'.format(neighborhood, borough,postcode) # added <> to distinguish the boroughs and postcode from neighbourhood
 label = folium.Popup(label, parse_html=True)
 
 if 'Toronto' in borough:
 folium.CircleMarker(
 [lat, lng],
 radius=5,
 popup=label,
 color='red',
 fill=True,
 fill_color='#3186cc',
 fill_opacity=0.7,
 parse_html=False).add_to(map_tt) 
 else:
 folium.CircleMarker(
 [lat, lng],
 radius=5,
 popup=label,
 color='blue',
 fill=True,
 fill_color='#3186cc',
 fill_opacity=0.7,
 parse_html=False).add_to(map_tt) 
 
map_tt

#### Use Foursquare API to explore the neighborhoods and segment them

In [6]:
# The code was removed by Watson Studio for sharing.

Start from a particular postcode

In [120]:
# we start from a particular postcode
postal_code = 'M5B'
g = geocoder.arcgis('{}, Toronto, Ontario'.format(postal_code))
lat_lng_coords = g.latlng
latitude = lat_lng_coords[0]
longitude = lat_lng_coords[1]

# we choose a large radius since some postcode areas are much larger than the other. 
# we can filter the result using the postalcode information from the json file and make sure to collect venues only in that postcode area.
radius = 2000
LIMIT = 100 
url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, latitude, longitude, VERSION, radius, LIMIT)
url

# get the result from foursquare.com
results = requests.get(url).json()
#results


In [97]:
# function that extracts the category of the venue
def get_category_type(row):
 try:
 categories_list = row['categories']
 except:
 categories_list = row['venue.categories']
 
 if len(categories_list) == 0:
 return None
 else:
 return categories_list[0]['name']

In [121]:
venues = results['response']['groups'][0]['items']
 
nearby_venues = json_normalize(venues) # flatten JSON
nearby_venues.head(2)

Unnamed: 0,reasons.count,reasons.items,referralId,venue.categories,venue.id,venue.location.address,venue.location.cc,venue.location.city,venue.location.country,venue.location.crossStreet,venue.location.distance,venue.location.formattedAddress,venue.location.labeledLatLngs,venue.location.lat,venue.location.lng,venue.location.neighborhood,venue.location.postalCode,venue.location.state,venue.name,venue.photos.count,venue.photos.groups,venue.venuePage.id
0,0,"[{'summary': 'This spot is popular', 'type': '...",e-0-56d4d1b3cd1035fe77e1492c-0,"[{'id': '4bf58dd8d48988d16d941735', 'name': 'C...",56d4d1b3cd1035fe77e1492c,106 Mutual St,CA,Toronto,Canada,btwn Dundas & Gould St,175,"[106 Mutual St (btwn Dundas & Gould St), Toron...","[{'label': 'display', 'lat': 43.65777161112601...",43.657772,-79.376073,,M5B 2R7,ON,Page One Cafe,0,[],
1,0,"[{'summary': 'This spot is popular', 'type': '...",e-0-57eda381498ebe0e6ef40972-1,"[{'id': '4bf58dd8d48988d103951735', 'name': 'C...",57eda381498ebe0e6ef40972,220 Yonge St,CA,Toronto,Canada,at Dundas St W,255,"[220 Yonge St (at Dundas St W), Toronto ON M5B...","[{'label': 'display', 'lat': 43.65591027779457...",43.65591,-79.380641,Downtown Toronto,M5B 2H1,ON,UNIQLO ユニクロ,0,[],


In [122]:
# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng', 'venue.location.postalCode']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns, use the last part of the filtered_columns name as the column name of the new dataframe
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

# add the searching post code
nearby_venues['Postcode'] = postal_code

nearby_venues.head()
#nearby_venues.shape()


Unnamed: 0,name,categories,lat,lng,postalCode,Postcode
0,Page One Cafe,Café,43.657772,-79.376073,M5B 2R7,M5B
1,UNIQLO ユニクロ,Clothing Store,43.65591,-79.380641,M5B 2H1,M5B
2,Blaze Pizza,Pizza Place,43.656518,-79.380015,M5B 2G9,M5B
3,Silver Snail Comics,Comic Shop,43.657031,-79.381403,M5B 1R7,M5B
4,The Grand Hotel & Suites Toronto,Hotel,43.656449,-79.37411,M5B 2C1,M5B


In [107]:
nearby_venues.dtypes

name object
categories object
lat float64
lng float64
postalCode object
Postcode object
dtype: object

In [349]:

# filter the result where the returned 'postalCode' does not match the searched 'Postcode'

for i, pc_r, pc_s in zip(nearby_venues.index,nearby_venues.postalCode, nearby_venues.Postcode):
 if str(pc_s) not in str(pc_r): 
 #print(nearby_venues.iloc[i,:])
 nearby_venues.drop([i],inplace=True)

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng,postalCode,Postcode
0,Page One Cafe,Café,43.657772,-79.376073,M5B 2R7,M5B
1,UNIQLO ユニクロ,Clothing Store,43.65591,-79.380641,M5B 2H1,M5B
2,Blaze Pizza,Pizza Place,43.656518,-79.380015,M5B 2G9,M5B
3,Silver Snail Comics,Comic Shop,43.657031,-79.381403,M5B 1R7,M5B
4,The Grand Hotel & Suites Toronto,Hotel,43.656449,-79.37411,M5B 2C1,M5B


Sort the venues according to the frequency

In [136]:
nearby_venues['categories'].value_counts()

Ramen Restaurant 2
Theater 2
Pizza Place 1
Café 1
Bookstore 1
Clothing Store 1
Hotel 1
Historic Site 1
Plaza 1
Japanese Restaurant 1
Comic Shop 1
Supermarket 1
Cosmetics Shop 1
Fast Food Restaurant 1
Shopping Mall 1
Name: categories, dtype: int64

 ### Now combine all together

In [264]:
# define a function that do all the above operations

def nearby_venues_postcode(postcodes,latitudes,longitudes):
 
 nearby_venues_postcode = pd.DataFrame()
 
 for pc, lat, lng in zip(postcodes, latitudes, longitudes):
 
 radius = 2000
 LIMIT = 100
 
 url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, lat, lng, VERSION, radius, LIMIT)
 # get the result from foursquare.com
 results = requests.get(url).json()

 # convert the result to dataframe
 venues = results['response']['groups'][0]['items']
 nearby_venues = json_normalize(venues) # flatten JSON
 
 # filter columns
 filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng', 'venue.location.postalCode']
 nearby_venues =nearby_venues.loc[:, filtered_columns]

 # filter the category for each row
 nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

 # clean columns, use the last part of the filtered_columns name as the column name of the new dataframe
 nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

 # add the searching post code
 nearby_venues['Postcode'] = pc
 
 
 # The loop below filter the result where the returned 'postalCode' in the jason file does not match the searched 'Postcode', this is 
 # to avoid collecting venues that do not belong to the searched postcode area.
 for i, pc_r, pc_s in zip(nearby_venues.index,nearby_venues.postalCode, nearby_venues.Postcode):
 if str(pc_s) not in str(pc_r): 
 #print(nearby_venues.iloc[i,:])
 nearby_venues.drop([i],inplace=True)
 
 #print(pc)
 
 if len(nearby_venues) == 0:
 print (pc, 'no venues matched')
 nearby_venues_postcode = nearby_venues_postcode.append(nearby_venues, ignore_index=True)
 #print('nearby_venues_postcode.shape', nearby_venues_postcode.shape)

 return nearby_venues_postcode


In [265]:
# get the venues in all postcode area
nearby_venues_postcodes = nearby_venues_postcode(df_new['Postcode'],df_new['Latitude'], df_new['Longitude'])

M1X no venues matched
M5W no venues matched
M7A no venues matched
M7R no venues matched
M7Y no venues matched


##### The above 5 postcode areas return no matched venues, we will ignor them in the following evaluation

In [350]:
# reconfirm the dimension of the dataFrame after further clearning

print(len(nearby_venues_postcodes['Postcode'].unique()))

98


In [266]:
nearby_venues_postcodes.head()

Unnamed: 0,name,categories,lat,lng,postalCode,Postcode
0,Toronto Zoo,Zoo,43.820582,-79.181551,M1B 5K7,M1B
1,Images Salon & Spa,Spa,43.802283,-79.198565,M1B 3W3,M1B
2,BeaverTails,Dessert Shop,43.823376,-79.184616,M1B 5K7,M1B
3,LCBO,Liquor Store,43.796671,-79.204586,M1B 3C3,M1B
4,Petro-Canada,Gas Station,43.807831,-79.171431,M1B 5R9,M1B


In [198]:
# define a new dataframe that only keeps the postcode, name and catogories information, that's all we need for the clustering

df_venues_postcodes = nearby_venues_postcodes[['Postcode','name','categories']]
df_venues_postcodes.head()

Unnamed: 0,Postcode,name,categories
0,M1B,Toronto Zoo,Zoo
1,M1B,Images Salon & Spa,Spa
2,M1B,BeaverTails,Dessert Shop
3,M1B,LCBO,Liquor Store
4,M1B,Petro-Canada,Gas Station


In [205]:
# number of unique catogories of venues
print ('There are {} unique categories'.format(df_venues_postcodes['categories'].unique().shape[0]))

There are 262 unique categories


In [207]:
# use one-hot function to get the counts for each categories 
onehot_venues_postcodes = df_venues_postcodes['Postcode']

df_onehot = pd.get_dummies(df_venues_postcodes[['categories']], prefix="", prefix_sep="")

onehot_venues_postcodes = pd.concat([onehot_venues_postcodes,df_onehot], axis=1,)

onehot_venues_postcodes.head()

Unnamed: 0,Postcode,Afghan Restaurant,African Restaurant,American Restaurant,Arcade,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Auto Dealership,Automotive Shop,BBQ Joint,Baby Store,Bagel Shop,Bakery,Bank,Bar,Baseball Field,Baseball Stadium,Basketball Court,Basketball Stadium,Beach,Beer Bar,Beer Store,Big Box Store,Bike Shop,Bistro,Boat or Ferry,Bookstore,Botanical Garden,Boutique,Bowling Alley,Brazilian Restaurant,Breakfast Spot,Brewery,Bridal Shop,Bubble Tea Shop,Building,Burger Joint,Burrito Place,Business Service,Butcher,Café,Candy Store,Cantonese Restaurant,Caribbean Restaurant,Castle,Chinese Restaurant,Chocolate Shop,Church,Churrascaria,Climbing Gym,Clothing Store,Cocktail Bar,Coffee Shop,College Quad,Comedy Club,Comfort Food Restaurant,Comic Shop,Concert Hall,Convenience Store,Cosmetics Shop,Creperie,Cuban Restaurant,Cupcake Shop,Curling Ice,Dance Studio,Deli / Bodega,Dentist's Office,Department Store,Design Studio,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dive Bar,Dog Run,Doner Restaurant,Donut Shop,Dumpling Restaurant,Eastern European Restaurant,Egyptian Restaurant,Electronics Store,Ethiopian Restaurant,Event Space,Falafel Restaurant,Farmers Market,Fast Food Restaurant,Filipino Restaurant,Fish & Chips Shop,Fish Market,Flea Market,Flower Shop,Food,Food & Drink Shop,Food Court,Food Truck,French Restaurant,Fried Chicken Joint,Frozen Yogurt Shop,Fruit & Vegetable Store,Furniture / Home Store,Gaming Cafe,Garden,Gas Station,Gastropub,Gay Bar,General Entertainment,Gift Shop,Go Kart Track,Golf Course,Gourmet Shop,Greek Restaurant,Grocery Store,Gym,Gym / Fitness Center,Gym Pool,Gymnastics Gym,Hakka Restaurant,Halal Restaurant,Hardware Store,Hawaiian Restaurant,Health Food Store,Historic Site,Hobby Shop,Hockey Arena,Hong Kong Restaurant,Hostel,Hotel,Hotel Bar,Hotpot Restaurant,Ice Cream Shop,Indian Restaurant,Indie Movie Theater,Indonesian Restaurant,Intersection,Irish Pub,Italian Restaurant,Japanese Restaurant,Jewelry Store,Jewish Restaurant,Juice Bar,Kitchen Supply Store,Korean Restaurant,Laser Tag,Latin American Restaurant,Leather Goods Store,Lingerie Store,Liquor Store,Lounge,Malay Restaurant,Market,Massage Studio,Medical Center,Mediterranean Restaurant,Men's Store,Metro Station,Mexican Restaurant,Middle Eastern Restaurant,Mobile Phone Shop,Monument / Landmark,Movie Theater,Museum,Music School,Music Store,Music Venue,Nail Salon,New American Restaurant,Nightclub,Noodle House,Optical Shop,Organic Grocery,Outdoor Supply Store,Pakistani Restaurant,Paper / Office Supplies Store,Park,Pastry Shop,Performing Arts Venue,Persian Restaurant,Peruvian Restaurant,Pet Service,Pet Store,Pharmacy,Photography Studio,Pide Place,Pier,Pizza Place,Playground,Plaza,Poke Place,Pool,Pool Hall,Portuguese Restaurant,Pub,Racecourse,Racetrack,Ramen Restaurant,Record Shop,Recreation Center,Restaurant,Rock Climbing Spot,Rock Club,Salad Place,Salon / Barbershop,Sandwich Place,Scandinavian Restaurant,Scenic Lookout,School,Science Museum,Seafood Restaurant,Shoe Store,Shopping Mall,Skating Rink,Smoke Shop,Smoothie Shop,Snack Place,Soccer Field,Soccer Stadium,Soup Place,South American Restaurant,Spa,Spanish Restaurant,Speakeasy,Sporting Goods Shop,Sports Bar,Sports Club,Sri Lankan Restaurant,Stables,Stationery Store,Steakhouse,Supermarket,Supplement Shop,Sushi Restaurant,Szechuan Restaurant,Taco Place,Taiwanese Restaurant,Tapas Restaurant,Tattoo Parlor,Tea Room,Tennis Stadium,Thai Restaurant,Theater,Theme Park,Theme Park Ride / Attraction,Theme Restaurant,Thrift / Vintage Store,Tibetan Restaurant,Toy / Game Store,Train Station,Turkish Restaurant,Udon Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wings Joint,Women's Store,Yoga Studio,Zoo,Zoo Exhibit
0,M1B,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1,M1B,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,M1B,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,M1B,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,M1B,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


#### Calculate the frequencies of categories

In the lab example, the frequencies are calculated based on the mean value of each postcode area, i.e. 
 frequency for a particular category = number of this category / total number of all categories in this postcode area.

Another way to estimate a category in a particular postcode area is to evaluate their frequencies over the total number of the whole Toronto. i.e. 
frequency for a particular category = number of this category / total number of this category in the Toronto area.


We will use the first method 

In [351]:
venues_postcodes_freq = onehot_venues_postcodes.groupby('Postcode').mean().reset_index()
venues_postcodes_freq.head()


Unnamed: 0,Postcode,Afghan Restaurant,African Restaurant,American Restaurant,Arcade,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Auto Dealership,Automotive Shop,BBQ Joint,Baby Store,Bagel Shop,Bakery,Bank,Bar,Baseball Field,Baseball Stadium,Basketball Court,Basketball Stadium,Beach,Beer Bar,Beer Store,Big Box Store,Bike Shop,Bistro,Boat or Ferry,Bookstore,Botanical Garden,Boutique,Bowling Alley,Brazilian Restaurant,Breakfast Spot,Brewery,Bridal Shop,Bubble Tea Shop,Building,Burger Joint,Burrito Place,Business Service,Butcher,Café,Candy Store,Cantonese Restaurant,Caribbean Restaurant,Castle,Chinese Restaurant,Chocolate Shop,Church,Churrascaria,Climbing Gym,Clothing Store,Cocktail Bar,Coffee Shop,College Quad,Comedy Club,Comfort Food Restaurant,Comic Shop,Concert Hall,Convenience Store,Cosmetics Shop,Creperie,Cuban Restaurant,Cupcake Shop,Curling Ice,Dance Studio,Deli / Bodega,Dentist's Office,Department Store,Design Studio,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dive Bar,Dog Run,Doner Restaurant,Donut Shop,Dumpling Restaurant,Eastern European Restaurant,Egyptian Restaurant,Electronics Store,Ethiopian Restaurant,Event Space,Falafel Restaurant,Farmers Market,Fast Food Restaurant,Filipino Restaurant,Fish & Chips Shop,Fish Market,Flea Market,Flower Shop,Food,Food & Drink Shop,Food Court,Food Truck,French Restaurant,Fried Chicken Joint,Frozen Yogurt Shop,Fruit & Vegetable Store,Furniture / Home Store,Gaming Cafe,Garden,Gas Station,Gastropub,Gay Bar,General Entertainment,Gift Shop,Go Kart Track,Golf Course,Gourmet Shop,Greek Restaurant,Grocery Store,Gym,Gym / Fitness Center,Gym Pool,Gymnastics Gym,Hakka Restaurant,Halal Restaurant,Hardware Store,Hawaiian Restaurant,Health Food Store,Historic Site,Hobby Shop,Hockey Arena,Hong Kong Restaurant,Hostel,Hotel,Hotel Bar,Hotpot Restaurant,Ice Cream Shop,Indian Restaurant,Indie Movie Theater,Indonesian Restaurant,Intersection,Irish Pub,Italian Restaurant,Japanese Restaurant,Jewelry Store,Jewish Restaurant,Juice Bar,Kitchen Supply Store,Korean Restaurant,Laser Tag,Latin American Restaurant,Leather Goods Store,Lingerie Store,Liquor Store,Lounge,Malay Restaurant,Market,Massage Studio,Medical Center,Mediterranean Restaurant,Men's Store,Metro Station,Mexican Restaurant,Middle Eastern Restaurant,Mobile Phone Shop,Monument / Landmark,Movie Theater,Museum,Music School,Music Store,Music Venue,Nail Salon,New American Restaurant,Nightclub,Noodle House,Optical Shop,Organic Grocery,Outdoor Supply Store,Pakistani Restaurant,Paper / Office Supplies Store,Park,Pastry Shop,Performing Arts Venue,Persian Restaurant,Peruvian Restaurant,Pet Service,Pet Store,Pharmacy,Photography Studio,Pide Place,Pier,Pizza Place,Playground,Plaza,Poke Place,Pool,Pool Hall,Portuguese Restaurant,Pub,Racecourse,Racetrack,Ramen Restaurant,Record Shop,Recreation Center,Restaurant,Rock Climbing Spot,Rock Club,Salad Place,Salon / Barbershop,Sandwich Place,Scandinavian Restaurant,Scenic Lookout,School,Science Museum,Seafood Restaurant,Shoe Store,Shopping Mall,Skating Rink,Smoke Shop,Smoothie Shop,Snack Place,Soccer Field,Soccer Stadium,Soup Place,South American Restaurant,Spa,Spanish Restaurant,Speakeasy,Sporting Goods Shop,Sports Bar,Sports Club,Sri Lankan Restaurant,Stables,Stationery Store,Steakhouse,Supermarket,Supplement Shop,Sushi Restaurant,Szechuan Restaurant,Taco Place,Taiwanese Restaurant,Tapas Restaurant,Tattoo Parlor,Tea Room,Tennis Stadium,Thai Restaurant,Theater,Theme Park,Theme Park Ride / Attraction,Theme Restaurant,Thrift / Vintage Store,Tibetan Restaurant,Toy / Game Store,Train Station,Turkish Restaurant,Udon Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wings Joint,Women's Store,Yoga Studio,Zoo,Zoo Exhibit
0,M1B,0.0,0.0,0.052632,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052632,0.0,0.052632,0.0,0.0,0.0,0.0,0.0,0.0,0.052632,0.0,0.0,0.0,0.0,0.0,0.0,0.052632,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052632,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.105263,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052632,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052632,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052632,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052632,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.105263,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052632,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052632,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052632,0.0,0.052632,0.052632
1,M1C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.071429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.071429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.071429,0.0,0.0,0.0,0.0,0.0,0.0,0.071429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.071429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.071429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.071429,0.071429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.071429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.071429,0.071429,0.0,0.0,0.0,0.071429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.071429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.071429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,M1E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.041667,0.0,0.041667,0.0,0.0,0.0,0.0,0.041667,0.041667,0.0,0.0,0.0,0.0,0.0,0.0,0.041667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.041667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.041667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.041667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.041667,0.0,0.041667,0.0,0.041667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.041667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.041667,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.041667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.041667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.041667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M1G,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M1H,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.026316,0.0,0.0,0.0,0.0,0.0,0.0,0.052632,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.026316,0.026316,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.026316,0.0,0.0,0.0,0.0,0.0,0.0,0.078947,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.026316,0.0,0.0,0.0,0.0,0.0,0.0,0.026316,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.105263,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.026316,0.0,0.0,0.0,0.0,0.0,0.052632,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.026316,0.026316,0.0,0.0,0.0,0.026316,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.026316,0.0,0.0,0.0,0.026316,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.026316,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.026316,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.026316,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052632,0.0,0.0,0.0,0.052632,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.026316,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.026316,0.0,0.0,0.026316,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.026316,0.0,0.0,0.0,0.0,0.0,0.0,0.026316,0.0,0.0,0.0,0.0,0.0,0.0,0.026316,0.0,0.0,0.0,0.026316,0.0,0.026316,0.0,0.0


In [213]:
# define a function to find the most common venues in each postcode area

def return_most_common_venues(row, num_top_venues):
 row_categories = row.iloc[1:]
 row_categories_sorted = row_categories.sort_values(ascending=False)
 
 return row_categories_sorted.index.values[0:num_top_venues]


In [215]:

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Postcode']
for ind in np.arange(num_top_venues):
 try:
 columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
 except:
 columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
venues_postcodes_sorted = pd.DataFrame(columns=columns)
venues_postcodes_sorted['Postcode'] = venues_postcodes_freq['Postcode']

for ind in np.arange(venues_postcodes_freq.shape[0]):
 venues_postcodes_sorted.iloc[ind, 1:] = return_most_common_venues(venues_postcodes_freq.iloc[ind, :], num_top_venues)

venues_postcodes_sorted.head()

Unnamed: 0,Postcode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Fast Food Restaurant,Pizza Place,Zoo Exhibit,Grocery Store,Spa,Caribbean Restaurant,Gas Station,Chinese Restaurant,Paper / Office Supplies Store,Liquor Store
1,M1C,Coffee Shop,Italian Restaurant,Liquor Store,Grocery Store,Fast Food Restaurant,Bank,Sandwich Place,Pet Store,Japanese Restaurant,Pharmacy
2,M1E,Pizza Place,Coffee Shop,Fast Food Restaurant,Sandwich Place,Breakfast Spot,Liquor Store,Beer Store,Supermarket,Grocery Store,Gym / Fitness Center
3,M1G,Coffee Shop,Sandwich Place,Supermarket,Pizza Place,Fish & Chips Shop,Event Space,Falafel Restaurant,Farmers Market,Fast Food Restaurant,Filipino Restaurant
4,M1H,Fast Food Restaurant,Coffee Shop,Pizza Place,Pharmacy,Bank,Gas Station,Discount Store,Thai Restaurant,Fried Chicken Joint,Paper / Office Supplies Store


### Cluster the postcodes using k-mean clustering

In [230]:
# import k-means from clustering stage
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 5

Toronto_grouped_clustering = venues_postcodes_freq.drop('Postcode', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(Toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([1, 1, 1, 4, 1, 1, 1, 1, 1, 1])

In [342]:
# add the cluster information and geographic location together
df_cluster = pd.DataFrame({'Postcode':venues_postcodes_freq['Postcode'],'Cluster':kmeans.labels_})
df_new2 = pd.DataFrame(columns = df_new.columns)

for ind, pc in enumerate(df_new['Postcode']):
 
 if pc in venues_postcodes_freq['Postcode'].values:
 #print(ind,pc)
 df_new2 = df_new2.append(df_new.iloc[ind,:]) 

#df_new2.shape
#df_cluster.shape
df_new2 = df_new2.reset_index()
df_new2
df_cluster_postcode = pd.concat([df_new2,df_cluster.iloc[:,1]],axis = 1)

df_cluster_postcode.head()

Unnamed: 0,index,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster
0,0,M1B,Scarborough,"Rouge,Malvern",43.811525,-79.195517,1
1,1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.785665,-79.158725,1
2,2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.765815,-79.175193,1
3,3,M1G,Scarborough,Woburn,43.768369,-79.21759,4
4,4,M1H,Scarborough,Cedarbrae,43.769688,-79.23944,1


Visualize the clustering

In [322]:
# create map
g = geocoder.arcgis('Davisville,Toronto, Ontario')
lat_lng_coords = g.latlng
latitude = lat_lng_coords[0]
longitude = lat_lng_coords[1]

map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11,control_scale=True)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
#print(ys)
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
#markers_colors = []
for lat, lng, br,pc, nb, cl in zip(df_cluster_postcode['Latitude'], df_cluster_postcode['Longitude'], df_cluster_postcode['Borough'], 
 df_cluster_postcode['Postcode'], df_cluster_postcode['Neighbourhood'], df_cluster_postcode['Cluster']):
 label = folium.Popup('{}, <{} {}>,Cluster {}'.format(nb, br,pc,cl), parse_html=True)
 folium.CircleMarker(
 [lat, lng],
 radius=5,
 popup=label,
 color=rainbow[cl-1],
 fill=True,
 fill_color=rainbow[cl-1],
 fill_opacity=0.7).add_to(map_clusters)
 
map_clusters



#### Cluster A

In [334]:
# show the features of different clusters
index_cl = df_cluster_postcode['Cluster'] == 0
venues_postcodes_sorted.loc[index_cl,:].head(5)

Unnamed: 0,Postcode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
12,M1S,Chinese Restaurant,Coffee Shop,Bakery,Breakfast Spot,Noodle House,Bookstore,Korean Restaurant,Gas Station,Supermarket,Gym / Fitness Center
18,M2K,Coffee Shop,Bank,Café,Clothing Store,Gas Station,Metro Station,Chinese Restaurant,Fast Food Restaurant,Sporting Goods Shop,Fish Market
21,M2N,Coffee Shop,Korean Restaurant,Grocery Store,Pharmacy,Sushi Restaurant,Pizza Place,Japanese Restaurant,Burger Joint,Restaurant,Ramen Restaurant
25,M3B,Japanese Restaurant,Coffee Shop,Pizza Place,Bank,Asian Restaurant,Supermarket,Sandwich Place,Restaurant,Salad Place,Fried Chicken Joint
26,M3C,Coffee Shop,Japanese Restaurant,Pharmacy,Gym,American Restaurant,Supermarket,Sandwich Place,Fast Food Restaurant,Beer Store,Chocolate Shop


#### Cluster B

In [336]:
# show the features of different clusters
index_cl = df_cluster_postcode['Cluster'] == 1
venues_postcodes_sorted.loc[index_cl,:].head(5)

Unnamed: 0,Postcode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Fast Food Restaurant,Pizza Place,Zoo Exhibit,Grocery Store,Spa,Caribbean Restaurant,Gas Station,Chinese Restaurant,Paper / Office Supplies Store,Liquor Store
1,M1C,Coffee Shop,Italian Restaurant,Liquor Store,Grocery Store,Fast Food Restaurant,Bank,Sandwich Place,Pet Store,Japanese Restaurant,Pharmacy
2,M1E,Pizza Place,Coffee Shop,Fast Food Restaurant,Sandwich Place,Breakfast Spot,Liquor Store,Beer Store,Supermarket,Grocery Store,Gym / Fitness Center
4,M1H,Fast Food Restaurant,Coffee Shop,Pizza Place,Pharmacy,Bank,Gas Station,Discount Store,Thai Restaurant,Fried Chicken Joint,Paper / Office Supplies Store
5,M1J,Sandwich Place,Pharmacy,Coffee Shop,Fast Food Restaurant,Liquor Store,Bowling Alley,Golf Course,Grocery Store,Bookstore,Bank


#### Cluster C

In [338]:
# show the features of different clusters
index_cl = df_cluster_postcode['Cluster'] ==2
venues_postcodes_sorted.loc[index_cl,:].head(5)

Unnamed: 0,Postcode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
60,M5L,Gym,Pub,Fish & Chips Shop,Ethiopian Restaurant,Event Space,Falafel Restaurant,Farmers Market,Fast Food Restaurant,Filipino Restaurant,Fish Market


#### Cluster D

In [339]:
# show the features of different clusters
index_cl = df_cluster_postcode['Cluster'] == 3
venues_postcodes_sorted.loc[index_cl,:].head(5)

Unnamed: 0,Postcode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
59,M5K,Restaurant,Zoo Exhibit,Fish & Chips Shop,Ethiopian Restaurant,Event Space,Falafel Restaurant,Farmers Market,Fast Food Restaurant,Filipino Restaurant,Fish Market


#### Cluster E

In [341]:
# show the features of different clusters
index_cl = df_cluster_postcode['Cluster'] == 4
venues_postcodes_sorted.loc[index_cl,:].head(5)

Unnamed: 0,Postcode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
3,M1G,Coffee Shop,Sandwich Place,Supermarket,Pizza Place,Fish & Chips Shop,Event Space,Falafel Restaurant,Farmers Market,Fast Food Restaurant,Filipino Restaurant
20,M2M,Coffee Shop,Korean Restaurant,Café,Hardware Store,Sandwich Place,Bank,Sushi Restaurant,Juice Bar,Pharmacy,Discount Store
22,M2P,Coffee Shop,French Restaurant,Bank,Restaurant,Sandwich Place,Gym,Deli / Bodega,Historic Site,Food,Flea Market
52,M5A,Coffee Shop,Café,Pub,Bakery,Performing Arts Venue,Breakfast Spot,Mexican Restaurant,Bistro,Furniture / Home Store,French Restaurant
56,M5G,Coffee Shop,Breakfast Spot,Gastropub,Sushi Restaurant,Falafel Restaurant,Farmers Market,Fast Food Restaurant,Filipino Restaurant,Fish & Chips Shop,Fish Market
