# Project

## Phase1 download data and to dataframe

### Step1. use bs4 function to download the table data

In [1]:
# -*- coding: utf-8 -*-
import os,sys
import urllib
import requests 
from urllib.request import urlopen
from bs4 import BeautifulSoup

URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
page = urlopen(URL)
soup = BeautifulSoup(page)
page.close()
 
fp = open("data.csv","w")
tables = soup.findAll('table')
tab = tables[0]
for tr in tab.tbody.findAll('tr'):
    #print(tr.findAll('th'))
    for th in tr.findAll('th'):
        text = th.getText().strip()+','
        fp.write(text)
    for td in tr.findAll('td'):
        text = td.getText().strip()+','
        fp.write(text)
    fp.write('\n')
fp.close()

### Step2. load data to dataframe

In [2]:
import pandas as pd
dfs = pd.read_csv('data.csv')
dfs.drop('Unnamed: 3',axis=1,inplace = True)
dfs.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [3]:
dfs.head()
dfs.shape

(289, 3)

### Step3. remove the rows which "Not assigned" existed in the "Borough" column

In [4]:
dfs1 = dfs[ ~ dfs['Borough'].str.contains('Not assigned')]
dfs1.shape

(212, 3)

In [5]:
dfs1.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


### Step4. combine the "Neighbourhood"'s values by grouping the "Postcode" & "Borough"

In [6]:
grouped = dfs1.groupby(['Postcode','Borough'],as_index=False)
dfs2 = pd.DataFrame(grouped.sum())
dfs2.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,RougeMalvern
1,M1C,Scarborough,Highland CreekRouge HillPort Union
2,M1E,Scarborough,GuildwoodMorningsideWest Hill
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [7]:
dfs2.shape

(103, 3)

### Step5. replace the "Not assigned" in 'Neighbourhood' column with 'Borough' column value.

In [8]:
for i in range(len(dfs2)):
    #if dfs2.iloc[i,2] = "Not assigned"
    line_data=dfs2.iloc[i,:]
    if line_data['Neighbourhood'] == 'Not assigned':
        line_data['Neighbourhood'] = line_data['Borough']

In [9]:
dfs2.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,RougeMalvern
1,M1C,Scarborough,Highland CreekRouge HillPort Union
2,M1E,Scarborough,GuildwoodMorningsideWest Hill
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [10]:
dfs2.shape

(103, 3)

## Phase2 Segmenting and Clustering Neighborhoods in Toronto City

### using Geospatial_Coordinates.csv to trace dataframe

In [11]:
geo_data = pd.read_csv("Geospatial_Coordinates.csv")
geo_data.rename(columns={'Postal Code':'Postcode'}, inplace = True)
geo_data.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Merge dfs2 and geo_data by column "Postcode"

In [12]:
dfs3 = pd.merge(dfs2, geo_data, on='Postcode')
dfs3.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,RougeMalvern,43.806686,-79.194353
1,M1C,Scarborough,Highland CreekRouge HillPort Union,43.784535,-79.160497
2,M1E,Scarborough,GuildwoodMorningsideWest Hill,43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [13]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(dfs3['Borough'].unique()),
        dfs3.shape[0]
    )
)

The dataframe has 11 boroughs and 103 neighborhoods.


### use google to find the latitude & longitude of Toronto of Canada

In [14]:
from geopy.geocoders import Nominatim
address = 'Toronto, CA'

#geolocator = Nominatim()
#location = geolocator.geocode(address)
#latitude = location.latitude
#longitude = location.longitude
latitude=43.653963
longitude=-79.387207
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


### import all the function 

In [15]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


## create the map

In [53]:
# create map of Toronto using latitude and longitude values
map_Toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(dfs3['Latitude'], dfs3['Longitude'], dfs3['Borough'], dfs3['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_Toronto)  
    
map_Toronto

In [17]:
Scarborough_data = dfs3[dfs3['Borough'] == 'Scarborough'].reset_index(drop=True)
Scarborough_data

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,RougeMalvern,43.806686,-79.194353
1,M1C,Scarborough,Highland CreekRouge HillPort Union,43.784535,-79.160497
2,M1E,Scarborough,GuildwoodMorningsideWest Hill,43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,East Birchmount ParkIonviewKennedy Park,43.727929,-79.262029
7,M1L,Scarborough,ClairleaGolden MileOakridge,43.711112,-79.284577
8,M1M,Scarborough,CliffcrestCliffsideScarborough Village West,43.716316,-79.239476
9,M1N,Scarborough,Birch CliffCliffside West,43.692657,-79.264848


## we choose "Scarborough" to do the practice

## get the latitude & longitude from google.

In [18]:
address = 'Scarborough, CA'

#geolocator = Nominatim()
#location = geolocator.geocode(address)
#latitude = location.latitude
#longitude = location.longitude
latitude = 43.773077
longitude = -79.257774
print('The geograpical coordinate of Scarborough are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Scarborough are 43.773077, -79.257774.


## create the map of "Scarborough"

In [19]:
# create map of Scarborough using latitude and longitude values
map_Scarborough = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(Scarborough_data['Latitude'], Scarborough_data['Longitude'], Scarborough_data['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_Scarborough)  
    
map_Scarborough

## input my foursquare ID

In [20]:
CLIENT_ID = '3CYZT4F5XGUGM1JY1XP0B4XK0XTA2BPMXTO3QROSRMJSSPIF' # your Foursquare ID
CLIENT_SECRET = 'C24OJ1Z0EEKAVUR005PSUFIFEGJKEKTSACUZRSPZDOI3LRWJ' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 3CYZT4F5XGUGM1JY1XP0B4XK0XTA2BPMXTO3QROSRMJSSPIF
CLIENT_SECRET:C24OJ1Z0EEKAVUR005PSUFIFEGJKEKTSACUZRSPZDOI3LRWJ


## find the index = 0 value to analysis

In [21]:
Scarborough_data.loc[0, 'Neighbourhood']

'RougeMalvern'

In [22]:
Scarborough_data.shape

(17, 5)

## find the Latitude and longitude values of RougeMalvern

In [23]:
neighborhood_latitude = Scarborough_data.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = Scarborough_data.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = Scarborough_data.loc[0, 'Neighbourhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of RougeMalvern are 43.806686299999996, -79.19435340000001.


## trace the url link to fetch the data from foursquare

In [24]:
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?&client_id=3CYZT4F5XGUGM1JY1XP0B4XK0XTA2BPMXTO3QROSRMJSSPIF&client_secret=C24OJ1Z0EEKAVUR005PSUFIFEGJKEKTSACUZRSPZDOI3LRWJ&v=20180605&ll=43.806686299999996,-79.19435340000001&radius=500&limit=100'

## get the json from the url

In [25]:
s = requests.get(url)
results = s.json()

In [26]:
results

{'meta': {'code': 200, 'requestId': '5bed98619fb6b711a328a059'},
 'response': {'groups': [{'items': [{'reasons': {'count': 0,
       'items': [{'reasonName': 'globalInteractionReason',
         'summary': 'This spot is popular',
         'type': 'general'}]},
      'referralId': 'e-0-4bb6b9446edc76b0d771311c-0',
      'venue': {'categories': [{'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/food/fastfood_',
          'suffix': '.png'},
         'id': '4bf58dd8d48988d16e941735',
         'name': 'Fast Food Restaurant',
         'pluralName': 'Fast Food Restaurants',
         'primary': True,
         'shortName': 'Fast Food'}],
       'id': '4bb6b9446edc76b0d771311c',
       'location': {'cc': 'CA',
        'city': 'Toronto',
        'country': 'Canada',
        'crossStreet': 'Morningside & Sheppard',
        'distance': 387,
        'formattedAddress': ['Toronto ON', 'Canada'],
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.80744841934756,
          'ln

## make a function to get the category's type

In [27]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

## to check how many venues from the url_json

In [28]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Wendy's,Fast Food Restaurant,43.807448,-79.199056


In [29]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

1 venues were returned by Foursquare.


## prepare a function to find the nearby venues

In [30]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [31]:
Scarborough_venues = getNearbyVenues(names=Scarborough_data['Neighbourhood'],
                                   latitudes=Scarborough_data['Latitude'],
                                   longitudes=Scarborough_data['Longitude']
                                  )


RougeMalvern
Highland CreekRouge HillPort Union
GuildwoodMorningsideWest Hill
Woburn
Cedarbrae
Scarborough Village
East Birchmount ParkIonviewKennedy Park
ClairleaGolden MileOakridge
CliffcrestCliffsideScarborough Village West
Birch CliffCliffside West
Dorset ParkScarborough Town CentreWexford Heights
MaryvaleWexford
Agincourt
Clarks CornersSullivanTam O'Shanter
Agincourt NorthL'Amoreaux EastMillikenSteeles East
L'Amoreaux WestSteeles West
Upper Rouge


In [56]:
Scarborough_venues

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,RougeMalvern,43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
1,Highland CreekRouge HillPort Union,43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
2,Highland CreekRouge HillPort Union,43.784535,-79.160497,Affordable Toronto Movers,43.787919,-79.162977,Moving Target
3,GuildwoodMorningsideWest Hill,43.763573,-79.188711,Swiss Chalet Rotisserie & Grill,43.767697,-79.189914,Pizza Place
4,GuildwoodMorningsideWest Hill,43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store
5,GuildwoodMorningsideWest Hill,43.763573,-79.188711,Big Bite Burrito,43.766299,-79.19072,Mexican Restaurant
6,GuildwoodMorningsideWest Hill,43.763573,-79.188711,Enterprise Rent-A-Car,43.764042,-79.193371,Rental Car Location
7,GuildwoodMorningsideWest Hill,43.763573,-79.188711,Woburn Medical Centre,43.766631,-79.192286,Medical Center
8,GuildwoodMorningsideWest Hill,43.763573,-79.188711,Eggsmart,43.7678,-79.190466,Breakfast Spot
9,Woburn,43.770992,-79.216917,Starbucks,43.770037,-79.221156,Coffee Shop


## you can't find venues in "Upper Rouge", so we need to remove this row from Scarborough_data

In [33]:
Scarborough_data.tail()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
12,M1S,Scarborough,Agincourt,43.7942,-79.262029
13,M1T,Scarborough,Clarks CornersSullivanTam O'Shanter,43.781638,-79.304302
14,M1V,Scarborough,Agincourt NorthL'Amoreaux EastMillikenSteeles ...,43.815252,-79.284577
15,M1W,Scarborough,L'Amoreaux WestSteeles West,43.799525,-79.318389
16,M1X,Scarborough,Upper Rouge,43.836125,-79.205636


In [34]:
Scarborough_data.drop(index=16,axis=0,inplace=True)
Scarborough_data.tail()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
11,M1R,Scarborough,MaryvaleWexford,43.750072,-79.295849
12,M1S,Scarborough,Agincourt,43.7942,-79.262029
13,M1T,Scarborough,Clarks CornersSullivanTam O'Shanter,43.781638,-79.304302
14,M1V,Scarborough,Agincourt NorthL'Amoreaux EastMillikenSteeles ...,43.815252,-79.284577
15,M1W,Scarborough,L'Amoreaux WestSteeles West,43.799525,-79.318389


In [35]:
print(Scarborough_venues.shape)
Scarborough_venues.head()

(81, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,RougeMalvern,43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
1,Highland CreekRouge HillPort Union,43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
2,Highland CreekRouge HillPort Union,43.784535,-79.160497,Affordable Toronto Movers,43.787919,-79.162977,Moving Target
3,GuildwoodMorningsideWest Hill,43.763573,-79.188711,Swiss Chalet Rotisserie & Grill,43.767697,-79.189914,Pizza Place
4,GuildwoodMorningsideWest Hill,43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store


## groupby the venues

In [36]:
Scarborough_venues.groupby('Neighborhood').head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,RougeMalvern,43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
1,Highland CreekRouge HillPort Union,43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
2,Highland CreekRouge HillPort Union,43.784535,-79.160497,Affordable Toronto Movers,43.787919,-79.162977,Moving Target
3,GuildwoodMorningsideWest Hill,43.763573,-79.188711,Swiss Chalet Rotisserie & Grill,43.767697,-79.189914,Pizza Place
4,GuildwoodMorningsideWest Hill,43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store
5,GuildwoodMorningsideWest Hill,43.763573,-79.188711,Big Bite Burrito,43.766299,-79.19072,Mexican Restaurant
6,GuildwoodMorningsideWest Hill,43.763573,-79.188711,Enterprise Rent-A-Car,43.764042,-79.193371,Rental Car Location
7,GuildwoodMorningsideWest Hill,43.763573,-79.188711,Woburn Medical Centre,43.766631,-79.192286,Medical Center
9,Woburn,43.770992,-79.216917,Starbucks,43.770037,-79.221156,Coffee Shop
10,Woburn,43.770992,-79.216917,Tim Hortons,43.770827,-79.223078,Coffee Shop


In [37]:
print('There are {} uniques categories.'.format(len(Scarborough_venues['Venue Category'].unique())))

There are 52 uniques categories.


## make one hot to to the mechine learning

In [38]:
# one hot encoding
Scarborough_onehot = pd.get_dummies(Scarborough_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
Scarborough_onehot['Neighborhood'] = Scarborough_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [Scarborough_onehot.columns[-1]] + list(Scarborough_onehot.columns[:-1])
Scarborough_onehot = Scarborough_onehot[fixed_columns]

Scarborough_onehot.head()

Unnamed: 0,Neighborhood,American Restaurant,Athletics & Sports,Auto Garage,Bakery,Bank,Bar,Breakfast Spot,Bus Line,Bus Station,Café,Caribbean Restaurant,Chinese Restaurant,Clothing Store,Coffee Shop,College Stadium,Cosmetics Shop,Department Store,Discount Store,Electronics Store,Fast Food Restaurant,Fried Chicken Joint,General Entertainment,Grocery Store,Hakka Restaurant,Hobby Shop,Ice Cream Shop,Indian Restaurant,Italian Restaurant,Japanese Restaurant,Korean Restaurant,Latin American Restaurant,Light Rail Station,Lounge,Medical Center,Metro Station,Mexican Restaurant,Middle Eastern Restaurant,Motel,Moving Target,Noodle House,Park,Pet Store,Pharmacy,Pizza Place,Playground,Rental Car Location,Sandwich Place,Skating Rink,Soccer Field,Thai Restaurant,Train Station,Vietnamese Restaurant
0,RougeMalvern,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Highland CreekRouge HillPort Union,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Highland CreekRouge HillPort Union,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,GuildwoodMorningsideWest Hill,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
4,GuildwoodMorningsideWest Hill,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [39]:
Scarborough_onehot.shape

(81, 53)

In [40]:
Scarborough_grouped = Scarborough_onehot.groupby('Neighborhood').mean().reset_index()
Scarborough_grouped

Unnamed: 0,Neighborhood,American Restaurant,Athletics & Sports,Auto Garage,Bakery,Bank,Bar,Breakfast Spot,Bus Line,Bus Station,Café,Caribbean Restaurant,Chinese Restaurant,Clothing Store,Coffee Shop,College Stadium,Cosmetics Shop,Department Store,Discount Store,Electronics Store,Fast Food Restaurant,Fried Chicken Joint,General Entertainment,Grocery Store,Hakka Restaurant,Hobby Shop,Ice Cream Shop,Indian Restaurant,Italian Restaurant,Japanese Restaurant,Korean Restaurant,Latin American Restaurant,Light Rail Station,Lounge,Medical Center,Metro Station,Mexican Restaurant,Middle Eastern Restaurant,Motel,Moving Target,Noodle House,Park,Pet Store,Pharmacy,Pizza Place,Playground,Rental Car Location,Sandwich Place,Skating Rink,Soccer Field,Thai Restaurant,Train Station,Vietnamese Restaurant
0,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0
1,Agincourt NorthL'Amoreaux EastMillikenSteeles ...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Birch CliffCliffside West,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0
3,Cedarbrae,0.0,0.142857,0.0,0.142857,0.142857,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0
4,ClairleaGolden MileOakridge,0.0,0.0,0.0,0.222222,0.0,0.0,0.0,0.222222,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.0
5,Clarks CornersSullivanTam O'Shanter,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.285714,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0
6,CliffcrestCliffsideScarborough Village West,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Dorset ParkScarborough Town CentreWexford Heights,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.285714,0.0,0.0,0.0,0.142857,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857
8,East Birchmount ParkIonviewKennedy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.125,0.0,0.125,0.0,0.0,0.125,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0
9,GuildwoodMorningsideWest Hill,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0


In [41]:
Scarborough_grouped.shape

(16, 53)

## review the top5 venues

In [42]:
num_top_venues = 5

for hood in Scarborough_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = Scarborough_grouped[Scarborough_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Agincourt----
                 venue  freq
0         Skating Rink  0.25
1       Breakfast Spot  0.25
2               Lounge  0.25
3       Clothing Store  0.25
4  American Restaurant  0.00


----Agincourt NorthL'Amoreaux EastMillikenSteeles East----
                 venue  freq
0           Playground   0.5
1                 Park   0.5
2  American Restaurant   0.0
3        Moving Target   0.0
4  Japanese Restaurant   0.0


----Birch CliffCliffside West----
                   venue  freq
0           Skating Rink  0.25
1  General Entertainment  0.25
2                   Café  0.25
3        College Stadium  0.25
4    American Restaurant  0.00


----Cedarbrae----
                venue  freq
0              Bakery  0.14
1                Bank  0.14
2     Thai Restaurant  0.14
3  Athletics & Sports  0.14
4    Hakka Restaurant  0.14


----ClairleaGolden MileOakridge----
            venue  freq
0          Bakery  0.22
1        Bus Line  0.22
2  Ice Cream Shop  0.11
3    Soccer Field  0.11
4    

## make a function to sort the most common venues

In [43]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)    
    return row_categories_sorted.index.values[0:num_top_venues]

In [44]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = Scarborough_grouped['Neighborhood']

for ind in np.arange(Scarborough_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(Scarborough_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,Skating Rink,Breakfast Spot,Lounge,Clothing Store,Vietnamese Restaurant,Coffee Shop,Grocery Store,General Entertainment,Fried Chicken Joint,Fast Food Restaurant
1,Agincourt NorthL'Amoreaux EastMillikenSteeles ...,Playground,Park,Vietnamese Restaurant,Clothing Store,Grocery Store,General Entertainment,Fried Chicken Joint,Fast Food Restaurant,Electronics Store,Discount Store
2,Birch CliffCliffside West,General Entertainment,Skating Rink,Café,College Stadium,Vietnamese Restaurant,Clothing Store,Grocery Store,Fried Chicken Joint,Fast Food Restaurant,Electronics Store
3,Cedarbrae,Hakka Restaurant,Thai Restaurant,Athletics & Sports,Bakery,Bank,Fried Chicken Joint,Caribbean Restaurant,College Stadium,Grocery Store,General Entertainment
4,ClairleaGolden MileOakridge,Bakery,Bus Line,Ice Cream Shop,Soccer Field,Bus Station,Park,Metro Station,Coffee Shop,General Entertainment,Fried Chicken Joint
5,Clarks CornersSullivanTam O'Shanter,Pizza Place,Thai Restaurant,Fried Chicken Joint,Italian Restaurant,Noodle House,Chinese Restaurant,Vietnamese Restaurant,Clothing Store,General Entertainment,Fast Food Restaurant
6,CliffcrestCliffsideScarborough Village West,American Restaurant,Motel,Hobby Shop,Grocery Store,General Entertainment,Fried Chicken Joint,Fast Food Restaurant,Electronics Store,Discount Store,Department Store
7,Dorset ParkScarborough Town CentreWexford Heights,Indian Restaurant,Vietnamese Restaurant,Latin American Restaurant,Light Rail Station,Pet Store,Chinese Restaurant,Coffee Shop,General Entertainment,Fried Chicken Joint,Fast Food Restaurant
8,East Birchmount ParkIonviewKennedy Park,Discount Store,Hobby Shop,Bus Station,Department Store,Chinese Restaurant,Train Station,Coffee Shop,College Stadium,Hakka Restaurant,Grocery Store
9,GuildwoodMorningsideWest Hill,Mexican Restaurant,Electronics Store,Rental Car Location,Pizza Place,Breakfast Spot,Medical Center,Vietnamese Restaurant,General Entertainment,Fried Chicken Joint,Fast Food Restaurant


## use k-mean to do the machine learning

In [58]:
# set number of clusters
kclusters = 3

Scarborough_grouped_clustering = Scarborough_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(Scarborough_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0])

## make labels

In [47]:
Scarborough_merged = Scarborough_data

# add clustering labels
Scarborough_merged['Cluster Labels'] = kmeans.labels_
# make the column name the same
Scarborough_merged.rename(columns={'Neighbourhood':'Neighborhood'}, inplace = True)

## merge neighborhoods_venues_sorted to Scarborough_merged by column "Neighborhood"

In [48]:
# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
Scarborough_merged = Scarborough_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

Scarborough_merged.head() # check the last columns!

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Scarborough,RougeMalvern,43.806686,-79.194353,0,Fast Food Restaurant,Vietnamese Restaurant,Train Station,Hakka Restaurant,Grocery Store,General Entertainment,Fried Chicken Joint,Electronics Store,Discount Store,Department Store
1,M1C,Scarborough,Highland CreekRouge HillPort Union,43.784535,-79.160497,1,Moving Target,Bar,Clothing Store,Grocery Store,General Entertainment,Fried Chicken Joint,Fast Food Restaurant,Electronics Store,Discount Store,Department Store
2,M1E,Scarborough,GuildwoodMorningsideWest Hill,43.763573,-79.188711,0,Mexican Restaurant,Electronics Store,Rental Car Location,Pizza Place,Breakfast Spot,Medical Center,Vietnamese Restaurant,General Entertainment,Fried Chicken Joint,Fast Food Restaurant
3,M1G,Scarborough,Woburn,43.770992,-79.216917,0,Coffee Shop,Korean Restaurant,Vietnamese Restaurant,Hobby Shop,Grocery Store,General Entertainment,Fried Chicken Joint,Fast Food Restaurant,Electronics Store,Discount Store
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,0,Hakka Restaurant,Thai Restaurant,Athletics & Sports,Bakery,Bank,Fried Chicken Joint,Caribbean Restaurant,College Stadium,Grocery Store,General Entertainment


## create the map 

In [49]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(Scarborough_merged['Latitude'], Scarborough_merged['Longitude'], Scarborough_merged['Neighborhood'], Scarborough_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## verify the cluster1

In [50]:
Scarborough_merged.loc[Scarborough_merged['Cluster Labels'] == 0, Scarborough_merged.columns[[1] + list(range(3, Scarborough_merged.shape[1]))]]

Unnamed: 0,Borough,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Scarborough,43.806686,-79.194353,0,Fast Food Restaurant,Vietnamese Restaurant,Train Station,Hakka Restaurant,Grocery Store,General Entertainment,Fried Chicken Joint,Electronics Store,Discount Store,Department Store
2,Scarborough,43.763573,-79.188711,0,Mexican Restaurant,Electronics Store,Rental Car Location,Pizza Place,Breakfast Spot,Medical Center,Vietnamese Restaurant,General Entertainment,Fried Chicken Joint,Fast Food Restaurant
3,Scarborough,43.770992,-79.216917,0,Coffee Shop,Korean Restaurant,Vietnamese Restaurant,Hobby Shop,Grocery Store,General Entertainment,Fried Chicken Joint,Fast Food Restaurant,Electronics Store,Discount Store
4,Scarborough,43.773136,-79.239476,0,Hakka Restaurant,Thai Restaurant,Athletics & Sports,Bakery,Bank,Fried Chicken Joint,Caribbean Restaurant,College Stadium,Grocery Store,General Entertainment
5,Scarborough,43.744734,-79.239476,0,Playground,Vietnamese Restaurant,Clothing Store,Grocery Store,General Entertainment,Fried Chicken Joint,Fast Food Restaurant,Electronics Store,Discount Store,Department Store
6,Scarborough,43.727929,-79.262029,0,Discount Store,Hobby Shop,Bus Station,Department Store,Chinese Restaurant,Train Station,Coffee Shop,College Stadium,Hakka Restaurant,Grocery Store
7,Scarborough,43.711112,-79.284577,0,Bakery,Bus Line,Ice Cream Shop,Soccer Field,Bus Station,Park,Metro Station,Coffee Shop,General Entertainment,Fried Chicken Joint
8,Scarborough,43.716316,-79.239476,0,American Restaurant,Motel,Hobby Shop,Grocery Store,General Entertainment,Fried Chicken Joint,Fast Food Restaurant,Electronics Store,Discount Store,Department Store
9,Scarborough,43.692657,-79.264848,0,General Entertainment,Skating Rink,Café,College Stadium,Vietnamese Restaurant,Clothing Store,Grocery Store,Fried Chicken Joint,Fast Food Restaurant,Electronics Store
10,Scarborough,43.75741,-79.273304,0,Indian Restaurant,Vietnamese Restaurant,Latin American Restaurant,Light Rail Station,Pet Store,Chinese Restaurant,Coffee Shop,General Entertainment,Fried Chicken Joint,Fast Food Restaurant


## verify the cluster2

In [51]:
Scarborough_merged.loc[Scarborough_merged['Cluster Labels'] == 1, Scarborough_merged.columns[[1] + list(range(3, Scarborough_merged.shape[1]))]]

Unnamed: 0,Borough,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Scarborough,43.784535,-79.160497,1,Moving Target,Bar,Clothing Store,Grocery Store,General Entertainment,Fried Chicken Joint,Fast Food Restaurant,Electronics Store,Discount Store,Department Store
14,Scarborough,43.815252,-79.284577,1,Playground,Park,Vietnamese Restaurant,Clothing Store,Grocery Store,General Entertainment,Fried Chicken Joint,Fast Food Restaurant,Electronics Store,Discount Store


## verify the cluster3

In [52]:
Scarborough_merged.loc[Scarborough_merged['Cluster Labels'] == 2, Scarborough_merged.columns[[1] + list(range(3, Scarborough_merged.shape[1]))]]

Unnamed: 0,Borough,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
13,Scarborough,43.781638,-79.304302,2,Pizza Place,Thai Restaurant,Fried Chicken Joint,Italian Restaurant,Noodle House,Chinese Restaurant,Vietnamese Restaurant,Clothing Store,General Entertainment,Fast Food Restaurant


# That's all.

# Thank you very much.