## Assignment: Segmenting and Clustering Neighborhoods in Toronto

### Importing required Libraries

In [2]:
import pandas as pd
import numpy as np
import urllib.request,urllib.parse,urllib.error
import re
import requests as r
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup

#### Retrieving data from Wiki page

In [3]:
raw_html = r.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', stream=True)

In [4]:
print(raw_html.content[:100])

b'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title'


#### Using BeautifulSoup package to parse the raw html data

In [5]:
soup = BeautifulSoup(raw_html.content, 'html.parser')

#### Searching for the table from the parsed content

In [6]:
#rows = soup.select('tbody tr')
My_table = soup.find('table',{'class':'wikitable sortable'})

#### Checking content

In [7]:
print (My_table)

<table class="wikitable sortable">
<tbody><tr>
<th>Postcode</th>
<th>Borough</th>
<th>Neighbourhood
</th></tr>
<tr>
<td>M1A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M2A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M3A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
</td></tr>
<tr>
<td>M4A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
</td></tr>
<tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Harbourfront_(Toronto)" title="Harbourfront (Toronto)">Harbourfront</a>
</td></tr>
<tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Regent_Park" title="Regent Park">Regent Park</a>
</td></tr>
<tr>
<td>M6A</td>

#### Find table row element from above table 

In [8]:
tr_elements = My_table.find_all('tr')

In [9]:
print (tr_elements)

[<tr>
<th>Postcode</th>
<th>Borough</th>
<th>Neighbourhood
</th></tr>, <tr>
<td>M1A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>, <tr>
<td>M2A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>, <tr>
<td>M3A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
</td></tr>, <tr>
<td>M4A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
</td></tr>, <tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Harbourfront_(Toronto)" title="Harbourfront (Toronto)">Harbourfront</a>
</td></tr>, <tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Regent_Park" title="Regent Park">Regent Park</a>
</td></tr>, <tr>
<td>M6A</td>
<td><a href="/wiki/North_York" ti

#### Checking if the size is same across

In [10]:
[len(T) for T in tr_elements[:12]]

[6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]

#### Get table headers to store as column names for dataframe

In [11]:
th_elements = My_table.find_all('th')

column_names=[]
for th in th_elements:
        column_names.append(th.get_text(strip=True))
print (column_names)

['Postcode', 'Borough', 'Neighbourhood']


#### Create Dataframe

In [12]:
cd_df=pd.DataFrame(columns=column_names)
#cd_df.columns = cd_df.columns.str.replace(' ', '')

In [13]:
print(cd_df)

Empty DataFrame
Columns: [Postcode, Borough, Neighbourhood]
Index: []


In [14]:
print (len(tr_elements))

289


#### Loop through the table data element to get rows for our dataframe

In [15]:
row=[]
for tr in tr_elements[1:]:
    
    td_element = tr.findAll('td')
   # print(td_element)
    row_strip1=td_element[0].text.strip()
    row_strip1=row_strip1.strip()
    #print (td_element[0])
    row_strip2=td_element[1].text.strip()
    row_strip2=row_strip2.strip()
    #print (td_element[1])
    row_strip3=td_element[2].text.strip()
    row_strip3=row_strip3.strip()
    #print (td_element[2].text.strip())
    #print (row_strip3)
    row_list={'Postcode':row_strip1,'Borough':row_strip2,'Neighbourhood':row_strip3}
    #print (row_list)
    row.append(row_list)
    #print (row)
    cd_df=pd.DataFrame(row)
cd_df.applymap(lambda x: x.strip() if type(x)==str else x)
pd.set_option('display.expand_frame_repr', False)
print (cd_df[['Postcode','Borough','Neighbourhood']])


    Postcode           Borough                                      Neighbourhood
0        M1A      Not assigned                                       Not assigned
1        M2A      Not assigned                                       Not assigned
2        M3A        North York                                          Parkwoods
3        M4A        North York                                   Victoria Village
4        M5A  Downtown Toronto                                       Harbourfront
5        M5A  Downtown Toronto                                        Regent Park
6        M6A        North York                                   Lawrence Heights
7        M6A        North York                                     Lawrence Manor
8        M7A      Queen's Park                                       Not assigned
9        M8A      Not assigned                                       Not assigned
10       M9A         Etobicoke                                   Islington Avenue
11       M1B    

#### Ignore cells with a borough that is Not assigned.

In [16]:
cd_df=cd_df[cd_df['Borough']!='Not assigned']
print (cd_df[['Postcode','Borough','Neighbourhood']])

    Postcode           Borough                                      Neighbourhood
2        M3A        North York                                          Parkwoods
3        M4A        North York                                   Victoria Village
4        M5A  Downtown Toronto                                       Harbourfront
5        M5A  Downtown Toronto                                        Regent Park
6        M6A        North York                                   Lawrence Heights
7        M6A        North York                                     Lawrence Manor
8        M7A      Queen's Park                                       Not assigned
10       M9A         Etobicoke                                   Islington Avenue
11       M1B       Scarborough                                              Rouge
12       M1B       Scarborough                                            Malvern
14       M3B        North York                                    Don Mills North
15       M4B    

#### Find how many rows have Neighbourhood as "Not assigned".

In [17]:
cd_df[cd_df['Neighbourhood']=='Not assigned']

Unnamed: 0,Borough,Neighbourhood,Postcode
8,Queen's Park,Not assigned,M7A


#### If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough. So for the 9th cell in the table on the Wikipedia page, the value of the Borough and the Neighborhood columns will be Queen's Park.

In [18]:
replace_colval=cd_df['Borough'][cd_df['Neighbourhood']=='Not assigned'].to_list()

cd_df['Neighbourhood'].replace(to_replace=['Not assigned'],value=replace_colval,inplace=True)


#### Check if replacement is done

In [19]:
cd_df[cd_df['Neighbourhood']=='Not assigned']

Unnamed: 0,Borough,Neighbourhood,Postcode


In [19]:
print (cd_df)

              Borough                                      Neighbourhood Postcode
2          North York                                          Parkwoods      M3A
3          North York                                   Victoria Village      M4A
4    Downtown Toronto                                       Harbourfront      M5A
5    Downtown Toronto                                        Regent Park      M5A
6          North York                                   Lawrence Heights      M6A
7          North York                                     Lawrence Manor      M6A
8        Queen's Park                                       Queen's Park      M7A
10          Etobicoke                                   Islington Avenue      M9A
11        Scarborough                                              Rouge      M1B
12        Scarborough                                            Malvern      M1B
14         North York                                    Don Mills North      M3B
15          East

#### More than one neighborhood can exist in one postal code area. For example, in the table on the Wikipedia page, you will notice that M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park. These two rows will be combined into one row with the neighborhoods separated with a comma as shown in row 11 in the above table.

In [20]:
#cd_df_grp=cd_df.groupby('Postcode').apply(lambda x: x)
#cd_df.groupby(['Postcode','Borough'])['Neighbourhood'].apply(list)
cd_df_grp=cd_df.groupby(['Postcode','Borough'])['Neighbourhood'].agg(lambda col: ', '.join(col))


#### Check output

In [21]:
print (cd_df_grp)

Postcode  Borough         
M1B       Scarborough                                            Rouge, Malvern
M1C       Scarborough                    Highland Creek, Rouge Hill, Port Union
M1E       Scarborough                         Guildwood, Morningside, West Hill
M1G       Scarborough                                                    Woburn
M1H       Scarborough                                                 Cedarbrae
M1J       Scarborough                                       Scarborough Village
M1K       Scarborough               East Birchmount Park, Ionview, Kennedy Park
M1L       Scarborough                           Clairlea, Golden Mile, Oakridge
M1M       Scarborough           Cliffcrest, Cliffside, Scarborough Village West
M1N       Scarborough                               Birch Cliff, Cliffside West
M1P       Scarborough         Dorset Park, Scarborough Town Centre, Wexford ...
M1R       Scarborough                                         Maryvale, Wexford
M1S       Sca

####  Use the .shape method to print the number of rows of your dataframe

In [22]:
cd_df_grp=cd_df_grp.reset_index()

In [23]:
print (cd_df_grp.shape)

(103, 3)


In [24]:
print (cd_df_grp.columns)

Index(['Postcode', 'Borough', 'Neighbourhood'], dtype='object')


In [25]:
geoloc=pd.read_csv('http://cocl.us/Geospatial_data/Geospatial_Coordinates.csv')

In [26]:
geoloc.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [27]:
geoloc.shape

(103, 3)

In [28]:
comb_df=pd.merge(cd_df_grp, geoloc, left_on='Postcode', right_on='Postal Code')

In [29]:
print (comb_df)

    Postcode           Borough                                      Neighbourhood Postal Code   Latitude  Longitude
0        M1B       Scarborough                                     Rouge, Malvern         M1B  43.806686 -79.194353
1        M1C       Scarborough             Highland Creek, Rouge Hill, Port Union         M1C  43.784535 -79.160497
2        M1E       Scarborough                  Guildwood, Morningside, West Hill         M1E  43.763573 -79.188711
3        M1G       Scarborough                                             Woburn         M1G  43.770992 -79.216917
4        M1H       Scarborough                                          Cedarbrae         M1H  43.773136 -79.239476
5        M1J       Scarborough                                Scarborough Village         M1J  43.744734 -79.239476
6        M1K       Scarborough        East Birchmount Park, Ionview, Kennedy Park         M1K  43.727929 -79.262029
7        M1L       Scarborough                    Clairlea, Golden Mile,

In [30]:
comb_df.columns

Index(['Postcode', 'Borough', 'Neighbourhood', 'Postal Code', 'Latitude',
       'Longitude'],
      dtype='object')

In [31]:
comb_df=comb_df[['Postal Code','Borough','Neighbourhood','Latitude','Longitude']]

In [237]:
comb_df

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


#### Define Foursquare access credentials

In [33]:
CLIENT_ID = '5M4JRG4IBEWCVUQJBLTBLHEPM4JNTXU2S2BDF3WPVDZGH2JX' # your Foursquare ID
CLIENT_SECRET = 'VMXZ40F1DZYKJRXDLM3JW54DACW1LWQXRFNBAGBSSO4QNCOX' # your Foursquare Secret
VERSION = '20180604'
LIMIT = 30
radius=500
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)


Your credentails:
CLIENT_ID: 5M4JRG4IBEWCVUQJBLTBLHEPM4JNTXU2S2BDF3WPVDZGH2JX
CLIENT_SECRET:VMXZ40F1DZYKJRXDLM3JW54DACW1LWQXRFNBAGBSSO4QNCOX


#### Explore Venues in Neighbourhoods

In [236]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
                    
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])
        
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighbourhood Latitude', 
                  'Neighbourhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

#### Run the above function against our Canadat DF for only Torronto Borough

In [35]:
import requests

In [265]:
comb_df_toronto=[comb_df.loc[comb_df['Borough'].str.contains("Toronto"),'Neighbourhood']][0].reset_index()


In [266]:
comb_df_toronto

Unnamed: 0,index,Neighbourhood
0,37,The Beaches
1,41,"The Danforth West, Riverdale"
2,42,"The Beaches West, India Bazaar"
3,43,Studio District
4,44,Lawrence Park
5,45,Davisville North
6,46,North Toronto West
7,47,Davisville
8,48,"Moore Park, Summerhill East"
9,49,"Deer Park, Forest Hill SE, Rathnelly, South Hi..."


In [267]:
torronto_venues = getNearbyVenues(names=[comb_df.loc[comb_df['Borough'].str.contains("Toronto"),'Neighbourhood']][0],
                                   latitudes=comb_df['Latitude'],
                                   longitudes=comb_df['Longitude']
                                  )
                                                               

In [268]:
torronto_venues_1=torronto_venues['Neighbourhood'].reset_index()

In [269]:
print (comb_df_toronto)
print (torronto_venues_1)

    index                                      Neighbourhood
0      37                                        The Beaches
1      41                       The Danforth West, Riverdale
2      42                     The Beaches West, India Bazaar
3      43                                    Studio District
4      44                                      Lawrence Park
5      45                                   Davisville North
6      46                                 North Toronto West
7      47                                         Davisville
8      48                        Moore Park, Summerhill East
9      49  Deer Park, Forest Hill SE, Rathnelly, South Hi...
10     50                                           Rosedale
11     51                        Cabbagetown, St. James Town
12     52                               Church and Wellesley
13     53                          Harbourfront, Regent Park
14     54                           Ryerson, Garden District
15     55               

#### Missing neighbourhoods in venue data. We will drp in later step.

In [270]:
comb_df_toronto[~comb_df_toronto.Neighbourhood.isin(torronto_venues_1.Neighbourhood)]

Unnamed: 0,index,Neighbourhood
16,56,Berczy Park
21,61,"Commerce Court, Victoria Hotel"


In [38]:
torronto_venues.shape

(269, 7)

#### How many and What type of venues in each set of Neighbourhood

In [273]:
torronto_venues.groupby('Neighbourhood')['Venue Category'].value_counts()

Neighbourhood                                      Venue Category            
Adelaide, King, Richmond                           Clothing Store                5
                                                   Coffee Shop                   4
                                                   American Restaurant           1
                                                   Bakery                        1
                                                   Bank                          1
                                                   Burger Joint                  1
                                                   Candy Store                   1
                                                   Department Store              1
                                                   Electronics Store             1
                                                   Fast Food Restaurant          1
                                                   Food Court                    1
         

#### find any Null values

In [274]:
torronto_venues.isnull().values.any()

False

In [275]:
torronto_venues[['Neighbourhood','Venue Category']]

Unnamed: 0,Neighbourhood,Venue Category
0,The Beaches,Fast Food Restaurant
1,The Beaches,Print Shop
2,"The Danforth West, Riverdale",Construction & Landscaping
3,"The Danforth West, Riverdale",Bar
4,"The Beaches West, India Bazaar",Pizza Place
5,"The Beaches West, India Bazaar",Electronics Store
6,"The Beaches West, India Bazaar",Spa
7,"The Beaches West, India Bazaar",Mexican Restaurant
8,"The Beaches West, India Bazaar",Tech Startup
9,"The Beaches West, India Bazaar",Rental Car Location


#### One Hot encoding

In [276]:
torronto_venues_onehot=pd.get_dummies(torronto_venues[['Venue Category']],prefix="",prefix_sep="")

In [277]:
torronto_venues_onehot['Neighbourhood']=torronto_venues['Neighbourhood']

In [278]:
print (torronto_venues_onehot)

     Airport  American Restaurant  Arts & Crafts Store  Asian Restaurant  Athletics & Sports  Auto Garage  Bakery  Bank  Bar  Baseball Field  ...  Tea Room  Tech Startup  Thai Restaurant  Theater  Thrift / Vintage Store  Toy / Game Store  Trail  Video Store  Vietnamese Restaurant                                      Neighbourhood
0          0                    0                    0                 0                   0            0       0     0    0               0  ...         0             0                0        0                       0                 0      0            0                      0                                        The Beaches
1          0                    0                    0                 0                   0            0       0     0    0               0  ...         0             0                0        0                       0                 0      0            0                      0                                        The Beaches
2   

#### Bring Neighbourhood column upfront

In [279]:
colum_order=[torronto_venues_onehot.columns[-1]] + list(torronto_venues_onehot.columns[:-1])

In [280]:
torronto_venues_onehot=torronto_venues_onehot[colum_order]

In [281]:
print (torronto_venues_onehot)

                                         Neighbourhood  Airport  American Restaurant  Arts & Crafts Store  Asian Restaurant  Athletics & Sports  Auto Garage  Bakery  Bank  Bar  ...  Sushi Restaurant  Tea Room  Tech Startup  Thai Restaurant  Theater  Thrift / Vintage Store  Toy / Game Store  Trail  Video Store  Vietnamese Restaurant
0                                          The Beaches        0                    0                    0                 0                   0            0       0     0    0  ...                 0         0             0                0        0                       0                 0      0            0                      0
1                                          The Beaches        0                    0                    0                 0                   0            0       0     0    0  ...                 0         0             0                0        0                       0                 0      0            0                      

#### Howw many venue of unique type in each Neighbourhood set

In [282]:
torronto_venues_onehot.groupby('Neighbourhood').sum()

Unnamed: 0_level_0,Airport,American Restaurant,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Auto Garage,Bakery,Bank,Bar,Baseball Field,...,Sushi Restaurant,Tea Room,Tech Startup,Thai Restaurant,Theater,Thrift / Vintage Store,Toy / Game Store,Trail,Video Store,Vietnamese Restaurant
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Adelaide, King, Richmond",0,1,0,0,0,0,1,1,0,0,...,0,1,0,0,1,0,1,0,0,0
"Brockton, Exhibition Place, Parkdale Village",0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Business Reply Mail Processing Centre 969 Eastern,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
"CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara",0,0,0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"Cabbagetown, St. James Town",0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Central Bay Street,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"Chinatown, Grange Park, Kensington Market",0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
Christie,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Church and Wellesley,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Davisville,0,0,0,0,0,0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [283]:
tvo=torronto_venues_onehot.groupby('Neighbourhood', as_index=False).mean()

In [284]:
tvo

Unnamed: 0,Neighbourhood,Airport,American Restaurant,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Auto Garage,Bakery,Bank,Bar,...,Sushi Restaurant,Tea Room,Tech Startup,Thai Restaurant,Theater,Thrift / Vintage Store,Toy / Game Store,Trail,Video Store,Vietnamese Restaurant
0,"Adelaide, King, Richmond",0.0,0.033333,0.0,0.0,0.0,0.0,0.033333,0.033333,0.0,...,0.0,0.033333,0.0,0.0,0.033333,0.0,0.033333,0.0,0.0,0.0
1,"Brockton, Exhibition Place, Parkdale Village",0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Business Reply Mail Processing Centre 969 Eastern,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0
3,"CN Tower, Bathurst Quay, Island airport, Harbo...",0.0,0.0,0.0,0.086957,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Cabbagetown, St. James Town",0.0,0.0,0.0,0.0,0.0,0.142857,0.142857,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,"Chinatown, Grange Park, Kensington Market",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Christie,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Church and Wellesley,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Davisville,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [285]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]


In [287]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighbourhood'] = tvo['Neighbourhood']

for ind in np.arange(tvo.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(tvo.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide, King, Richmond",Clothing Store,Coffee Shop,Pharmacy,Smoothie Shop,Department Store,Candy Store,Electronics Store,Burger Joint,Fast Food Restaurant,Juice Bar
1,"Brockton, Exhibition Place, Parkdale Village",Athletics & Sports,Gym / Fitness Center,Grocery Store,Liquor Store,Discount Store,Vietnamese Restaurant,Food & Drink Shop,Curling Ice,Deli / Bodega,Department Store
2,Business Reply Mail Processing Centre 969 Eastern,Trail,Health Food Store,Pub,Neighborhood,Vietnamese Restaurant,Fast Food Restaurant,Cosmetics Shop,Curling Ice,Deli / Bodega,Department Store
3,"CN Tower, Bathurst Quay, Island airport, Harbo...",Gym,Asian Restaurant,Coffee Shop,Beer Store,Clothing Store,Bike Shop,Restaurant,Bus Line,Sandwich Place,Discount Store
4,"Cabbagetown, St. James Town",Middle Eastern Restaurant,Shopping Mall,Sandwich Place,Breakfast Spot,Auto Garage,Bakery,Electronics Store,Fast Food Restaurant,Falafel Restaurant,Vietnamese Restaurant


In [288]:
neighborhoods_venues_sorted

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide, King, Richmond",Clothing Store,Coffee Shop,Pharmacy,Smoothie Shop,Department Store,Candy Store,Electronics Store,Burger Joint,Fast Food Restaurant,Juice Bar
1,"Brockton, Exhibition Place, Parkdale Village",Athletics & Sports,Gym / Fitness Center,Grocery Store,Liquor Store,Discount Store,Vietnamese Restaurant,Food & Drink Shop,Curling Ice,Deli / Bodega,Department Store
2,Business Reply Mail Processing Centre 969 Eastern,Trail,Health Food Store,Pub,Neighborhood,Vietnamese Restaurant,Fast Food Restaurant,Cosmetics Shop,Curling Ice,Deli / Bodega,Department Store
3,"CN Tower, Bathurst Quay, Island airport, Harbo...",Gym,Asian Restaurant,Coffee Shop,Beer Store,Clothing Store,Bike Shop,Restaurant,Bus Line,Sandwich Place,Discount Store
4,"Cabbagetown, St. James Town",Middle Eastern Restaurant,Shopping Mall,Sandwich Place,Breakfast Spot,Auto Garage,Bakery,Electronics Store,Fast Food Restaurant,Falafel Restaurant,Vietnamese Restaurant
5,Central Bay Street,Pool,Golf Course,Dog Run,Mediterranean Restaurant,Vietnamese Restaurant,Fast Food Restaurant,Cosmetics Shop,Curling Ice,Deli / Bodega,Department Store
6,"Chinatown, Grange Park, Kensington Market",Japanese Restaurant,Gym / Fitness Center,Caribbean Restaurant,Baseball Field,Café,Vietnamese Restaurant,Food & Drink Shop,Deli / Bodega,Department Store,Dim Sum Restaurant
7,Christie,Airport,Park,Other Repair Shop,Convenience Store,Curling Ice,Deli / Bodega,Department Store,Dim Sum Restaurant,Diner,Discount Store
8,Church and Wellesley,Chinese Restaurant,Lounge,Sandwich Place,Breakfast Spot,Vietnamese Restaurant,Food & Drink Shop,Curling Ice,Deli / Bodega,Department Store,Dim Sum Restaurant
9,Davisville,Bus Line,Bakery,Park,Bus Station,Intersection,Soccer Field,Fast Food Restaurant,Metro Station,Electronics Store,Falafel Restaurant


####Run k-means to cluster the neighborhood into 5 clusters.

####Import Libraries

In [290]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

In [291]:
tvo.drop('Neighbourhood',1)

Unnamed: 0,Airport,American Restaurant,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Auto Garage,Bakery,Bank,Bar,Baseball Field,...,Sushi Restaurant,Tea Room,Tech Startup,Thai Restaurant,Theater,Thrift / Vintage Store,Toy / Game Store,Trail,Video Store,Vietnamese Restaurant
0,0.0,0.033333,0.0,0.0,0.0,0.0,0.033333,0.033333,0.0,0.0,...,0.0,0.033333,0.0,0.0,0.033333,0.0,0.033333,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0
3,0.0,0.0,0.0,0.086957,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.142857,0.142857,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [292]:
# set number of clusters
kclusters = 5

tvo_cluster = tvo.drop('Neighbourhood',1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(tvo_cluster)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 0, 0, 0, 0, 0, 4, 0, 0])

In [293]:
comb_df_toronto=[comb_df.loc[comb_df['Borough'].str.contains("Toronto")]][0]

In [294]:
comb_df_toronto=comb_df_toronto.reset_index(drop=True)

In [295]:
comb_df_toronto

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
5,M4P,Central Toronto,Davisville North,43.712751,-79.390197
6,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
7,M4S,Central Toronto,Davisville,43.704324,-79.38879
8,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
9,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.686412,-79.400049


#### Drop Missing Neighbourhood from Toronto Data

In [296]:
rem_data=comb_df_toronto[~comb_df_toronto.Neighbourhood.isin(torronto_venues_1.Neighbourhood)]
print (rem_data)
comb_df_toronto=comb_df_toronto[~comb_df_toronto.Neighbourhood.isin(rem_data.Neighbourhood)]
print (comb_df_toronto)

   Postal Code           Borough                   Neighbourhood   Latitude  Longitude
16         M5E  Downtown Toronto                     Berczy Park  43.644771 -79.373306
21         M5L  Downtown Toronto  Commerce Court, Victoria Hotel  43.648198 -79.379817
   Postal Code           Borough                                      Neighbourhood   Latitude  Longitude
0          M4E      East Toronto                                        The Beaches  43.676357 -79.293031
1          M4K      East Toronto                       The Danforth West, Riverdale  43.679557 -79.352188
2          M4L      East Toronto                     The Beaches West, India Bazaar  43.668999 -79.315572
3          M4M      East Toronto                                    Studio District  43.659526 -79.340923
4          M4N   Central Toronto                                      Lawrence Park  43.728020 -79.388790
5          M4P   Central Toronto                                   Davisville North  43.712751 -79.3901

In [297]:
kmeans.labels_

array([0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 1, 0, 3, 0, 0, 4, 4, 0, 0, 0, 0, 0,
       2, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0])

In [298]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

tvo_merged = comb_df_toronto

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
tvo_merged = tvo_merged.join(neighborhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

tvo_merged.head() # check the last columns!

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,0,Print Shop,Fast Food Restaurant,Vietnamese Restaurant,Food & Drink Shop,Cosmetics Shop,Curling Ice,Deli / Bodega,Department Store,Dim Sum Restaurant,Diner
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,0,Construction & Landscaping,Bar,Food Court,Curling Ice,Deli / Bodega,Department Store,Dim Sum Restaurant,Diner,Discount Store,Dog Run
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572,0,Pizza Place,Electronics Store,Breakfast Spot,Mexican Restaurant,Spa,Intersection,Tech Startup,Rental Car Location,Medical Center,Dog Run
3,M4M,East Toronto,Studio District,43.659526,-79.340923,0,Coffee Shop,Korean Restaurant,Convenience Store,Vietnamese Restaurant,Food & Drink Shop,Curling Ice,Deli / Bodega,Department Store,Dim Sum Restaurant,Diner
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,0,Bank,Caribbean Restaurant,Hakka Restaurant,Fried Chicken Joint,Athletics & Sports,Thai Restaurant,Bakery,Lounge,Falafel Restaurant,Electronics Store


In [300]:
tvo_merged.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,0,Print Shop,Fast Food Restaurant,Vietnamese Restaurant,Food & Drink Shop,Cosmetics Shop,Curling Ice,Deli / Bodega,Department Store,Dim Sum Restaurant,Diner
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,0,Construction & Landscaping,Bar,Food Court,Curling Ice,Deli / Bodega,Department Store,Dim Sum Restaurant,Diner,Discount Store,Dog Run
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572,0,Pizza Place,Electronics Store,Breakfast Spot,Mexican Restaurant,Spa,Intersection,Tech Startup,Rental Car Location,Medical Center,Dog Run
3,M4M,East Toronto,Studio District,43.659526,-79.340923,0,Coffee Shop,Korean Restaurant,Convenience Store,Vietnamese Restaurant,Food & Drink Shop,Curling Ice,Deli / Bodega,Department Store,Dim Sum Restaurant,Diner
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,0,Bank,Caribbean Restaurant,Hakka Restaurant,Fried Chicken Joint,Athletics & Sports,Thai Restaurant,Bakery,Lounge,Falafel Restaurant,Electronics Store


####visualize the resulting clusters

In [301]:
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
address = 'Toronto, CN'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
Latitude = location.latitude
Longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(Latitude, Longitude))

The geograpical coordinate of Toronto are 43.6425637, -79.3870871832047.


In [302]:
# create map
map_clusters = folium.Map(location=[Latitude, Longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(tvo_merged['Latitude'], tvo_merged['Longitude'], tvo_merged['Neighbourhood'], tvo_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters