In [23]:
#importing required libraries
import pandas as pd
import numpy as np
import requests

In [76]:
# Read the first HTML table found on the Wikipedia page into a dataframe.
df_scraped = pd.read_html("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")[0]

# Promote the first row to column labels, then remove that row from the data.
# drop() returns a new frame, so no in-place mutation is needed; the remaining
# rows keep their original index labels. (The former bare `reindex()` call was
# a no-op whose result was discarded, so it has been removed.)
df_scraped.columns = df_scraped.iloc[0]
df_scraped = df_scraped.drop(df_scraped.index[0])

df_scraped.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront


In [77]:
# Keep only the rows whose borough is assigned.
# An explicit copy makes the result an independent frame, so adding or
# overwriting columns on it later does not raise SettingWithCopyWarning.
df_scraped = df_scraped[df_scraped['Borough'] != 'Not assigned'].copy()
df_scraped.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights


In [79]:
# Give every row the comma-joined list of all neighbourhoods that share its
# postcode, then collapse the now-identical rows into one row per postcode.
df_scraped['Neighbourhood'] = (
    df_scraped
    .groupby('Postcode')['Neighbourhood']
    .transform(lambda names: ','.join(names))
)
df_scraped = (
    df_scraped[['Postcode', 'Borough', 'Neighbourhood']]
    .drop_duplicates()
    .reset_index(drop=True)
)
df_scraped.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront,Regent Park"
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Queen's Park,Not assigned


In [83]:
# Where the neighbourhood is 'Not assigned', fall back to the borough name.
# A single masked .loc assignment replaces the row-by-row chained indexing
# (df[col][i] = ...), which raises SettingWithCopyWarning, may write to a
# temporary object, and only worked while positions matched index labels.
not_assigned = df_scraped['Neighbourhood'] == 'Not assigned'
df_scraped.loc[not_assigned, 'Neighbourhood'] = df_scraped.loc[not_assigned, 'Borough']
    

In [85]:
# Display up to the first 11 rows of df_scraped for a visual check.
df_scraped.head(11)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront,Regent Park"
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge,Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens,Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson,Garden District"


In [86]:
# (rows, columns) of df_scraped at this point in the notebook.
df_scraped.shape

(103, 3)

In [96]:
#We are reading the data from the CSV file into a new dataframe for storage
import pandas as pd
import requests
import io

url = 'http://cocl.us/Geospatial_data'

# Retrieve the CSV with GET (the request only fetches a resource) and bound
# the wait with a timeout so the cell cannot hang indefinitely.
r = requests.get(url, timeout=30)

# Stop here with a clear HTTP error if the download failed, rather than
# continuing and hitting a NameError on an undefined df_geocoder below.
r.raise_for_status()

# Decode the payload and parse it as CSV from an in-memory text buffer.
data = r.content.decode('utf8')
df_geocoder = pd.read_csv(io.StringIO(data))
df_geocoder.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [102]:
# Give the postal-code column the name 'Postcode' so it can serve as the merge key.
df_geocoder = df_geocoder.rename({'Postal Code': 'Postcode'}, axis='columns')
df_geocoder.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [105]:
# Attach the latitude/longitude columns by joining both frames on 'Postcode'
# (default inner join: only postcodes present in both frames are kept).
df_scraped = df_scraped.merge(df_geocoder, on='Postcode')
df_scraped.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront,Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights,Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Woodbine Gardens,Parkview Hill",43.706397,-79.309937
9,M5B,Downtown Toronto,"Ryerson,Garden District",43.657162,-79.378937


In [106]:
# (rows, columns) of df_scraped after the merge with the coordinates.
df_scraped.shape

(103, 5)

#### Now that we have gathered all this data, let's see what it tells us about the neighbourhoods of Toronto that is not obvious at first glance.

In [107]:
#Let's go ahead and download a couple of libraries for our use in exploratory analysis of the data.

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means for clustering of neighbourhoods
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes
import folium # map rendering library

print('Libraries have been imported.')

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/DSX-Python35

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    altair-2.2.2               |           py35_1         462 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    certifi-2018.8.24          |        py35_1001         139 KB  conda-forge
    branca-0.3.1               |             py_0          25 KB  conda-forge
    ca-certificates-2019.6.16  |       hecc5488_0         145 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    openssl-1.0.2r             |       h14c3975_0         3.1 MB  conda-forge
    ------------------------------------------------------------
                                           Total:         4.0 MB

The following NEW packages will

In [108]:
# Work on an independent copy for the analysis: a plain assignment would only
# bind a second name to the same object, so any later in-place change made
# through one name would silently show up under the other.
df_toronto = df_scraped.copy()


Let's draw a map of the neighbourhoods in each borough, using the data we gathered and transformed in the dataframe `df_scraped` (available below as `df_toronto`) for our exploratory analysis.

In [112]:
# Coordinates on which the Toronto map is initially centred
to_latitude = 43.6532
to_longitude = -79.3832

# Base map centred on the coordinates above
map_toronto = folium.Map(location=[to_latitude, to_longitude], zoom_start=11)

# One circle marker per dataframe row; its popup reads "<neighbourhood>, <borough>"
for lat, lng, borough, neighbourhood in zip(
        df_toronto['Latitude'],
        df_toronto['Longitude'],
        df_toronto['Borough'],
        df_toronto['Neighbourhood']):
    label = folium.Popup('{}, {}'.format(neighbourhood, borough), parse_html=True)
    folium.CircleMarker(
        location=[lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#31aacc',
        fill_opacity=0.7,
        parse_html=False,
    ).add_to(map_toronto)

map_toronto

In [115]:
# Display the first rows of df_toronto before encoding.
df_toronto.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront,Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights,Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494


In [122]:
# One-hot encode the borough: one indicator column per distinct borough,
# named after the borough itself (no prefix).
torontoonehot = pd.get_dummies(df_toronto[['Borough']], prefix="", prefix_sep="")

# Append the neighbourhood names as an extra (last) column
torontoonehot['Neighbourhood'] = df_toronto['Neighbourhood']

# Rotate the column order so that the last column becomes the first
fixed_columns = torontoonehot.columns.tolist()
fixed_columns = fixed_columns[-1:] + fixed_columns[:-1]
torontoonehot = torontoonehot[fixed_columns]

torontoonehot.head()

Unnamed: 0,Neighbourhood,Central Toronto,Downtown Toronto,East Toronto,East York,Etobicoke,Mississauga,North York,Queen's Park,Scarborough,West Toronto,York
0,Parkwoods,0,0,0,0,0,0,1,0,0,0,0
1,Victoria Village,0,0,0,0,0,0,1,0,0,0,0
2,"Harbourfront,Regent Park",0,1,0,0,0,0,0,0,0,0,0
3,"Lawrence Heights,Lawrence Manor",0,0,0,0,0,0,1,0,0,0,0
4,Queen's Park,0,0,0,0,0,0,0,1,0,0,0


In [120]:
# (rows, columns) of the one-hot encoded frame.
torontoonehot.shape

(103, 12)

In [134]:
# Average the indicator columns per neighbourhood name, keeping
# 'Neighbourhood' as a regular column instead of the index.
torontoonehot = torontoonehot.groupby('Neighbourhood', as_index=False).mean()
torontoonehot.head()

Unnamed: 0,Neighbourhood,Central Toronto,Downtown Toronto,East Toronto,East York,Etobicoke,Mississauga,North York,Queen's Park,Scarborough,West Toronto,York
0,"Adelaide,King,Richmond",0,1,0,0,0,0,0,0,0,0,0
1,Agincourt,0,0,0,0,0,0,0,0,1,0,0
2,"Agincourt North,L'Amoreaux East,Milliken,Steel...",0,0,0,0,0,0,0,0,1,0,0
3,"Albion Gardens,Beaumond Heights,Humbergate,Jam...",0,0,0,0,1,0,0,0,0,0,0
4,"Alderwood,Long Branch",0,0,0,0,1,0,0,0,0,0,0


In [166]:
#Running k-means to cluster the neighborhoods into 5 clusters.
kclusters = 5

# Feature matrix for clustering: everything except the name column.
# - `columns=` replaces the positional axis argument (`drop(label, 1)`),
#   which recent pandas versions no longer accept.
# - 'Cluster' is dropped too when present (errors='ignore' otherwise), so
#   re-running this cell does not feed earlier labels back in as a feature.
neighbourhood_with_borough = torontoonehot.drop(
    columns=['Neighbourhood', 'Cluster'], errors='ignore')

# run k-means clustering (fixed random_state for reproducible labels)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(neighbourhood_with_borough)

# check cluster labels generated for the first ten rows of the dataframe
kmeans.labels_[0:10]

array([0, 2, 2, 4, 4, 3, 3, 3, 0, 2], dtype=int32)

In [167]:
# Store the k-means label of each row in a 'Cluster' column
torontoonehot = torontoonehot.assign(Cluster=kmeans.labels_)
torontoonehot.head()

Unnamed: 0,Neighbourhood,Central Toronto,Downtown Toronto,East Toronto,East York,Etobicoke,Mississauga,North York,Queen's Park,Scarborough,West Toronto,York,cluster_group,Cluster
0,"Adelaide,King,Richmond",0,1,0,0,0,0,0,0,0,0,0,3,0
1,Agincourt,0,0,0,0,0,0,0,0,1,0,0,2,2
2,"Agincourt North,L'Amoreaux East,Milliken,Steel...",0,0,0,0,0,0,0,0,1,0,0,2,2
3,"Albion Gardens,Beaumond Heights,Humbergate,Jam...",0,0,0,0,1,0,0,0,0,0,0,4,4
4,"Alderwood,Long Branch",0,0,0,0,1,0,0,0,0,0,0,4,4


In [168]:
#Merging with the original dataframe for viewing the clusters created.
# Only the join key and the label are taken from the one-hot frame. The former
# positional drop (columns[5:17]) depended on the exact number of columns in
# that frame and would also remove 'Cluster' when no extra column was present.
final_df_toronto = pd.merge(
    df_toronto,
    torontoonehot[['Neighbourhood', 'Cluster']],
    on="Neighbourhood",
)
final_df_toronto.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster
0,M3A,North York,Parkwoods,43.753259,-79.329656,3
1,M4A,North York,Victoria Village,43.725882,-79.315572,3
2,M5A,Downtown Toronto,"Harbourfront,Regent Park",43.65426,-79.360636,0
3,M6A,North York,"Lawrence Heights,Lawrence Manor",43.718518,-79.464763,3
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494,1


In [169]:
# create map
map_toronto_clusters = folium.Map(location=[to_latitude,to_longitude], zoom_start=10.5)

# set color scheme for the clusters: one evenly spaced rainbow colour per
# cluster, converted to hex strings. (The previous x/ys arrays were only used
# for their length, which equals kclusters, and the never-used markers_colors
# list has been removed.)
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# adding borough (cluster) markers to the map
for lat, lon, neigh, cluster in zip(final_df_toronto['Latitude'], final_df_toronto['Longitude'], final_df_toronto['Neighbourhood'], final_df_toronto['Cluster']):
    label = folium.Popup(str(neigh) + ' : Cluster ' + str(cluster), parse_html=True)
    # Labels start at 0, so `cluster - 1` is -1 for cluster 0 and wraps around
    # to the last colour; every cluster still gets its own distinct colour.
    cluster_color = rainbow[cluster - 1]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_toronto_clusters)

map_toronto_clusters

### Summary:
We can see that each neighbourhood has been assigned to exactly one cluster, based on its borough and its distance from the cluster centroids, of which there are 5 in this case. Neighbourhoods whose boroughs are close to each other form denser clusters, while those that are spread farther apart fall into other clusters that cover more ground.

In the end, it shows that the city of Toronto has a higher number of neighbourhoods in the vicinity of the bay area of the city (the area with Toronto City Airport).