## Import the necessary libraries

In [1]:
import pandas as pd
import requests

## Download the data

In [2]:
# Wikipedia page that lists the Canadian postal codes starting with "M".
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# Fetching a page is a read, so issue a GET (not a POST); the timeout keeps
# the cell from hanging forever on a stalled connection.
r = requests.get(url, timeout=30)
# Stop here on an HTTP error instead of passing an error page to the parser.
r.raise_for_status()
text = r.text

In [3]:
# Parse the HTML tables found in the downloaded page, one DataFrame per table.
data = pd.read_html(text)
# Lay the parsed tables side by side, aligned on the row index.
df = pd.concat(data, axis=1)
# Discard the columns labelled 0 through 17 and keep the remaining ones.
dataframe = df.drop(columns=list(range(18)))

## Drop the rows that are missing data

In [4]:
# Keep only the rows in which every column has a value.
df = dataframe.dropna(axis=0, how='any')

In [5]:
# Swap each '/' for ','; regex mode also matches inside longer strings.
final_product = df.replace('/', ',', regex=True)

In [6]:
# Check the (rows, columns) size of the cleaned table.
final_product.shape

(103, 3)

## Load the geospatial data

In [7]:
# Read the coordinates CSV from the current working directory.
Geo = pd.read_csv(filepath_or_buffer='Geospatial_Coordinates (5).csv')
 

## Merge this data with the previously uploaded and cleaned data

In [8]:
# Join on the columns the two frames share (pandas' default key) with the
# default inner join.
two_joined = final_product.merge(Geo)
two_joined

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


## To explore the Toronto area as a whole, irrespective of district, I used ***str.contains*** to keep every borough whose name includes "Toronto"

In [9]:
# Literal (non-regex) substring match: keep the rows whose borough name
# includes 'Toronto'.
df = two_joined.loc[
    lambda frame: frame['Borough'].str.contains('Toronto', regex=False)
]

In [10]:
# Display the Toronto-only rows.
df

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031
20,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
24,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
25,M6G,Downtown Toronto,Christie,43.669542,-79.422564
30,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
31,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259


## Let's drop the Borough and Postal Code to only analyze the neighborhoods in Toronto

In [11]:
# Remove the two identifier columns; the other columns carry over unchanged.
df1 = df.drop(columns=['Postal Code', 'Borough'])

In [12]:
# Display the frame after dropping 'Postal Code' and 'Borough'.
df1

Unnamed: 0,Neighborhood,Latitude,Longitude
2,"Regent Park, Harbourfront",43.65426,-79.360636
4,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,"Garden District, Ryerson",43.657162,-79.378937
15,St. James Town,43.651494,-79.375418
19,The Beaches,43.676357,-79.293031
20,Berczy Park,43.644771,-79.373306
24,Central Bay Street,43.657952,-79.387383
25,Christie,43.669542,-79.422564
30,"Richmond, Adelaide, King",43.650571,-79.384568
31,"Dufferin, Dovercourt Village",43.669005,-79.442259


In [13]:
# Check the (rows, columns) size of df1.
df1.shape

(39, 3)

## I called the groupby and count method to see how the neighborhoods are grouped

In [14]:
# Count the non-null entries of each remaining column, per neighborhood name.
df1_neigh = df1.groupby(by='Neighborhood').count()

In [15]:
# Display the per-neighborhood counts.
df1_neigh

Unnamed: 0_level_0,Latitude,Longitude
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1
Berczy Park,1,1
"Brockton, Parkdale Village, Exhibition Place",1,1
Business reply mail Processing Centre,1,1
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",1,1
Central Bay Street,1,1
Christie,1,1
Church and Wellesley,1,1
"Commerce Court, Victoria Hotel",1,1
Davisville,1,1
Davisville North,1,1


## Now that I see that the neighborhoods are all unique, I move on to cluster them

In [16]:
# Sanity check in case something was missed: the grouped table has the same
# number of rows as df1, which confirms that each neighborhood name occurs
# exactly once.
df1_neigh.shape

(39, 2)

## One-hot encode the neighborhoods

In [17]:
# Build one indicator column per distinct neighborhood name.
df1_onehot = pd.get_dummies(df1.loc[:, ['Neighborhood']])

In [18]:
# Display the one-hot encoded frame.
df1_onehot

Unnamed: 0,Neighborhood_Berczy Park,"Neighborhood_Brockton, Parkdale Village, Exhibition Place",Neighborhood_Business reply mail Processing Centre,"Neighborhood_CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",Neighborhood_Central Bay Street,Neighborhood_Christie,Neighborhood_Church and Wellesley,"Neighborhood_Commerce Court, Victoria Hotel",Neighborhood_Davisville,Neighborhood_Davisville North,...,Neighborhood_St. James Town,"Neighborhood_St. James Town, Cabbagetown",Neighborhood_Stn A PO Boxes,Neighborhood_Studio District,"Neighborhood_Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park","Neighborhood_The Annex, North Midtown, Yorkville",Neighborhood_The Beaches,"Neighborhood_The Danforth West, Riverdale","Neighborhood_Toronto Dominion Centre, Design Exchange","Neighborhood_University of Toronto, Harbord"
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
19,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
20,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
24,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
25,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
30,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# Check the (rows, columns) size of the one-hot encoded frame.
df1_onehot.shape

(39, 39)

## Copy the Neighborhood column from df1 into the one-hot encoded data frame

In [20]:
# Attach the neighborhood names as a plain 'Neighborhood' column alongside
# the indicator columns (rows are matched on the shared index).
df1_onehot = df1_onehot.assign(Neighborhood=df1['Neighborhood'])

## And move it to the first column of this one-hot encoded data frame

In [21]:
# Put 'Neighborhood' first and keep every other column in its current order.
# Selecting the column by name (rather than "whatever is last") keeps this
# cell safe to re-run: a second run leaves the frame unchanged instead of
# moving an indicator column to the front.
fixed_columns = ['Neighborhood'] + [
    col for col in df1_onehot.columns if col != 'Neighborhood'
]

# Reorder the frame so the neighborhood column is the first column.
df1_onehot = df1_onehot[fixed_columns]

df1_onehot

Unnamed: 0,Neighborhood,Neighborhood_Berczy Park,"Neighborhood_Brockton, Parkdale Village, Exhibition Place",Neighborhood_Business reply mail Processing Centre,"Neighborhood_CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",Neighborhood_Central Bay Street,Neighborhood_Christie,Neighborhood_Church and Wellesley,"Neighborhood_Commerce Court, Victoria Hotel",Neighborhood_Davisville,...,Neighborhood_St. James Town,"Neighborhood_St. James Town, Cabbagetown",Neighborhood_Stn A PO Boxes,Neighborhood_Studio District,"Neighborhood_Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park","Neighborhood_The Annex, North Midtown, Yorkville",Neighborhood_The Beaches,"Neighborhood_The Danforth West, Riverdale","Neighborhood_Toronto Dominion Centre, Design Exchange","Neighborhood_University of Toronto, Harbord"
2,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Queen's Park, Ontario Provincial Government",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,"Garden District, Ryerson",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15,St. James Town,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
19,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
20,Berczy Park,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
24,Central Bay Street,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
25,Christie,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
30,"Richmond, Adelaide, King",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31,"Dufferin, Dovercourt Village",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## I checked the ***shape*** attribute to confirm that the number of columns has increased by one

In [22]:
# Check the (rows, columns) size after adding the 'Neighborhood' column.
df1_onehot.shape

(39, 40)

## Import the KMeans library to begin the process of clustering

In [23]:
from sklearn.cluster import KMeans

In [24]:
## Fit the data and print the labels of the cluster
kclusters = 5

# KMeans needs a purely numeric matrix, so leave out the text column.
df1_clustered = df1_onehot.drop(columns='Neighborhood')

# n_init is pinned because scikit-learn's default changed between releases;
# together with random_state this keeps the labels reproducible.
# NOTE(review): every neighborhood is unique, so each row here has a single 1
# in its own column and all rows are equally far apart. The resulting clusters
# carry little meaning; consider clustering on Latitude/Longitude instead.
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=1).fit(df1_clustered)

kmeans.labels_

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 1])

## Insert the labels into the dataframe

In [25]:
# add cluster labels to original dataframe

# Drop labels left over from a previous run first: DataFrame.insert raises a
# ValueError when the column already exists, so this keeps the cell re-runnable.
df = df.drop(columns='Cluster Labels', errors='ignore')
df.insert(0, 'Cluster Labels', kmeans.labels_)


In [26]:
# Display the frame with the 'Cluster Labels' column in first position.
df

Unnamed: 0,Cluster Labels,Postal Code,Borough,Neighborhood,Latitude,Longitude
2,0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,0,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,0,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,0,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,0,M4E,East Toronto,The Beaches,43.676357,-79.293031
20,0,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
24,0,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
25,0,M6G,Downtown Toronto,Christie,43.669542,-79.422564
30,0,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
31,0,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259


## Drop the ***Postal Code*** from the data frame

In [27]:
# Same frame without the postal code column.
df_with_labels = df.drop(columns='Postal Code')

In [28]:
# Display the labelled frame without the postal code.
df_with_labels

Unnamed: 0,Cluster Labels,Borough,Neighborhood,Latitude,Longitude
2,0,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,0,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,0,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,0,Downtown Toronto,St. James Town,43.651494,-79.375418
19,0,East Toronto,The Beaches,43.676357,-79.293031
20,0,Downtown Toronto,Berczy Park,43.644771,-79.373306
24,0,Downtown Toronto,Central Bay Street,43.657952,-79.387383
25,0,Downtown Toronto,Christie,43.669542,-79.422564
30,0,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
31,0,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259


## Import the necessary libraries for the map

In [29]:
import folium
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors

### To create the map, I have to convert the latitude and longitude columns of this new dataframe into lists. For some reason, folium Map does not accept a dataframe column directly: it sees it as a Series and raises an error. So I extract the latitude and the longitude separately and convert them into lists before folium Map is able to produce a map with them

In [30]:
# Display the latitude column (a pandas Series).
df_with_labels['Latitude']

2      43.654260
4      43.662301
9      43.657162
15     43.651494
19     43.676357
20     43.644771
24     43.657952
25     43.669542
30     43.650571
31     43.669005
36     43.640816
37     43.647927
41     43.679557
42     43.647177
43     43.636847
47     43.668999
48     43.648198
54     43.659526
61     43.728020
62     43.711695
67     43.712751
68     43.696948
69     43.661608
73     43.715383
74     43.672710
75     43.648960
79     43.704324
80     43.662696
81     43.651571
83     43.689574
84     43.653206
86     43.686412
87     43.628947
91     43.679563
92     43.646435
96     43.667967
97     43.648429
99     43.665860
100    43.662744
Name: Latitude, dtype: float64

In [31]:
# Read the latitudes straight from the frame instead of pasting the printed
# values: the list then always matches the rows and order of df_with_labels
# and keeps full float precision.
latitude = df_with_labels['Latitude'].tolist()

In [32]:
# Display the longitude column (a pandas Series).
df_with_labels['Longitude']

2     -79.360636
4     -79.389494
9     -79.378937
15    -79.375418
19    -79.293031
20    -79.373306
24    -79.387383
25    -79.422564
30    -79.384568
31    -79.442259
36    -79.381752
37    -79.419750
41    -79.352188
42    -79.381576
43    -79.428191
47    -79.315572
48    -79.379817
54    -79.340923
61    -79.388790
62    -79.416936
67    -79.390197
68    -79.411307
69    -79.464763
73    -79.405678
74    -79.405678
75    -79.456325
79    -79.388790
80    -79.400049
81    -79.484450
83    -79.383160
84    -79.400049
86    -79.400049
87    -79.394420
91    -79.377529
92    -79.374846
96    -79.367675
97    -79.382280
99    -79.383160
100   -79.321558
Name: Longitude, dtype: float64

In [33]:
 # Read the longitudes straight from the frame instead of pasting the printed
 # values: the list then always matches the rows and order of df_with_labels
 # and keeps full float precision.
longitude = df_with_labels['Longitude'].tolist()

In [34]:
# create map

# folium.Map expects a single [lat, lon] pair as its centre, not the full
# coordinate lists, so centre the map on the mean neighborhood position.
map_center = [float(np.mean(latitude)), float(np.mean(longitude))]
map_clusters = folium.Map(location=map_center, zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow color each
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(latitude, longitude, df_with_labels['Neighborhood'], df_with_labels['Cluster Labels']):
    # Popup text: neighborhood name followed by its cluster number.
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        # cluster - 1 sends label 0 to the last palette entry (index -1), so
        # every cluster still gets its own color.
        color=rainbow[cluster - 1],
        fill=True,
        fill_color=rainbow[cluster - 1],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters