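This project scrapes the Wikipedia table of Canadian postal codes beginning with "M", cleans it into a dataframe, adds latitude and longitude for each postal code, keeps only the boroughs whose name contains "Toronto", clusters them with k-means, and plots the clusters on a map.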

Import the necessary libraries

In [114]:
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from sklearn.cluster import KMeans

Scrape the postal code table from the Wikipedia URL

In [99]:
# Wikipedia page listing the Canadian postal codes that begin with "M".
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Fetch the page. Fail loudly on a timeout or an HTTP error status instead of
# silently parsing an error page further down.
response = requests.get(url, timeout=30)
response.raise_for_status()
page_request = response.text
soup = BeautifulSoup(page_request, 'lxml')

# The first <table> element on the page is taken to be the postal code table.
table = soup.find('table')
if table is None:
    raise ValueError('No <table> element found at ' + url)

# One list of whitespace-stripped cell texts per table row. Header cells (<th>)
# are collected as well, so any header row ends up in `data` too.
data_rows = table.find_all('tr')
data = [
    [cell.get_text().strip() for cell in row.find_all(['td', 'th'])]
    for row in data_rows
]

Create dataframe and clean it

In [100]:
# Build the frame from the scraped rows and skip the first row
# (the table's header row as scraped, not a postal code record).
column_names = ['Postal Code', 'Borough', 'Neighborhood']
data_df = pd.DataFrame(data, columns=column_names).iloc[1:]

# Keep only the rows whose borough is something other than 'Not assigned'.
data_df = data_df.loc[data_df['Borough'] != 'Not assigned']

Print the shape of the dataframe

In [101]:
# Show (row count, column count) of the cleaned dataframe.
print(data_df.shape)

(103, 3)


Retrieve the latitude and longitude for each postal code

In [102]:
# Load the coordinates table from the local CSV file.
latlong = pd.read_csv('Geospatial_Coordinates.csv')

# Left join on 'Postal Code': every row of data_df is kept, and the columns
# from the CSV are appended wherever the postal code matches.
data_df = pd.merge(data_df, latlong, how='left', on='Postal Code')
print(data_df)

    Postal Code           Borough  \
0           M3A        North York   
1           M4A        North York   
2           M5A  Downtown Toronto   
3           M6A        North York   
4           M7A  Downtown Toronto   
5           M9A         Etobicoke   
6           M1B       Scarborough   
7           M3B        North York   
8           M4B         East York   
9           M5B  Downtown Toronto   
10          M6B        North York   
11          M9B         Etobicoke   
12          M1C       Scarborough   
13          M3C        North York   
14          M4C         East York   
15          M5C  Downtown Toronto   
16          M6C              York   
17          M9C         Etobicoke   
18          M1E       Scarborough   
19          M4E      East Toronto   
20          M5E  Downtown Toronto   
21          M6E              York   
22          M1G       Scarborough   
23          M4G         East York   
24          M5G  Downtown Toronto   
25          M6G  Downtown Toronto   
2

Keep only the rows whose borough name contains "Toronto"

In [103]:
# Boolean mask: True where the borough name contains the literal text 'Toronto'
# (plain substring match, not a regular expression).
toronto_mask = data_df['Borough'].str.contains('Toronto', regex=False)

# Keep only the matching rows; the original index labels are retained.
data_df = data_df[toronto_mask]
print(data_df)

    Postal Code           Borough  \
2           M5A  Downtown Toronto   
4           M7A  Downtown Toronto   
9           M5B  Downtown Toronto   
15          M5C  Downtown Toronto   
19          M4E      East Toronto   
20          M5E  Downtown Toronto   
24          M5G  Downtown Toronto   
25          M6G  Downtown Toronto   
30          M5H  Downtown Toronto   
31          M6H      West Toronto   
36          M5J  Downtown Toronto   
37          M6J      West Toronto   
41          M4K      East Toronto   
42          M5K  Downtown Toronto   
43          M6K      West Toronto   
47          M4L      East Toronto   
48          M5L  Downtown Toronto   
54          M4M      East Toronto   
61          M4N   Central Toronto   
62          M5N   Central Toronto   
67          M4P   Central Toronto   
68          M5P   Central Toronto   
69          M6P      West Toronto   
73          M4R   Central Toronto   
74          M5R   Central Toronto   
75          M6R      West Toronto   
7

In [106]:
# Number of clusters to fit.
kclusters = 5

# Feature matrix: every column except the identifying text columns.
# - `columns=` is used because pandas 2.0 no longer accepts the axis as a
#   positional argument to drop().
# - 'ClusterLabels' is excluded too (errors='ignore' makes that a no-op on the
#   first run) so that re-running this cell never feeds old labels to KMeans.
clusters = data_df.drop(
    columns=['Postal Code', 'Borough', 'Neighborhood', 'ClusterLabels'],
    errors='ignore',
)

# n_init is pinned because its default differs between scikit-learn versions;
# together with random_state this keeps the labels reproducible.
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(clusters)

# Put the labels in the first column. Any label column left over from a
# previous run is removed first, because insert() raises on an existing name.
data_df = data_df.drop(columns='ClusterLabels', errors='ignore')
data_df.insert(0, 'ClusterLabels', kmeans.labels_)

print(data_df)

     ClusterLabels Postal Code           Borough  \
2                0         M5A  Downtown Toronto   
4                0         M7A  Downtown Toronto   
9                0         M5B  Downtown Toronto   
15               0         M5C  Downtown Toronto   
19               4         M4E      East Toronto   
20               0         M5E  Downtown Toronto   
24               0         M5G  Downtown Toronto   
25               3         M6G  Downtown Toronto   
30               0         M5H  Downtown Toronto   
31               1         M6H      West Toronto   
36               0         M5J  Downtown Toronto   
37               3         M6J      West Toronto   
41               4         M4K      East Toronto   
42               0         M5K  Downtown Toronto   
43               3         M6K      West Toronto   
47               4         M4L      East Toronto   
48               0         M5L  Downtown Toronto   
54               4         M4M      East Toronto   
61          

In [113]:
# Base map centred on the hard-coded coordinates (approximately Toronto).
map_clusters = folium.Map(location=[43.651070, -79.347015], zoom_start=11)

# One evenly spaced colour from the rainbow colormap per cluster, as hex strings.
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# Draw one circle marker per row, coloured by its cluster label.
for lat, lon, neighbourhood, cluster in zip(
    data_df['Latitude'],
    data_df['Longitude'],
    data_df['Neighborhood'],
    data_df['ClusterLabels'],
):
    # The popup shows the neighbourhood name followed by its cluster number.
    label = folium.Popup(str(neighbourhood) + ' Cluster ' + str(cluster), parse_html=True)
    # Labels are 0-based (0 .. kclusters - 1), so they index `rainbow` directly.
    marker_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=4,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.5,
    ).add_to(map_clusters)

# Last expression of the cell: renders the map inline.
map_clusters