In [12]:
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

# using BeautifulSoup to get the page content

In [13]:
page = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').content
soup = BeautifulSoup(page, 'html.parser')
#soup#.prettify()

# taking the table out of the html content

In [14]:
all_tables=soup.find_all("table")
right_table=soup.find('table', class_='wikitable sortable')
A,B,C=[],[],[]
for row in right_table.findAll('tr'):
    cells=row.findAll('td')
    if len(cells)==3:
        A.append(cells[0].find(text=True).replace('\n',''))
        B.append(cells[1].find(text=True).replace('\n',''))
        C.append(cells[2].find(text=True).replace('\n',''))

# creating a dataframe from the page extracted

In [15]:
df=pd.DataFrame()
df['PostalCode']=A
df['Borough']=B
df['Neighborhood']=C
df = df[df['Borough'] != 'Not assigned']
len(df[df['Neighborhood'] == 'Not assigned']) == 0 # every neighborhood is assigned
df = df.set_index(pd.Index(np.arange(len(df))))
df.shape

(103, 3)

In [16]:
df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


# adding latitude and longitude to the dataframe

    # import geocoder # import geocoder

    # initialize your variable to None
lat_lng_coords = None
df['Latitude'] = None
df['Longitude'] = None
    # loop until you get the coordinates
for i,postal_code in enumerate(df['PostalCode']):
    print(i)
    while(lat_lng_coords is None):
        g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
        lat_lng_coords = g.latlng

    latitude = lat_lng_coords[0]
    longitude = lat_lng_coords[1]
    df['Latitude'].iloc[i] = latitude
    df['Longitude'].iloc[i] = longitude

In [18]:
temp = pd.read_csv('./data/Geospatial_Coordinates.csv')
df['Latitude'] = temp['Latitude']
df['Longitude'] = temp['Longitude']
df

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.806686,-79.194353
1,M4A,North York,Victoria Village,43.784535,-79.160497
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.763573,-79.188711
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.770992,-79.216917
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.773136,-79.239476
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.706876,-79.518188
99,M4Y,Downtown Toronto,Church and Wellesley,43.696319,-79.532242
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.688905,-79.554724
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.739416,-79.588437


# clustering based on latitude and longitude

In [19]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
X = df[['Latitude','Longitude']]
scaler = StandardScaler().fit(X)
X = scaler.transform(X)
kk = KMeans(n_clusters =5, random_state= 42).fit(X)
df['labels'] = kk.labels_
kk.labels_[::7]

array([3, 2, 3, 2, 2, 2, 0, 0, 0, 2, 0, 0, 4, 4, 1], dtype=int32)

In [20]:
import folium
import matplotlib.pyplot as plt

kclusters = 5
c = ['blue','red','yellow','magenta','orange']


geomap = folium.Map(location = [43.6532, -79.3832])

for lat,lon, cluster in zip(df['Latitude'], df['Longitude'], df['labels']):
    folium.CircleMarker([lat, lon],radius=5, color = c[cluster]).add_to(geomap)

In [21]:
geomap