Code to scrape the Wikipedia page listing Canadian postal codes that begin with M

In [20]:
import requests
import urllib.request
import time
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

In [21]:
# Wikipedia page that holds the postal-code table scraped by the following cells.
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook here on an HTTP error instead of letting
# later cells silently parse an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()

In [22]:
# Parse the downloaded HTML with Python's built-in "html.parser" backend.
soup = BeautifulSoup(response.text, features="html.parser")

In [23]:
# Collect every <tr> element in the parsed document. This is not limited to a
# single table; rows are filtered later by their number of <td> cells.
table_rows=soup.find_all('tr')

In [24]:
#Cleanse and convert to array
# Extract the text of every <td> cell, one list per table row. Rows without
# <td> cells produce an empty list.
lr = []
for table_row in table_rows:
    cells = table_row.find_all('td')
    # Clean every cell, not only the last one: stray newlines or surrounding
    # whitespace in the postcode/borough cells would otherwise break the later
    # 'Not assigned' replacement and the postcode matching.
    lr.append([cell.text.replace('\n', '').strip() for cell in cells])

# Keep only rows that have exactly three cells (postcode, borough, neighbourhood).
address = [row for row in lr if len(row) == 3]

address

[['M1A', 'Not assigned', 'Not assigned'],
 ['M2A', 'Not assigned', 'Not assigned'],
 ['M3A', 'North York', 'Parkwoods'],
 ['M4A', 'North York', 'Victoria Village'],
 ['M5A', 'Downtown Toronto', 'Harbourfront'],
 ['M5A', 'Downtown Toronto', 'Regent Park'],
 ['M6A', 'North York', 'Lawrence Heights'],
 ['M6A', 'North York', 'Lawrence Manor'],
 ['M7A', "Queen's Park", 'Not assigned'],
 ['M8A', 'Not assigned', 'Not assigned'],
 ['M9A', 'Etobicoke', 'Islington Avenue'],
 ['M1B', 'Scarborough', 'Rouge'],
 ['M1B', 'Scarborough', 'Malvern'],
 ['M2B', 'Not assigned', 'Not assigned'],
 ['M3B', 'North York', 'Don Mills North'],
 ['M4B', 'East York', 'Woodbine Gardens'],
 ['M4B', 'East York', 'Parkview Hill'],
 ['M5B', 'Downtown Toronto', 'Ryerson'],
 ['M5B', 'Downtown Toronto', 'Garden District'],
 ['M6B', 'North York', 'Glencairn'],
 ['M7B', 'Not assigned', 'Not assigned'],
 ['M8B', 'Not assigned', 'Not assigned'],
 ['M9B', 'Etobicoke', 'Cloverdale'],
 ['M9B', 'Etobicoke', 'Islington'],
 ['M9B', 

In [25]:
# Build the postcode table from the scraped rows, turn literal 'None' strings
# into NaN, and drop any row that contains a missing value.
ny_df = (
    pd.DataFrame.from_records(address, columns=["Postcode", "Borough", "Neighborhood"])
    .replace(to_replace='None', value=np.nan)
    .dropna()
)

In [26]:
# Preview the first ten rows of the freshly built table.
ny_df.head(10)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


In [27]:
# Treat the placeholder text 'Not assigned' as a missing value in every column,
# then preview the first ten rows.
ny_df = ny_df.replace({'Not assigned': np.nan})
ny_df.head(10)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,,
1,M2A,,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,
9,M8A,,


In [28]:
# Discard rows whose Borough is missing and renumber the remaining rows from 0,
# then preview the first ten rows.
ny_df = ny_df.dropna(subset=['Borough']).reset_index(drop=True)
ny_df.head(10)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


In [29]:
# Where Neighborhood is missing (NaN), fall back to the row's Borough.
# fillna targets real missing values only; the previous len(str(x)) == 3 test
# matched the text 'nan' but would also overwrite any genuine three-letter
# neighbourhood name. Assigning the whole column also avoids chained assignment
# (df[col][i] = ...), which raises SettingWithCopyWarning and does not write
# through when pandas copy-on-write is enabled.
ny_df['Neighborhood'] = ny_df['Neighborhood'].fillna(ny_df['Borough'])

In [30]:
# Collapse the table to one row per (Postcode, Borough) pair, joining that
# pair's Neighborhood values into a single comma-separated string, then turn
# the group keys back into ordinary columns.
ny_df_grouped = (
    ny_df.groupby(['Postcode', 'Borough'])['Neighborhood']
    .apply(','.join)
    .reset_index()
)
ny_df_grouped

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


In [31]:
# Preview the first rows of the grouped table.
ny_df_grouped.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [33]:
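# The code was removed by Watson Studio for sharing.
# NOTE(review): the removed cell presumably defined `geoloc_df` (columns
# 'Postal Code', 'Latitude', 'Longitude'), which later cells read. Restore
# that definition before re-running the notebook from a fresh kernel.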

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [34]:
# Work on a copy so that adding coordinate columns to merged_df later does not
# also modify ny_df_grouped (plain assignment only created a second name for
# the same DataFrame).
merged_df = ny_df_grouped.copy()

# Index the coordinates by postal code once, instead of rescanning geoloc_df
# for every row of merged_df. Duplicate postal codes keep their first entry so
# that each postcode maps to exactly one coordinate pair.
coords_by_postcode = (
    geoloc_df.drop_duplicates(subset='Postal Code')
    .set_index('Postal Code')[['Latitude', 'Longitude']]
)

# reindex returns exactly one row per merged_df row, in the same order.
# A postcode with no match yields NaN rather than being skipped, so the lists
# always line up with merged_df.
matched_coords = coords_by_postcode.reindex(merged_df['Postcode'].tolist())

latitudes = matched_coords['Latitude'].tolist()
longitude = matched_coords['Longitude'].tolist()

In [35]:
# Attach the `latitudes` / `longitude` lists to merged_df as new columns
# (in place), then preview the result.
for column_name, column_values in (('Latitude', latitudes), ('Longitude', longitude)):
    merged_df[column_name] = column_values
merged_df.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [None]:
# K-nearest-neighbours accuracy sweep.
# NOTE(review): X and y are not defined in any cell visible here; they must be
# created before this cell can run. Confirm where they come from.
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Hold out 20% of the samples for testing; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

# Fit one classifier per k in 1..24 and record its test-set accuracy, both
# keyed by k (scores) and in sweep order (score_list).
k_neighbor = range(1, 25)
scores = {}
score_list = []
for i in k_neighbor:
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    accuracy = metrics.accuracy_score(y_test, y_pred)
    scores[i] = accuracy
    score_list.append(accuracy)

In [None]:
import folium

# Base map centred on the fixed coordinates below.
m = folium.Map(location=[43.706686, -79.38000], zoom_start=11)

# folium cannot place a marker at NaN coordinates, so skip any postcode that
# has no latitude/longitude.
plottable = merged_df.dropna(subset=['Latitude', 'Longitude'])

# One circle marker per postcode; clicking a marker shows the postcode.
for lat, lng, label in zip(plottable['Latitude'], plottable['Longitude'], plottable['Postcode']):
    folium.CircleMarker(  # documented top-level name rather than folium.features
        location=[lat, lng],
        radius=5,  # marker size in pixels
        color='yellow',
        fill=True,
        popup=label,
        fill_color='blue',
        fill_opacity=0.6,
    ).add_to(m)

# Last expression of the cell so the notebook renders the map.
m