**First import the required libraries**

In [0]:
import urllib
import pandas as pd
import lxml.html as lh

**Then parse the website and get all data in the table**

In [69]:
# Link to the wiki page and parse data as binary string
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
web_data = urllib.request.urlopen(url).read()


# use lxml library to translate binary 
# string into HTML select the 'wikitable' class table element
doc = lh.fromstring(web_data)
table = doc.find_class('wikitable')

# Get the data in each cell
data = []
for i in range(len(table[0][0])):
  pc = table[0][0][i][0].text_content().replace('\n', '')
  b = table[0][0][i][1].text_content().replace('\n', '')
  nb = table[0][0][i][2].text_content().replace('\n', '')
  # ignore rows where borough is Not assigned
  if b != 'Not assigned':
    # convert 'Not assigned' neighborhoods to their relative borough 
    if nb == 'Not assigned':
      nb = b
    data.append((pc, b, nb))

# remove the first as it is the header row
data.remove(data[0])

# transform data into dataframe
df = pd.DataFrame(data, 
                  columns=['Postal Code', 'Borough', 'Neighborhood'])

# group rows with same postal code and borough
df = df.groupby(['Postal Code', 'Borough'], 
                as_index=False).agg({'Neighborhood':list})
df

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"[Rouge, Malvern]"
1,M1C,Scarborough,"[Highland Creek, Rouge Hill, Port Union]"
2,M1E,Scarborough,"[Guildwood, Morningside, West Hill]"
3,M1G,Scarborough,[Woburn]
4,M1H,Scarborough,[Cedarbrae]
5,M1J,Scarborough,[Scarborough Village]
6,M1K,Scarborough,"[East Birchmount Park, Ionview, Kennedy Park]"
7,M1L,Scarborough,"[Clairlea, Golden Mile, Oakridge]"
8,M1M,Scarborough,"[Cliffcrest, Cliffside, Scarborough Village West]"
9,M1N,Scarborough,"[Birch Cliff, Cliffside West]"


In [49]:
# get the dimension of the dataframe
df.shape

(103, 3)

In [64]:
import geocoder as geo

coor = None

g = geo.google('Canada')
print(g)

<[REQUEST_DENIED] Google - Geocode [empty]>


**As you can see the result above,  I cannot even search for an address as simple as Canada.
I  am using the csv file provided instead.**


In [72]:
# Simply join the 2 dataframes on the Postal Code column
coor_df = pd.read_csv('http://cocl.us/Geospatial_data')
new_df = pd.merge(df, coor_df, on='Postal Code')
new_df

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"[Rouge, Malvern]",43.806686,-79.194353
1,M1C,Scarborough,"[Highland Creek, Rouge Hill, Port Union]",43.784535,-79.160497
2,M1E,Scarborough,"[Guildwood, Morningside, West Hill]",43.763573,-79.188711
3,M1G,Scarborough,[Woburn],43.770992,-79.216917
4,M1H,Scarborough,[Cedarbrae],43.773136,-79.239476
5,M1J,Scarborough,[Scarborough Village],43.744734,-79.239476
6,M1K,Scarborough,"[East Birchmount Park, Ionview, Kennedy Park]",43.727929,-79.262029
7,M1L,Scarborough,"[Clairlea, Golden Mile, Oakridge]",43.711112,-79.284577
8,M1M,Scarborough,"[Cliffcrest, Cliffside, Scarborough Village West]",43.716316,-79.239476
9,M1N,Scarborough,"[Birch Cliff, Cliffside West]",43.692657,-79.264848


In [0]:
import folium
from sklearn.cluster import KMeans

In [99]:

# cluster areas into 5 clusters
km = KMeans(n_clusters=5, n_init=12)
feat = new_df[['Latitude','Longitude']]
km.fit(feat)

km

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=5, n_init=12, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [100]:
lbl = km.labels_

new_df['Label'] = lbl

# get the labels and add to the dataframe
cnt = new_df['label'].value_counts()
print(cnt)       

map = folium.Map(location=[43.6969476,	-79.4113072], zoom_start=11)

# plot the areas onto the map
for lat, lng, label, cluster in zip(new_df['Latitude'], new_df['Longitude'], new_df['Postal Code'], new_df['Label']):
    clr = ['blue', 'red', 'orange', 'green', 'black']
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        color=clr[cluster],
        popup=label,
        fill = True,
        fill_color= clr[cluster],
        fill_opacity=0.6
    ).add_to(map)
    
map

0    32
2    24
1    20
4    17
3    10
Name: label, dtype: int64


According to the map, there are more areas closer to the airport. Number of areas decreases when the distance from the airport increases.