In [1]:
import pandas as pd
import numpy as np
import requests
import bs4 # I am importing this as part of instructions that I will need BeautifulSoup()

In [2]:
res = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
res.raise_for_status()
soupObject = bs4.BeautifulSoup(res.text, "lxml")

In [3]:
soupObject.select('td')

[<td>M1A</td>, <td>Not assigned</td>, <td>Not assigned
 </td>, <td>M2A</td>, <td>Not assigned</td>, <td>Not assigned
 </td>, <td>M3A</td>, <td><a href="/wiki/North_York" title="North York">North York</a></td>, <td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
 </td>, <td>M4A</td>, <td><a href="/wiki/North_York" title="North York">North York</a></td>, <td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
 </td>, <td>M5A</td>, <td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>, <td><a href="/wiki/Harbourfront_(Toronto)" title="Harbourfront (Toronto)">Harbourfront</a>
 </td>, <td>M5A</td>, <td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>, <td><a href="/wiki/Regent_Park" title="Regent Park">Regent Park</a>
 </td>, <td>M6A</td>, <td><a href="/wiki/North_York" title="North York">North York</a></td>, <td><a href="/wiki/Lawrence_Heights" title="Lawrence Heights">Lawrence Heights

In [4]:
# now I have to make a list will all cells in the tables
cells = []

# this is used to get what is contained between all the <td> </td>
for index, value in enumerate(soupObject.select('td')):
    cells.append(str(soupObject.select('td')[index])[4:-5]) # we slice the string first four characters (<td>) and the last five characters (</td>)

# remove lines    
for index, value in enumerate(cells):
    cells[index] = cells[index].rstrip('\n')

# I will keep keep only the title for the Boroughs and Neighbourhoods
for index, value in enumerate(cells):
    cells[index] = cells[index].split('>')
    try:
        cells[index] = cells[index][-2].rstrip('</a')
    except IndexError:
        cells[index] = cells[index]
cells

[['M1A'],
 ['Not assigned'],
 ['Not assigned'],
 ['M2A'],
 ['Not assigned'],
 ['Not assigned'],
 ['M3A'],
 'North York',
 'Parkwoods',
 ['M4A'],
 'North York',
 'Victoria Village',
 ['M5A'],
 'Downtown Toronto',
 'Harbourfront',
 ['M5A'],
 'Downtown Toronto',
 'Regent Park',
 ['M6A'],
 'North York',
 'Lawrence Heights',
 ['M6A'],
 'North York',
 'Lawrence Manor',
 ['M7A'],
 "Queen's Park",
 ['Not assigned'],
 ['M8A'],
 ['Not assigned'],
 ['Not assigned'],
 ['M9A'],
 'Etobicoke',
 'Islington Avenue',
 ['M1B'],
 'Scarborough',
 'Rouge',
 ['M1B'],
 'Scarborough',
 'Malvern',
 ['M2B'],
 ['Not assigned'],
 ['Not assigned'],
 ['M3B'],
 'North York',
 ['Don Mills North'],
 ['M4B'],
 'East York',
 'Woodbine Gardens',
 ['M4B'],
 'East York',
 'Parkview Hill',
 ['M5B'],
 'Downtown Toronto',
 'Ryerson',
 ['M5B'],
 'Downtown Toronto',
 ['Garden District'],
 ['M6B'],
 'North York',
 'Glencairn',
 ['M7B'],
 ['Not assigned'],
 ['Not assigned'],
 ['M8B'],
 ['Not assigned'],
 ['Not assigned'],
 ['M9B']

In [5]:
# this helps me get rid of the sublists
for index, value in enumerate(cells):
    if type(cells[index]) == list:
        cells[index] = cells[index][0]

cells

['M1A',
 'Not assigned',
 'Not assigned',
 'M2A',
 'Not assigned',
 'Not assigned',
 'M3A',
 'North York',
 'Parkwoods',
 'M4A',
 'North York',
 'Victoria Village',
 'M5A',
 'Downtown Toronto',
 'Harbourfront',
 'M5A',
 'Downtown Toronto',
 'Regent Park',
 'M6A',
 'North York',
 'Lawrence Heights',
 'M6A',
 'North York',
 'Lawrence Manor',
 'M7A',
 "Queen's Park",
 'Not assigned',
 'M8A',
 'Not assigned',
 'Not assigned',
 'M9A',
 'Etobicoke',
 'Islington Avenue',
 'M1B',
 'Scarborough',
 'Rouge',
 'M1B',
 'Scarborough',
 'Malvern',
 'M2B',
 'Not assigned',
 'Not assigned',
 'M3B',
 'North York',
 'Don Mills North',
 'M4B',
 'East York',
 'Woodbine Gardens',
 'M4B',
 'East York',
 'Parkview Hill',
 'M5B',
 'Downtown Toronto',
 'Ryerson',
 'M5B',
 'Downtown Toronto',
 'Garden District',
 'M6B',
 'North York',
 'Glencairn',
 'M7B',
 'Not assigned',
 'Not assigned',
 'M8B',
 'Not assigned',
 'Not assigned',
 'M9B',
 'Etobicoke',
 'Cloverdale',
 'M9B',
 'Etobicoke',
 'Islington',
 'M9B',
 

# Now I can create my dataframe

In [6]:
# I will first create an empty dataframe with my headers
column_names = ['Postalcode','Borough','Neighbourhood']
df = pd.DataFrame(columns=column_names)

# we have three values of the list to fill per row of the dataframe
x = 0
for index, value in enumerate(cells):
    try:
        df.loc[index] = [cells[x], cells[x+1], cells[x+2]]
        x = x + 3
    except IndexError:
        break
df

Unnamed: 0,Postalcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


In [7]:
# This will get rid of the postal codes for which the borough are unavailable
for row in range(len(df)):
    if df.loc[row]['Borough'] == 'Not assigned':
        df.drop(row, axis = 0, inplace = True)

In [8]:
# This will put Neighbourhood, Boroughm when Neighbourhood is not assigned
df.loc[df['Neighbourhood'] == 'Not assigned', 'Neighbourhood'] = df['Borough']
df

Unnamed: 0,Postalcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Queen's Park
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


In [9]:
# this will combine the neighbourhoods for each postal code
df = df.groupby(['Postalcode','Borough'])['Neighbourhood'].apply(', '.join)
df = pd.DataFrame(df.reset_index(name = ''))
df.columns = ['Postalcode', 'Borough', 'Neighbourhood']
df

Unnamed: 0,Postalcode,Borough,Neighbourhood
0,,,
1,</table,NL,NS
2,AB,BC,NT
3,C,E,G
4,H,J,K
5,L,M,N
6,M1B,Scarborough,"Rouge, Malvern"
7,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
8,M1E,Scarborough,"Guildwood, Morningside, West Hill"
9,M1G,Scarborough,Woburn


# Lets get the shape using .shape

In [10]:
df.shape

(114, 3)

## The data frame is ready. First things first, I am going to get the latitude and longitude for the postal codes

In [11]:
import requests

In [12]:
# I am going to create a new dataframe with the coordinates from the csv file given
url = 'http://cocl.us/Geospatial_data'
df_coords = pd.read_csv(url)
df_coords.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [13]:
# so now that we got the coordinates of each postal code, we can do a join to create 1 dataframe
merged_df = pd.concat([df, df_coords], axis=1, join='inner')
merged_df.drop('Postal Code', axis = 1, inplace = True)
merged_df.head(12)

Unnamed: 0,Postalcode,Borough,Neighbourhood,Latitude,Longitude
0,,,,43.806686,-79.194353
1,</table,NL,NS,43.784535,-79.160497
2,AB,BC,NT,43.763573,-79.188711
3,C,E,G,43.770992,-79.216917
4,H,J,K,43.773136,-79.239476
5,L,M,N,43.744734,-79.239476
6,M1B,Scarborough,"Rouge, Malvern",43.727929,-79.262029
7,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.711112,-79.284577
8,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.716316,-79.239476
9,M1G,Scarborough,Woburn,43.692657,-79.264848


## Part 3: Visualise the clusters

In [14]:
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes
import folium

print('Libraries imported.')

Solving environment: done

## Package Plan ##

  environment location: /home/jupyterlab/conda

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    altair-2.3.0               |        py36_1001         533 KB  conda-forge
    branca-0.3.1               |             py_0          25 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         631 KB

The following NEW packages will be INSTALLED:

    altair:  2.3.0-py36_1001 conda-forge
    branca:  0.3.1-py_0      conda-forge
    folium:  0.5.0-py_0      conda-forge
    vincent: 0.4.4-py_1      conda-forge


Downloading and Extracting Packages
vincent-0.4.4        |

In [15]:
latitude = 43.806686
longitude = -79.194353
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

for lat, lng, borough, neighbourhood in zip(merged_df['Latitude'], merged_df['Longitude'], merged_df['Borough'], merged_df['Neighbourhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='purple',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto