In [1]:
import numpy as np 
import pandas as pd 
from bs4 import BeautifulSoup
import requests

In [2]:
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(source, 'lxml')

In [3]:
class Scrapy:
       
        def parse_url(self, url):
            response = requests.get(url)
            soup = BeautifulSoup(response.text, 'lxml')
            return [(self.parse_html_table(table))\
                    for table in soup.find_all('table', class_="wikitable sortable")]  
    
        def parse_html_table(self, table):
            n_columns = 0
            n_rows=0
            column_names = []
            for row in table.find_all('tr'):
                td_tags = row.find_all('td')
                if len(td_tags) > 0:
                    n_rows+=1
                    if n_columns == 0:
                        n_columns = len(td_tags)
                        
                th_tags = row.find_all('th') 
                if len(th_tags) > 0 and len(column_names) == 0:
                    for th in th_tags:
                        column_names.append(th.get_text())
    
            if len(column_names) > 0 and len(column_names) != n_columns:
                raise Exception("Column titles do not match the number of columns")
    
            columns = column_names if len(column_names) > 0 else range(0,n_columns)
            df = pd.DataFrame(columns = columns,
                              index= range(0,n_rows))
            row_marker = 0
            for row in table.find_all('tr'):
                column_marker = 0
                columns = row.find_all('td')
                for column in columns:
                    df.iat[row_marker,column_marker] = column.get_text()
                    column_marker += 1
                if len(columns) > 0:
                    row_marker += 1
                    
            for col in df:
                try:
                    df[col] = df[col].astype(float)
                except ValueError:
                    pass
            
            return df

In [11]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
hp = Scrapy()
table = hp.parse_url(url)[0] 
table.head(10)

table.rename(columns={"Postal Code\n": "Postal Code", "Borough\n": "Borough", "Neighborhood\n" : "Neighborhood"})
table.head(10)


Unnamed: 0,Postal Code\n,Borough\n,Neighborhood\n
0,M1A\n,Not assigned\n,Not assigned\n
1,M2A\n,Not assigned\n,Not assigned\n
2,M3A\n,North York\n,Parkwoods\n
3,M4A\n,North York\n,Victoria Village\n
4,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront\n"
5,M6A\n,North York\n,"Lawrence Manor, Lawrence Heights\n"
6,M7A\n,Downtown Toronto\n,"Queen's Park, Ontario Provincial Government\n"
7,M8A\n,Not assigned\n,Not assigned\n
8,M9A\n,Etobicoke\n,"Islington Avenue, Humber Valley Village\n"
9,M1B\n,Scarborough\n,"Malvern, Rouge\n"


### Remove the not assigned

In [17]:
table = table[table["Borough\n"] != 'Not assigned']
table.head(10)

Unnamed: 0,Postal Code\n,Borough\n,Neighborhood\n
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
7,M8A,Not assigned,Not assigned
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"


In [19]:
table = table[table['Neighborhood\n'] != 'Not assigned']
table.head(10)

Unnamed: 0,Postal Code\n,Borough\n,Neighborhood\n
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
7,M8A,Not assigned,Not assigned
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"


### Group neighborhood postcodes

In [23]:
df = table.groupby(['Postal Code\n','Borough\n'])['Neighborhood\n'].apply(lambda x: ", ".join(x.astype(str))).reset_index()
df = df.sample(frac=1).reset_index(drop=True)
df.head(10)

Unnamed: 0,Postal Code\n,Borough\n,Neighborhood\n
0,M2N,North York,"Willowdale, Willowdale East"
1,M8J,Not assigned,Not assigned
2,M2Z,Not assigned,Not assigned
3,M6N,York,"Runnymede, The Junction North"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M8G,Not assigned,Not assigned
6,M5N,Central Toronto,Roselawn
7,M5H,Downtown Toronto,"Richmond, Adelaide, King"
8,M6M,York,"Del Ray, Mount Dennis, Keelsdale and Silverthorn"
9,M4G,East York,Leaside


In [24]:
print(df.shape)

(180, 3)


# PART 2

### Geospacial Data

In [25]:
url2="http://cocl.us/Geospatial_data"
geo_data=pd.read_csv(url2)
geo_data.head(10)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


In [52]:
print(list(df))
print(list(geo_data))

full_table = df.join(geo_data)
full_table.head(20)

full_table = full_table[full_table['Latitude'].notna()]
full_table = full_table[full_table['Longitude'].notna()]
full_table.head(10)

['Postal Code\n', 'Borough\n', 'Neighborhood\n']
['Postal Code', 'Latitude', 'Longitude']


Unnamed: 0,Postal Code\n,Borough\n,Neighborhood\n,Postal Code,Latitude,Longitude
0,M2N,North York,"Willowdale, Willowdale East",M1B,43.806686,-79.194353
1,M8J,Not assigned,Not assigned,M1C,43.784535,-79.160497
2,M2Z,Not assigned,Not assigned,M1E,43.763573,-79.188711
3,M6N,York,"Runnymede, The Junction North",M1G,43.770992,-79.216917
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",M1H,43.773136,-79.239476
5,M8G,Not assigned,Not assigned,M1J,43.744734,-79.239476
6,M5N,Central Toronto,Roselawn,M1K,43.727929,-79.262029
7,M5H,Downtown Toronto,"Richmond, Adelaide, King",M1L,43.711112,-79.284577
8,M6M,York,"Del Ray, Mount Dennis, Keelsdale and Silverthorn",M1M,43.716316,-79.239476
9,M4G,East York,Leaside,M1N,43.692657,-79.264848


In [53]:
from geopy.geocoders import Nominatim 
import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

import folium

In [54]:
address = 'Toronto'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


  This is separate from the ipykernel package so we can avoid doing imports until


In [56]:
map_geo = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(full_table['Latitude'], full_table['Longitude'], full_table['Neighborhood\n']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_geo)  
    
map_geo