In [4]:
# Install the third-party packages imported later in this notebook:
# beautifulsoup4 (imported as bs4), geopy, and folium pinned to version 0.5.0.
# --yes answers conda's confirmation prompt so the cell can run unattended.
!conda install -c anaconda beautifulsoup4 --yes
!conda install -c conda-forge geopy --yes
!conda install -c conda-forge folium=0.5.0 --yes

Collecting package metadata: done
Solving environment: | 
The environment is inconsistent, please check the package plan carefully
The following packages are causing the inconsistency:

  - defaults/linux-64::anaconda==5.3.1=py37_0
  - defaults/linux-64::astropy==3.0.4=py37h14c3975_0
  - defaults/linux-64::bkcharts==0.2=py37_0
  - defaults/linux-64::blaze==0.11.3=py37_0
  - defaults/linux-64::bokeh==0.13.0=py37_0
  - defaults/linux-64::bottleneck==1.2.1=py37h035aef0_1
  - defaults/linux-64::dask==0.19.1=py37_0
  - defaults/linux-64::datashape==0.5.4=py37_1
  - defaults/linux-64::mkl-service==1.1.2=py37h90e4bf4_5
  - defaults/linux-64::numba==0.39.0=py37h04863e7_0
  - defaults/linux-64::numexpr==2.6.8=py37hd89afb7_0
  - defaults/linux-64::odo==0.5.1=py37_0
  - defaults/linux-64::pytables==3.4.4=py37ha205bf6_0
  - defaults/linux-64::pytest-arraydiff==0.2=py37h39e3cac_0
  - defaults/linux-64::pytest-astropy==0.4.0=py37_0
  - defaults/linux-64::pytest-doctestplus==0.1.3=py37_0
  - defaults

## Installing required Libraries

In [91]:
import numpy as np
import pandas as pd

import requests

from bs4 import BeautifulSoup

from geopy.geocoders import Nominatim

import folium
import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

from tqdm import tqdm

## Libraries Import

In [92]:
# Download the Wikipedia page listing Canadian postal codes that start with "M"
# and parse it into a BeautifulSoup tree for the scraping cell.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx instead of silently parsing an error page.
response.raise_for_status()
html_data = response.text
soup = BeautifulSoup(html_data, 'html.parser')

## Scraping the web for HTML data

In [93]:
# Collect postal code, borough and neighborhood from every data row of the
# postal-code table on the parsed page.
postal_table = soup.find('table', {'class' : 'wikitable sortable'})
if postal_table is None:
    # Give a clear error instead of "'NoneType' object has no attribute 'find_all'".
    raise ValueError("Could not find a 'wikitable sortable' table in the downloaded page")

post_code = []
borough = []
neighborhood = []
for row in tqdm(postal_table.find_all('tr')):
    columns = row.find_all('td')
    # Rows without <td> cells (e.g. the header row) are skipped; requiring all
    # three cells also avoids an IndexError on malformed or short rows.
    if len(columns) >= 3:
        # Strip every cell, not only the last one: stray whitespace or newlines
        # in a postal code would silently break the later join on that key.
        post_code.append(columns[0].text.strip())
        borough.append(columns[1].text.strip())
        neighborhood.append(columns[2].text.strip())

100%|██████████| 289/289 [00:00<00:00, 22062.83it/s]


In [94]:
# Assemble the three scraped lists into a table, one named column per list.
df = pd.DataFrame({
    'PostalCode': post_code,
    'Borough': borough,
    'Neighborhood': neighborhood,
})
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


## Converting scraped data to a DataFrame

In [95]:
# Keep only rows whose borough was actually assigned, then renumber from 0.
is_assigned = df['Borough'] != 'Not assigned'
df_dropna = df.loc[is_assigned].reset_index(drop=True)
df_dropna.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


#### Removing rows whose Borough is 'Not assigned'

In [96]:
# Collapse rows that share a postal code and borough into a single row whose
# Neighborhood is the comma-joined list of the group's neighborhoods.
df_grouped = (
    df_dropna
    .groupby(['PostalCode', 'Borough'], as_index=False)
    .agg({'Neighborhood': ','.join})
)
df_grouped.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


#### Grouping neighborhoods by postal code and borough

In [97]:
# Show the rows that still have no neighborhood assigned.
df_grouped[df_grouped['Neighborhood'] == 'Not assigned']

Unnamed: 0,PostalCode,Borough,Neighborhood
85,M7A,Queen's Park,Not assigned


#### Dealing with 'Not assigned' neighborhood

In [98]:
# Where no neighborhood was assigned, fall back to the borough name.
unassigned = df_grouped['Neighborhood'] == 'Not assigned'
df_grouped.loc[unassigned, 'Neighborhood'] = df_grouped.loc[unassigned, 'Borough']
# Spot-check positional row 85, the row listed as 'Not assigned' in this
# snapshot of the data (the position will differ if the source table changes).
df_grouped.iloc[85]

PostalCode               M7A
Borough         Queen's Park
Neighborhood    Queen's Park
Name: 85, dtype: object

#### Clean DataFrame

In [99]:
# Publish the cleaned table under its final name. An explicit copy keeps
# df_clean independent of df_grouped, so a later in-place edit to either frame
# cannot silently change the other.
df_clean = df_grouped.copy()
df_clean.shape

(103, 3)

## Get location data for Toronto and Borough

#### Read csv file

In [100]:
# Download the geospatial data and save it as toronto_coordinates.csv
# (-q: no progress output, -O: output file name); the next cell reads this file.
!wget -q -O "toronto_coordinates.csv" http://cocl.us/Geospatial_data

In [101]:
# Load the downloaded table of postal codes with their latitude/longitude.
coordinates_path = 'toronto_coordinates.csv'
coordinates = pd.read_csv(coordinates_path)
coordinates.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [102]:
# Attach latitude/longitude to each postal code: index both tables by postal
# code, keep only the codes present in both (inner join), then turn the shared
# index back into a regular 'Postal Code' column.
df_clean_temp = df_clean.set_index('PostalCode')
coordinates_temp = coordinates.set_index('Postal Code')
coordinates_df = (
    pd.concat([df_clean_temp, coordinates_temp], axis=1, join='inner')
    .rename_axis('Postal Code')
    .reset_index()
)
coordinates_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


#### Explore only those boroughs that have Toronto in their name

In [142]:
# Restrict the table to boroughs whose name contains 'Toronto'.
has_toronto = coordinates_df['Borough'].str.contains('Toronto')
toronto_df = coordinates_df.loc[has_toronto]
toronto_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188
42,M4L,East Toronto,"The Beaches West,India Bazaar",43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
