## Segmenting and Clustering Neighborhoods in Toronto - Part 1

##### by Ahmed 

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Install the scraping dependencies from inside the notebook via shell escapes.
# beautifulsoup4 and requests are imported in a later cell, and lxml is the
# parser name passed to BeautifulSoup there.
# NOTE(review): html5lib is installed but no visible cell selects it as a
# parser — confirm it is still needed.
# NOTE(review): no versions are pinned, so a fresh environment may resolve
# different releases than the ones this notebook was last executed with.
!pip install beautifulsoup4
!pip install lxml
!pip install html5lib
!pip install requests


Requirement not upgraded as not directly required: beautifulsoup4 in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages
Requirement not upgraded as not directly required: lxml in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages
Requirement not upgraded as not directly required: html5lib in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages
Requirement not upgraded as not directly required: six in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from html5lib)
Requirement not upgraded as not directly required: webencodings in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from html5lib)
Requirement not upgraded as not directly required: setuptools>=18.5 in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from html5lib)
Requirement not upgraded as not directly required: requests in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages
Requirement not upgraded as not directly required: chardet<3.1.0,>=3.0.2 in /opt/conda/envs/DSX-Python3

In [3]:
from bs4 import BeautifulSoup
import requests

In [4]:
# Download the Wikipedia page that lists Canadian postal codes starting with
# "M" and parse the HTML with BeautifulSoup's lxml parser.
WIKI_URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# The timeout stops the cell from hanging indefinitely on a stalled connection.
# raise_for_status() aborts on an HTTP error (4xx/5xx) so that an error page is
# never parsed as if it were the article.
response = requests.get(WIKI_URL, timeout=30)
response.raise_for_status()
source = response.text

soup = BeautifulSoup(source, 'lxml')
# soup.prettify() is deliberately not displayed: too much data to show on GitHub.

In [5]:
# Take the first <table> element of the parsed page, read all the text inside
# its <tbody>, and split that text into one list entry per line.
table = soup.table
table_raw = table.find('tbody').get_text()

raw_list = table_raw.splitlines()

# Preview the first 20 entries (blank lines are still present at this point)
raw_list[0:20]


['',
 'Postcode',
 'Borough',
 'Neighbourhood',
 '',
 '',
 'M1A',
 'Not assigned',
 'Not assigned',
 '',
 '',
 'M2A',
 'Not assigned',
 'Not assigned',
 '',
 '',
 'M3A',
 'North York',
 'Parkwoods',
 '']

In [6]:
# Drop the empty strings that blank lines in the table text left in raw_list.
# One filtering pass replaces the repeated raw_list.remove('') calls, each of
# which rescanned the list from the front. Slice assignment keeps the same list
# object, so the list is still modified in place.
# NOTE(review): a table cell that is genuinely empty would be dropped here too,
# shifting every later value by one column — confirm the source table never
# contains blank cells.
raw_list[:] = [entry for entry in raw_list if entry != '']

raw_list[0:10]

['Postcode',
 'Borough',
 'Neighbourhood',
 'M1A',
 'Not assigned',
 'Not assigned',
 'M2A',
 'Not assigned',
 'Not assigned',
 'M3A']

In [7]:
# Placing data from table into lists for 'Postcode', 'Borough', and 'Neighbourhood'

# The first three entries of raw_list are the column headings; every group of
# three entries after that is one table row.
N_COLS = 3
cells = raw_list[N_COLS:]

# Fail early with a clear message if the flattened table is not made of complete
# rows, rather than raising an IndexError halfway through filling the columns.
if len(cells) % N_COLS != 0:
    raise ValueError(
        'Expected a multiple of %d table cells after the header, got %d'
        % (N_COLS, len(cells))
    )

# Strided slices pick every third entry, starting at each column's offset
col_1_ = cells[0::N_COLS]  # Postcode
col_2_ = cells[1::N_COLS]  # Borough
col_3_ = cells[2::N_COLS]  # Neighbourhood


In [8]:
# Assemble the three column lists into a DataFrame, fixing the column order
# at construction time
column_order = ['Postcode', 'Borough', 'Neighbourhood']
data = dict(zip(column_order, [col_1_, col_2_, col_3_]))
df = pd.DataFrame(data, columns=column_order)
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


In [9]:
# Collect the row labels whose Borough column holds the placeholder "Not assigned"
unassigned_borough = df['Borough'] == 'Not assigned'
index_na = df[unassigned_borough].index.tolist()

# Build df_2 as df without those rows; drop() returns a new frame, so df
# itself is left unchanged
df_2 = df.drop(index_na, axis=0)

df_2.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


In [10]:
# Replace every 'Not assigned' Neighbourhood with the Borough name of the same row.
# Rows are selected by value rather than by the hard-coded label 8, so the cell
# keeps working if the source table changes and can never overwrite an unrelated
# row that happens to carry that label.
no_neighbourhood = df_2['Neighbourhood'] == 'Not assigned'
df_2.loc[no_neighbourhood, 'Neighbourhood'] = df_2.loc[no_neighbourhood, 'Borough']
df_2.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Queen's Park
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


In [11]:
# Collapse rows that share a Postcode/Borough pair into a single row whose
# Neighbourhood is the comma-separated list of the original values.
# sort=False is passed so the groups are not reordered by key.
postcode_groups = df_2.groupby(['Postcode', 'Borough'], sort=False)
df3 = postcode_groups.agg(', '.join)
df3.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Neighbourhood
Postcode,Borough,Unnamed: 2_level_1
M3A,North York,Parkwoods
M4A,North York,Victoria Village
M5A,Downtown Toronto,"Harbourfront, Regent Park"
M6A,North York,"Lawrence Heights, Lawrence Manor"
M7A,Queen's Park,Queen's Park
M9A,Etobicoke,Islington Avenue
M1B,Scarborough,"Rouge, Malvern"
M3B,North York,Don Mills North
M4B,East York,"Woodbine Gardens, Parkview Hill"
M5B,Downtown Toronto,"Ryerson, Garden District"


In [12]:
# Main DataFrame: move the Postcode and Borough index levels back into ordinary
# columns, leaving a default integer row index.
# Note that this rebinds df, which previously held the ungrouped scraped table.
df = df3.reset_index(level=['Postcode', 'Borough'])
df.head(20)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


In [13]:
# (rows, columns) of the final table, which has one row per Postcode/Borough group
df.shape

(103, 3)