# Explore and cluster the neighborhoods in Toronto

## Step One:  
Scrape the Wikipedia page for the table of neighborhoods.  
Source page: https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M

In [148]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import lxml
import requests
from IPython.display import display
# Cap how many columns pandas renders so wide frames stay readable
pd.options.display.max_columns = 15
print('Modules Ready')

Modules Ready


In [71]:
# Download the Wikipedia page. The timeout keeps a stalled connection from
# hanging the kernel, and raise_for_status() stops on a 4xx/5xx reply instead
# of silently parsing an error page.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
wiki = response.text

# Parse the HTML with the lxml parser and confirm we got the expected page
soup = BeautifulSoup(wiki, 'lxml')
print(soup.title.text)
print(type(soup))

List of postal codes of Canada: M - Wikipedia
<class 'bs4.BeautifulSoup'>


In [212]:
# Optional: inspect the raw HTML with print(soup.prettify()).
# Left disabled here because it floods the final notebook with markup.

## Step Two: 
Find the pertinent data on the webpage and put it into a dataframe.

In [76]:
# Grab the first <tbody> element of the parsed page
table_body = soup.find(name='tbody')

In [221]:
# found help for this at https://datascience.stackexchange.com/questions/10857/how-to-scrape-a-table-from-a-webpage

In [214]:
# Collect every <tr> of the table; each one yields at most one dataframe row
rows = table_body.find_all('tr')
df_cols = ['Postcode', 'Borough', 'Neighbourhood']
print('starting with', len(rows), 'rows \n')

# Gather the rows as plain dicts and build the dataframe once at the end.
# DataFrame.append was removed in pandas 2.0, and calling it per row copied
# the whole frame on every iteration.
records = []
for row in rows:
    cells = [cell.text.strip() for cell in row.find_all('td')]
    if len(cells) < len(df_cols):
        # A row without a full set of <td> cells carries no data
        # (presumably the header row, which would use <th> cells).
        print('hit an empty row, moving on \n')
        continue
    if cells[0]:
        # Rows whose postcode cell is blank are skipped silently
        records.append(dict(zip(df_cols, cells)))

pc = pd.DataFrame(records, columns=df_cols)

print('output is a dataframe of this shape: ', pc.shape)
print('before cleaning, dataframe looks like\n\n', pc.head(10))

starting with 289 rows 

hit an empty row, moving on 

output is a dataframe of this shape:  (288, 3)
before cleaning, dataframe looks like

   Postcode           Borough     Neighbourhood
0      M1A      Not assigned      Not assigned
1      M2A      Not assigned      Not assigned
2      M3A        North York         Parkwoods
3      M4A        North York  Victoria Village
4      M5A  Downtown Toronto      Harbourfront
5      M5A  Downtown Toronto       Regent Park
6      M6A        North York  Lawrence Heights
7      M6A        North York    Lawrence Manor
8      M7A      Queen's Park      Not assigned
9      M8A      Not assigned      Not assigned


## Step Three: 
Clean up the dataframe

In [216]:
# Keep only the postcodes that have actually been assigned to a borough
has_borough = pc['Borough'] != 'Not assigned'
cleanpc = pc[has_borough]
cleanpc.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


Compare the M7A (Queen's Park) row in the dataframe above with the same row in the one below.

In [215]:
# Where a row has a borough but a 'Not assigned' neighbourhood, use the borough
# name as the neighbourhood.
# assign() returns a new frame, so this does not write into a filtered slice of
# `pc` (which raises SettingWithCopyWarning).
cleanpc = cleanpc.assign(
    Neighbourhood=np.where(
        cleanpc['Neighbourhood'] == 'Not assigned',
        cleanpc['Borough'],
        cleanpc['Neighbourhood'],
    )
)
cleanpc.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Queen's Park
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


In [218]:
# Collapse rows that share a postcode and borough into a single row,
# joining their neighbourhood names with commas
grouppc = (
    cleanpc
    .groupby(['Postcode', 'Borough'])['Neighbourhood']
    .apply(','.join)
    .reset_index()
)

## Step Four: 
Show the first few rows of the final dataframe, followed by its shape:

In [219]:
# Preview the first ten grouped rows (one row per postcode/borough pair)
grouppc.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


In [220]:
# Final (rows, columns) of the grouped dataframe
grouppc.shape

(103, 3)