##  Segmenting and Clusting Neighborhoods in Toronto
### By Kevin Chou

In [338]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np


#create empty array for ca postal data
ca_postal = []
#get html data from site
html = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(html, "html.parser")
#identify table to scrape
ca_table = soup.find_all('table')[0]

#try clause to skip any companies with missing/empty board member tables
try:
#loop through table, grab each of the 3 columns
    for row in ca_table.find_all('tr'):
        cols = row.find_all('td')
        if len(cols) == 3:
            ca_postal.append((cols[0].text.strip(), cols[1].text.strip(), cols[2].text.strip()))
except: pass  


In [339]:
df = pd.DataFrame(ca_postal)
#rename columns, check output
df.columns = ['PostalCode','Borough','Neighborhood']
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


In [328]:
#Replace 'Not assigned' in Neibourhood with values from Borough
df["Neighbourhood"]= df["Neighbourhood"].replace("Not assigned",df["Borough"] ) 
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Queen's Park
9,M8A,Not assigned,Not assigned


In [340]:
# Delete records where column Borough has 'Not assigned' values
df = df.set_index("Borough")
df = df.drop("Not assigned", axis=0) # Delete all rows with label "Not assigned"

In [341]:
df.head(10)

Unnamed: 0_level_0,PostalCode,Neighborhood
Borough,Unnamed: 1_level_1,Unnamed: 2_level_1
North York,M3A,Parkwoods
North York,M4A,Victoria Village
Downtown Toronto,M5A,Harbourfront
Downtown Toronto,M5A,Regent Park
North York,M6A,Lawrence Heights
North York,M6A,Lawrence Manor
Queen's Park,M7A,Not assigned
Etobicoke,M9A,Islington Avenue
Scarborough,M1B,Rouge
Scarborough,M1B,Malvern


In [331]:
df = df.reset_index()
df.head(10)

Unnamed: 0,Borough,PostalCode,Neighbourhood
0,North York,M3A,Parkwoods
1,North York,M4A,Victoria Village
2,Downtown Toronto,M5A,Harbourfront
3,Downtown Toronto,M5A,Regent Park
4,North York,M6A,Lawrence Heights
5,North York,M6A,Lawrence Manor
6,Queen's Park,M7A,Queen's Park
7,Etobicoke,M9A,Islington Avenue
8,Scarborough,M1B,Rouge
9,Scarborough,M1B,Malvern


In [343]:
#Group by 'Borough','Postcode' and concat the values of Neighbourhood
df= (df.groupby(['Borough','PostalCode'])["Neighborhood"].apply(lambda x: "%s" % ', '.join(x))).reset_index()

In [344]:
df.head()

Unnamed: 0,Borough,PostalCode,Neighborhood
0,Central Toronto,M4N,Lawrence Park
1,Central Toronto,M4P,Davisville North
2,Central Toronto,M4R,North Toronto West
3,Central Toronto,M4S,Davisville
4,Central Toronto,M4T,"Moore Park, Summerhill East"


In [345]:
#Reorder the columns 
df = df[['PostalCode','Borough','Neighborhood']]
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M4N,Central Toronto,Lawrence Park
1,M4P,Central Toronto,Davisville North
2,M4R,Central Toronto,North Toronto West
3,M4S,Central Toronto,Davisville
4,M4T,Central Toronto,"Moore Park, Summerhill East"
5,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi..."
6,M5N,Central Toronto,Roselawn
7,M5P,Central Toronto,"Forest Hill North, Forest Hill West"
8,M5R,Central Toronto,"The Annex, North Midtown, Yorkville"
9,M4W,Downtown Toronto,Rosedale


In [346]:
df.shape

(103, 3)

In [347]:
#export cleaned data set to csv for use in the next lab exercise
df.to_csv('ca_neighbor.csv', encoding='utf-8', index=False)