# Segmenting and Clustering Neighborhoods in Toronto

In [3]:
import pandas as pd
import requests
import lxml.html as lh
from urllib.request import urlopen
from bs4 import BeautifulSoup

Download the page content from Wikipedia

In [4]:
# Wikipedia page that lists the Canadian postal codes starting with "M".
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# Fetch the page and parse it while the connection is still open. The `with`
# block closes the HTTP response as soon as parsing is done, so the socket is
# not left open for the lifetime of the kernel.
with urlopen(url) as html:
    soup = BeautifulSoup(html, 'lxml')

Use BeautifulSoup library to get the table contents

In [5]:
import re

# Non-greedy pattern matching any single HTML tag. Compiled once, outside the
# loop, because it is the same for every row.
tag_pattern = re.compile('<.*?>')

# Every <tr> element found in the parsed page.
rows = soup.find_all('tr')

# For each row, take its <td> cells, turn the resulting list into its string
# form ("[<td>..</td>, <td>..</td>]") and strip the tags. Each entry of
# list_rows is therefore one string that still carries the surrounding
# brackets and the comma separators of the list representation.
list_rows = []
for row in rows:
    cells = row.find_all('td')
    str_cells = str(cells)
    list_rows.append(re.sub(tag_pattern, '', str_cells))

Load the table data into a pandas dataframe

In [6]:
# Put the scraped row strings into a one-column frame and split every string
# on commas so that each piece gets its own column.
df = pd.DataFrame(list_rows)[0].str.split(',', expand=True)
# Remove the list brackets left at both ends of each row string.
df[0] = df[0].str.strip('[')
# The closing bracket has to go first; only then is the newline removed.
df[2] = df[2].str.strip(']').str.replace('\n','')

Get Table Header by looking for the 'th' tag

In [7]:
# Collect every <th> element of the page.
col_labels = soup.find_all('th')
# Turn the list of tags into its string form and let BeautifulSoup strip the
# markup, leaving one text string with all header labels.
cleantext2 = BeautifulSoup(str(col_labels), "lxml").get_text()
# Single-element list so it can be loaded as a one-row DataFrame.
all_header = [cleantext2]

In [8]:
# Load the header string into a frame and split it on commas, one label per
# column.
df2 = pd.DataFrame(all_header)[0].str.split(',', expand=True)
# Discard the column labelled 3.
df2 = df2.drop(columns=[3])
# Strip the opening list bracket and the newline left in the labels.
df2[0] = df2[0].str.strip('[')
df2[2] = df2[2].str.replace('\n','')

In [9]:
# Stack the header frame (df2) on top of the contents frame (df). The original
# row labels are kept, so label 0 now occurs in both parts.
df = pd.concat([df2, df])
# Promote the first row (the scraped header) to column labels.
header_row = df.iloc[0]
df = df.rename(columns=header_row)
# Keep only the first three columns.
df = df.iloc[:, :3]
# Trim the leading whitespace left in the labels by the comma split.
df.columns = df.columns.str.lstrip()
# Remove every row labelled 0: the header row and the first contents row,
# which share that label after the concatenation.
df = df.drop(index=[0])

Remove Not assigned rows

In [10]:
# Drop the rows whose borough is 'Not assigned'. na=True makes rows with a
# missing borough count as matches, so they are dropped as well.
borough_unassigned = df['Borough'].str.contains('Not assigned', na=True)
df = df[~borough_unassigned]
# Drop the rows whose postcode still contains a newline (empty rows); rows
# with a missing postcode are dropped too.
postcode_blank = df['Postcode'].str.contains('\n', na=True)
df = df[~postcode_blank]

Combine Neighbourhood columns for rows with same postcode

In [11]:
# One row per postcode: keep the first borough seen for the postcode and join
# all of its neighbourhood names with commas. reset_index turns 'Postcode'
# back into a regular column.
df = (
    df.groupby('Postcode')
      .agg({'Borough': 'first', 'Neighbourhood': ','.join})
      .reset_index()
)

Set neighborhood as borough for the "Not assigned" neighborhoods

In [12]:
# Where the neighbourhood is 'Not assigned', fall back to the borough name.
# The mask is computed once and used for both the read and the write.
neighbourhood_unassigned = df['Neighbourhood'].str.contains('Not assigned')
df.loc[neighbourhood_unassigned, 'Neighbourhood'] = df.loc[neighbourhood_unassigned, 'Borough']

In [13]:
# Preview the first rows of the cleaned postcode table.
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [14]:
# (rows, columns) of the cleaned postcode table.
df.shape

(103, 3)

## Get Postal Code Coordinates

Obtain the postal code coordinates from Geospatial_Coordinates.csv and load them into a dataframe

In [15]:
# Read the coordinates file and rename its 'Postal Code' column to 'Postcode'
# so that it carries the same key name as the postcode table.
dfLoc = (
    pd.read_csv('Geospatial_Coordinates.csv')
      .rename(columns={'Postal Code': 'Postcode'})
)

Merge the two dataframes in a single dataframe by 'Postcode' key

In [16]:
# Join the coordinates onto the postcode table using the shared 'Postcode'
# column as the key (default inner join).
df = df.merge(dfLoc, on='Postcode')
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [17]:
# (rows, columns) of the table after merging in the coordinates.
df.shape

(103, 5)

# Toronto Clustering

In [18]:
import folium

Keep only the boroughs whose name contains the word Toronto

In [22]:
# Rows whose borough name contains the substring 'Toronto'.
in_toronto = df['Borough'].str.contains('Toronto')
torontoData = df[in_toronto]

Create a map centered on Toronto, with a circle marking the location of each postal code and a popup label showing the name of its neighbourhood

In [27]:
# Base map centred on the latitude/longitude used for Toronto.
toronto_center = [43.6532, -79.3832]
map_toronto = folium.Map(location=toronto_center, zoom_start=11)

# One circle marker per row of torontoData, with the neighbourhood name shown
# in a popup.
marker_rows = zip(
    torontoData['Latitude'],
    torontoData['Longitude'],
    torontoData['Neighbourhood'],
)
for lat, lng, neighbourhood in marker_rows:
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(neighbourhood, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    ).add_to(map_toronto)

# Last expression of the cell: renders the map in the notebook.
map_toronto