# Segmenting and Clustering Neighborhoods in the city of Toronto, Canada

For this lab we will use the urllib and BeautifulSoup libraries; let's import them:

In [2]:
import urllib.request
from bs4 import BeautifulSoup

Now let's define the URL of the Wikipedia page that lists Toronto's postal codes and read the page into a variable:

In [3]:
# Wikipedia page that lists the postal codes starting with "M"
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Read the whole response inside a context manager so the HTTP connection is
# always closed, and use a timeout so a stalled request cannot hang the
# notebook. `page` holds the raw HTML bytes of the response body.
with urllib.request.urlopen(url, timeout=30) as response:
    page = response.read()

Parse the HTML from our URL into the BeautifulSoup parse tree format:

In [4]:
# Build the BeautifulSoup parse tree from the downloaded page with the lxml parser
soup = BeautifulSoup(markup=page, features='lxml')

Let's use the method 'find_all' to bring back all the instances of the tag 'table':

In [5]:
# Collect every <table> element found in the parsed page
all_tables = soup.find_all(name='table')
all_tables

[<table class="wikitable sortable">
 <tbody><tr>
 <th>Postal Code
 </th>
 <th>Borough
 </th>
 <th>Neighborhood
 </th></tr>
 <tr>
 <td>M1A
 </td>
 <td>Not assigned
 </td>
 <td>Not assigned
 </td></tr>
 <tr>
 <td>M2A
 </td>
 <td>Not assigned
 </td>
 <td>Not assigned
 </td></tr>
 <tr>
 <td>M3A
 </td>
 <td>North York
 </td>
 <td>Parkwoods
 </td></tr>
 <tr>
 <td>M4A
 </td>
 <td>North York
 </td>
 <td>Victoria Village
 </td></tr>
 <tr>
 <td>M5A
 </td>
 <td>Downtown Toronto
 </td>
 <td>Regent Park, Harbourfront
 </td></tr>
 <tr>
 <td>M6A
 </td>
 <td>North York
 </td>
 <td>Lawrence Manor, Lawrence Heights
 </td></tr>
 <tr>
 <td>M7A
 </td>
 <td>Downtown Toronto
 </td>
 <td>Queen's Park, Ontario Provincial Government
 </td></tr>
 <tr>
 <td>M8A
 </td>
 <td>Not assigned
 </td>
 <td>Not assigned
 </td></tr>
 <tr>
 <td>M9A
 </td>
 <td>Etobicoke
 </td>
 <td>Islington Avenue, Humber Valley Village
 </td></tr>
 <tr>
 <td>M1B
 </td>
 <td>Scarborough
 </td>
 <td>Malvern, Rouge
 </td></tr>
 <tr>
 <td>M2B


We can see that we have more than one table; we can use the table's 'class' attribute to save the correct table in a variable:

In [6]:
# Pick the first <table> whose class attribute is "wikitable sortable"
corr_table = soup.find(name='table', attrs={'class': 'wikitable sortable'})
corr_table

<table class="wikitable sortable">
<tbody><tr>
<th>Postal Code
</th>
<th>Borough
</th>
<th>Neighborhood
</th></tr>
<tr>
<td>M1A
</td>
<td>Not assigned
</td>
<td>Not assigned
</td></tr>
<tr>
<td>M2A
</td>
<td>Not assigned
</td>
<td>Not assigned
</td></tr>
<tr>
<td>M3A
</td>
<td>North York
</td>
<td>Parkwoods
</td></tr>
<tr>
<td>M4A
</td>
<td>North York
</td>
<td>Victoria Village
</td></tr>
<tr>
<td>M5A
</td>
<td>Downtown Toronto
</td>
<td>Regent Park, Harbourfront
</td></tr>
<tr>
<td>M6A
</td>
<td>North York
</td>
<td>Lawrence Manor, Lawrence Heights
</td></tr>
<tr>
<td>M7A
</td>
<td>Downtown Toronto
</td>
<td>Queen's Park, Ontario Provincial Government
</td></tr>
<tr>
<td>M8A
</td>
<td>Not assigned
</td>
<td>Not assigned
</td></tr>
<tr>
<td>M9A
</td>
<td>Etobicoke
</td>
<td>Islington Avenue, Humber Valley Village
</td></tr>
<tr>
<td>M1B
</td>
<td>Scarborough
</td>
<td>Malvern, Rouge
</td></tr>
<tr>
<td>M2B
</td>
<td>Not assigned
</td>
<td>Not assigned
</td></tr>
<tr>
<td>M3B
</td>
<td>

Now we can read each row of the table and store its values in lists, one for each column:

In [7]:
# defining the empty lists to store the table information
# (A = postal codes, B = boroughs, C = neighborhoods)
A=[]
B=[]
C=[]

# loop to iterate over the table's rows
for row in corr_table.find_all('tr'):
    cells = row.find_all('td')
    # the header row is built from <th> cells, so it has no <td> and is skipped
    if len(cells) == 3:
        # strip() removes the whitespace/newline surrounding the cell text;
        # slicing off the last character instead would truncate any cell whose
        # text does not end with a newline
        postal_code, borough, neighborhood = (
            cell.get_text().strip() for cell in cells
        )
        A.append(postal_code)
        B.append(borough)
        C.append(neighborhood)

With this we can now generate our pandas DataFrame with the lists that we just scraped from the page

In [8]:
# importing the pandas library as pd
import pandas as pd

# build the dataframe in a single step, one column per scraped list
df_tor = pd.DataFrame(
    {'Postal Code': A, 'Borough': B, 'Neighborhood': C},
    dtype='str',
)

# preview the first rows of the dataframe
df_tor.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


Let's clean our dataframe and remove all the rows without a Borough assigned:

In [9]:
# report the size of the dataframe before any row is removed
print('The shape of df before cleaning is:', df_tor.shape)

# index labels of the rows whose Borough contains 'Not assigned'
row_erase = df_tor.index[df_tor['Borough'].str.contains('Not assigned')]

# remove those rows in place, then renumber the index from zero
df_tor.drop(index=row_erase, inplace=True)
df_tor.reset_index(drop=True, inplace=True)

# report the size of the dataframe once the rows are gone
print('The shape of df after cleaning is:', df_tor.shape)

The shape of df before cleaning is: (180, 3)
The shape of df after cleaning is: (103, 3)


In [10]:
# preview the first ten rows of the cleaned dataframe
df_tor.head(n=10)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


Let's get the coordinates of each postal code using the geocoder package:

In [18]:
# installing the package with pip
!pip install geocoder



In [30]:
# importing the packages
import time

import geocoder

# retry policy: the geocoding service sometimes returns no result, so each
# postal code is retried, but only a bounded number of times and with a pause
# between attempts so a failing code cannot loop forever or flood the service
MAX_ATTEMPTS = 10
RETRY_DELAY_SECONDS = 1

# creating the variables to store the data
latitude=[]
longitude=[]

# the loop to iterate over all rows in the data frame
for postalcode in df_tor['Postal Code']:

    print('Colecting coordinates for Postal Code {}'.format(postalcode))

    # holds the [latitude, longitude] pair once a request succeeds
    lat_long_coord = None

    for attempt in range(1, MAX_ATTEMPTS + 1):
        g = geocoder.arcgis('{}, Toronto ON'.format(postalcode))
        lat_long_coord = g.latlng
        print(lat_long_coord)

        # a usable answer is a non-empty [lat, lng] pair; None (or an empty
        # result) means the request failed and must be retried
        if lat_long_coord:
            break

        # wait before the next attempt, but not after the last one
        if attempt < MAX_ATTEMPTS:
            time.sleep(RETRY_DELAY_SECONDS)
    else:
        # every attempt failed: stop with a clear error instead of spinning
        raise RuntimeError(
            'No coordinates found for postal code {} after {} attempts'.format(
                postalcode, MAX_ATTEMPTS
            )
        )

    # storing the latitudes and longitudes
    latitude.append(lat_long_coord[0])
    longitude.append(lat_long_coord[1])

Colecting coordinates for Postal Code M3A
[43.75293455500008, -79.33564142299997]
Colecting coordinates for Postal Code M4A
[43.72810248500008, -79.31188987099995]
Colecting coordinates for Postal Code M5A
[43.65096410900003, -79.35304116399999]
Colecting coordinates for Postal Code M6A
[43.723265465000054, -79.45121077799996]
Colecting coordinates for Postal Code M7A
[43.66179000000005, -79.38938999999993]
Colecting coordinates for Postal Code M9A
[43.66748067300006, -79.52895286499995]
Colecting coordinates for Postal Code M1B
[43.80862623100006, -79.18991284599997]
Colecting coordinates for Postal Code M3B
[43.74890000000005, -79.35721999999998]
Colecting coordinates for Postal Code M4B
[43.70719267700008, -79.31152927299996]
Colecting coordinates for Postal Code M5B
[43.65749059800004, -79.37752923699998]
Colecting coordinates for Postal Code M6B
[43.70727872700007, -79.44750009299997]
Colecting coordinates for Postal Code M9B
[43.65002250300006, -79.55408903099999]
Colecting coord

In [31]:
# attach the collected coordinates to the dataframe, one new column per list
coordinate_columns = {'Latitude': latitude, 'Longitude': longitude}
for column_name, column_values in coordinate_columns.items():
    df_tor[column_name] = column_values

# preview the first ten rows with the new columns
df_tor.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.752935,-79.335641
1,M4A,North York,Victoria Village,43.728102,-79.31189
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.650964,-79.353041
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.723265,-79.451211
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66179,-79.38939
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667481,-79.528953
6,M1B,Scarborough,"Malvern, Rouge",43.808626,-79.189913
7,M3B,North York,Don Mills,43.7489,-79.35722
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.707193,-79.311529
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657491,-79.377529
