## Assignment: Segmenting and Clustering Neighborhoods in Toronto

### Importing required Libraries

In [165]:
import pandas as pd
import numpy as np
import urllib.request,urllib.parse,urllib.error
import re
import requests as r
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup

#### Retrieving data from Wiki page

In [166]:
raw_html = r.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', stream=True)

In [167]:
print(raw_html.content[:100])

b'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title'


#### Using BeautifulSoup package to parse the raw html data

In [168]:
soup = BeautifulSoup(raw_html.content, 'html.parser')

#### Searching for the table from the parsed content

In [20]:
#rows = soup.select('tbody tr')
My_table = soup.find('table',{'class':'wikitable sortable'})

#### Checking content

In [171]:
print (My_table)

<table class="wikitable sortable">
<tbody><tr>
<th>Postcode</th>
<th>Borough</th>
<th>Neighbourhood
</th></tr>
<tr>
<td>M1A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M2A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M3A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
</td></tr>
<tr>
<td>M4A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
</td></tr>
<tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Harbourfront_(Toronto)" title="Harbourfront (Toronto)">Harbourfront</a>
</td></tr>
<tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Regent_Park" title="Regent Park">Regent Park</a>
</td></tr>
<tr>
<td>M6A</td>

#### Find table row element from above table 

In [172]:
tr_elements = My_table.find_all('tr')

In [173]:
print (tr_elements)

[<tr>
<th>Postcode</th>
<th>Borough</th>
<th>Neighbourhood
</th></tr>, <tr>
<td>M1A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>, <tr>
<td>M2A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>, <tr>
<td>M3A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
</td></tr>, <tr>
<td>M4A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
</td></tr>, <tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Harbourfront_(Toronto)" title="Harbourfront (Toronto)">Harbourfront</a>
</td></tr>, <tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Regent_Park" title="Regent Park">Regent Park</a>
</td></tr>, <tr>
<td>M6A</td>
<td><a href="/wiki/North_York" ti

#### Checking if the size is same across

In [174]:
[len(T) for T in tr_elements[:12]]

[6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]

#### Get table headers to store as column names for dataframe

In [175]:
th_elements = My_table.find_all('th')

column_names=[]
for th in th_elements:
        column_names.append(th.get_text(strip=True))
print (column_names)

['Postcode', 'Borough', 'Neighbourhood']


#### Create Dataframe

In [176]:
cd_df=pd.DataFrame(columns=column_names)
#cd_df.columns = cd_df.columns.str.replace(' ', '')

In [177]:
print(cd_df)

Empty DataFrame
Columns: [Postcode, Borough, Neighbourhood]
Index: []


In [178]:
print (len(tr_elements))

289


#### Loop through the table data element to get rows for our dataframe

In [179]:
row=[]
for tr in tr_elements[1:]:
    
    td_element = tr.findAll('td')
   # print(td_element)
    row_strip1=td_element[0].text.strip()
    row_strip1=row_strip1.strip()
    #print (td_element[0])
    row_strip2=td_element[1].text.strip()
    row_strip2=row_strip2.strip()
    #print (td_element[1])
    row_strip3=td_element[2].text.strip()
    row_strip3=row_strip3.strip()
    #print (td_element[2].text.strip())
    #print (row_strip3)
    row_list={'Postcode':row_strip1,'Borough':row_strip2,'Neighbourhood':row_strip3}
    #print (row_list)
    row.append(row_list)
    #print (row)
    cd_df=pd.DataFrame(row)
cd_df.applymap(lambda x: x.strip() if type(x)==str else x)
pd.set_option('display.expand_frame_repr', False)
print (cd_df[['Postcode','Borough','Neighbourhood']])


    Postcode           Borough                                      Neighbourhood
0        M1A      Not assigned                                       Not assigned
1        M2A      Not assigned                                       Not assigned
2        M3A        North York                                          Parkwoods
3        M4A        North York                                   Victoria Village
4        M5A  Downtown Toronto                                       Harbourfront
5        M5A  Downtown Toronto                                        Regent Park
6        M6A        North York                                   Lawrence Heights
7        M6A        North York                                     Lawrence Manor
8        M7A      Queen's Park                                       Not assigned
9        M8A      Not assigned                                       Not assigned
10       M9A         Etobicoke                                   Islington Avenue
11       M1B    

#### Ignore cells with a borough that is Not assigned.

In [180]:
cd_df=cd_df[cd_df['Borough']!='Not assigned']
print (cd_df[['Postcode','Borough','Neighbourhood']])

    Postcode           Borough                                      Neighbourhood
2        M3A        North York                                          Parkwoods
3        M4A        North York                                   Victoria Village
4        M5A  Downtown Toronto                                       Harbourfront
5        M5A  Downtown Toronto                                        Regent Park
6        M6A        North York                                   Lawrence Heights
7        M6A        North York                                     Lawrence Manor
8        M7A      Queen's Park                                       Not assigned
10       M9A         Etobicoke                                   Islington Avenue
11       M1B       Scarborough                                              Rouge
12       M1B       Scarborough                                            Malvern
14       M3B        North York                                    Don Mills North
15       M4B    

#### Find how many rows have Neighbourhood as "Not assigned".

In [181]:
cd_df[cd_df['Neighbourhood']=='Not assigned']

Unnamed: 0,Borough,Neighbourhood,Postcode
8,Queen's Park,Not assigned,M7A


#### If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough. So for the 9th cell in the table on the Wikipedia page, the value of the Borough and the Neighborhood columns will be Queen's Park.

In [182]:
replace_colval=cd_df['Borough'][cd_df['Neighbourhood']=='Not assigned'].to_list()

cd_df['Neighbourhood'].replace(to_replace=['Not assigned'],value=replace_colval,inplace=True)


#### Check if replacement is done

In [183]:
cd_df[cd_df['Neighbourhood']=='Not assigned']

Unnamed: 0,Borough,Neighbourhood,Postcode


In [184]:
print (cd_df)

              Borough                                      Neighbourhood Postcode
2          North York                                          Parkwoods      M3A
3          North York                                   Victoria Village      M4A
4    Downtown Toronto                                       Harbourfront      M5A
5    Downtown Toronto                                        Regent Park      M5A
6          North York                                   Lawrence Heights      M6A
7          North York                                     Lawrence Manor      M6A
8        Queen's Park                                       Queen's Park      M7A
10          Etobicoke                                   Islington Avenue      M9A
11        Scarborough                                              Rouge      M1B
12        Scarborough                                            Malvern      M1B
14         North York                                    Don Mills North      M3B
15          East

#### More than one neighborhood can exist in one postal code area. For example, in the table on the Wikipedia page, you will notice that M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park. These two rows will be combined into one row with the neighborhoods separated with a comma as shown in row 11 in the above table.

In [185]:
#cd_df_grp=cd_df.groupby('Postcode').apply(lambda x: x)
#cd_df.groupby(['Postcode','Borough'])['Neighbourhood'].apply(list)
cd_df_grp=cd_df.groupby(['Postcode','Borough'])['Neighbourhood'].agg(lambda col: ', '.join(col))


#### Check output

In [186]:
print (cd_df_grp)

Postcode  Borough         
M1B       Scarborough                                            Rouge, Malvern
M1C       Scarborough                    Highland Creek, Rouge Hill, Port Union
M1E       Scarborough                         Guildwood, Morningside, West Hill
M1G       Scarborough                                                    Woburn
M1H       Scarborough                                                 Cedarbrae
M1J       Scarborough                                       Scarborough Village
M1K       Scarborough               East Birchmount Park, Ionview, Kennedy Park
M1L       Scarborough                           Clairlea, Golden Mile, Oakridge
M1M       Scarborough           Cliffcrest, Cliffside, Scarborough Village West
M1N       Scarborough                               Birch Cliff, Cliffside West
M1P       Scarborough         Dorset Park, Scarborough Town Centre, Wexford ...
M1R       Scarborough                                         Maryvale, Wexford
M1S       Sca

####  Use the .shape method to print the number of rows of your dataframe

In [187]:
print (cd_df_grp.shape)

(103,)
