# Segmenting and Clustering Neighborhoods in Toronto

### Import Libraries

In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup as bs4
import lxml
import requests
print('Libraries imported.')

Libraries imported.


### Import Wikipedia Article (Part 2)

In [2]:
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = bs4(source, 'lxml')

### Define the header columns (Part 3, Bullet 1)

In [3]:
header = ['PostalCode', 'Borough', 'Neighborhood']
#If pulling from wikipedia article then:
#header = soup.tr.text
#header = header.split()
header

['PostalCode', 'Borough', 'Neighborhood']

### Pull the records from the table in Wikipedia (Part 2)

In [4]:
l_records = []
table_rows = soup.find_all('tr')
for tr in table_rows:
    td = tr.find_all('td')
    row = [tr.text for tr in td]
    l_records.append(row)
print("First 3 rows: ")
print(l_records[:3])
print("Last 3 rows: ")
print(l_records[-3:])

First 3 rows: 
[[], ['M1A', 'Not assigned', 'Not assigned\n'], ['M2A', 'Not assigned', 'Not assigned\n']]
Last 3 rows: 
[['\n\n\nNL\n\nNS\n\nPE\n\nNB\n\nQC\n\nON\n\nMB\n\nSK\n\nAB\n\nBC\n\nNU/NT\n\nYT\n\n\nA\n\nB\n\nC\n\nE\n\nG\n\nH\n\nJ\n\nK\n\nL\n\nM\n\nN\n\nP\n\nR\n\nS\n\nT\n\nV\n\nX\n\nY\n\n', 'NL\n', 'NS\n', 'PE\n', 'NB\n', 'QC\n', 'ON\n', 'MB\n', 'SK\n', 'AB\n', 'BC\n', 'NU/NT\n', 'YT\n', 'A\n', 'B\n', 'C\n', 'E\n', 'G\n', 'H\n', 'J\n', 'K\n', 'L\n', 'M\n', 'N\n', 'P\n', 'R\n', 'S\n', 'T\n', 'V\n', 'X\n', 'Y\n'], ['NL\n', 'NS\n', 'PE\n', 'NB\n', 'QC\n', 'ON\n', 'MB\n', 'SK\n', 'AB\n', 'BC\n', 'NU/NT\n', 'YT\n'], ['A\n', 'B\n', 'C\n', 'E\n', 'G\n', 'H\n', 'J\n', 'K\n', 'L\n', 'M\n', 'N\n', 'P\n', 'R\n', 'S\n', 'T\n', 'V\n', 'X\n', 'Y\n']]


<b>Remove unnecessary records pulled above.</b>

In [5]:
l_list = l_records[1:-5]
l_list[:5]

[['M1A', 'Not assigned', 'Not assigned\n'],
 ['M2A', 'Not assigned', 'Not assigned\n'],
 ['M3A', 'North York', 'Parkwoods\n'],
 ['M4A', 'North York', 'Victoria Village\n'],
 ['M5A', 'Downtown Toronto', 'Harbourfront\n']]

### Create DataFrame from Data (Part 3 Start)

In [6]:
df_list = pd.DataFrame(l_list, columns = header)
df_list.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned\n
1,M2A,Not assigned,Not assigned\n
2,M3A,North York,Parkwoods\n
3,M4A,North York,Victoria Village\n
4,M5A,Downtown Toronto,Harbourfront\n


In [7]:
df_list['Neighborhood'] = df_list['Neighborhood'].map(lambda x: str(x)[:-1])
print("DataFrame has ", len(df_list), "records")
df_list.head()

DataFrame has  289 records


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Remove Records with 'Not Assigned' as Borough (Part 3, Bullet 2)

In [8]:
df_wB = df_list[df_list.Borough != 'Not assigned']
#wB = with Borough
print("DataFrame has ", len(df_wB), " records")
df_wB.head()

DataFrame has  212  records


Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


### Combine Neighborhoods to Postal Code (Part 3, Bullet 3)

In [9]:
df_pc = df_wB
df_pc2 = df_pc

<B>Grouping Neighborhoods by Postal Code</b>

In [10]:
df_pc3 = df_pc2.groupby('PostalCode')['Neighborhood'].apply(lambda tags: ', '.join(tags))
df_pc3.head()

PostalCode
M1B                            Rouge, Malvern
M1C    Highland Creek, Rouge Hill, Port Union
M1E         Guildwood, Morningside, West Hill
M1G                                    Woburn
M1H                                 Cedarbrae
Name: Neighborhood, dtype: object

<B> Making the series back into a dataframe </b>

In [11]:
df_n = pd.DataFrame({'PostalCode':df_pc3.index, 'Neighborhood':df_pc3.values})
df_n.head()

Unnamed: 0,Neighborhood,PostalCode
0,"Rouge, Malvern",M1B
1,"Highland Creek, Rouge Hill, Port Union",M1C
2,"Guildwood, Morningside, West Hill",M1E
3,Woburn,M1G
4,Cedarbrae,M1H


<b> Combining the two dataframes with the new Neighborhood format </b>

In [12]:
df_combined = pd.DataFrame(df_pc,columns=['PostalCode', 'Borough'])
df_combined.head()

Unnamed: 0,PostalCode,Borough
2,M3A,North York
3,M4A,North York
4,M5A,Downtown Toronto
5,M5A,Downtown Toronto
6,M6A,North York


In [13]:
df_combined = df_combined.drop_duplicates()
df_combined.head()

Unnamed: 0,PostalCode,Borough
2,M3A,North York
3,M4A,North York
4,M5A,Downtown Toronto
6,M6A,North York
8,M7A,Queen's Park


In [14]:
df_n.head()

Unnamed: 0,Neighborhood,PostalCode
0,"Rouge, Malvern",M1B
1,"Highland Creek, Rouge Hill, Port Union",M1C
2,"Guildwood, Morningside, West Hill",M1E
3,Woburn,M1G
4,Cedarbrae,M1H


In [15]:
df = pd.merge(df_combined, df_n, on='PostalCode')

In [17]:
print("DataFrame has ", len(df), " records")
df.head()

DataFrame has  103  records


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Not assigned


In [18]:
df.shape

(103, 3)