# Segmenting and Clustering Neighborhoods in Toronto

In [1]:
import pandas as pd
import numpy as np
from urllib.request import urlopen
from bs4 import BeautifulSoup
html = urlopen("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
res = BeautifulSoup(html.read(),"html5lib");
print(res.title)

<title>List of postal codes of Canada: M - Wikipedia</title>


### Geting data from Wikipedia

In [2]:
data = []
table = res.find(class_='wikitable sortable')
table

<table class="wikitable sortable">
<tbody><tr>
<th>Postcode</th>
<th>Borough</th>
<th>Neighbourhood
</th></tr>
<tr>
<td>M1A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M2A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M3A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
</td></tr>
<tr>
<td>M4A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
</td></tr>
<tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Harbourfront_(Toronto)" title="Harbourfront (Toronto)">Harbourfront</a>
</td></tr>
<tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Regent_Park" title="Regent Park">Regent Park</a>
</td></tr>
<tr>
<td>M6A</td>

In [3]:
rows = table.find_all('tr')
rows

[<tr>
 <th>Postcode</th>
 <th>Borough</th>
 <th>Neighbourhood
 </th></tr>, <tr>
 <td>M1A</td>
 <td>Not assigned</td>
 <td>Not assigned
 </td></tr>, <tr>
 <td>M2A</td>
 <td>Not assigned</td>
 <td>Not assigned
 </td></tr>, <tr>
 <td>M3A</td>
 <td><a href="/wiki/North_York" title="North York">North York</a></td>
 <td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
 </td></tr>, <tr>
 <td>M4A</td>
 <td><a href="/wiki/North_York" title="North York">North York</a></td>
 <td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
 </td></tr>, <tr>
 <td>M5A</td>
 <td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
 <td><a href="/wiki/Harbourfront_(Toronto)" title="Harbourfront (Toronto)">Harbourfront</a>
 </td></tr>, <tr>
 <td>M5A</td>
 <td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
 <td><a href="/wiki/Regent_Park" title="Regent Park">Regent Park</a>
 </td></tr>, <tr>
 <td>M6A</td>
 <td

In [4]:
column_row = rows[0].find_all('th')
column_row

[<th>Postcode</th>, <th>Borough</th>, <th>Neighbourhood
 </th>]

In [5]:
columns = []
for item in column_row:
    columns.append(item.text.strip())
columns

['Postcode', 'Borough', 'Neighbourhood']

In [6]:
raw_data = []
for item in rows:
    raw_data.append(item.find_all('td'))

### Transforming data in order to import to dataframe

In [7]:
data = []
for item in raw_data:
    register = []
    for i in item:
        register.append(i.text.strip())
    data.append(register)

In [8]:
data

[[],
 ['M1A', 'Not assigned', 'Not assigned'],
 ['M2A', 'Not assigned', 'Not assigned'],
 ['M3A', 'North York', 'Parkwoods'],
 ['M4A', 'North York', 'Victoria Village'],
 ['M5A', 'Downtown Toronto', 'Harbourfront'],
 ['M5A', 'Downtown Toronto', 'Regent Park'],
 ['M6A', 'North York', 'Lawrence Heights'],
 ['M6A', 'North York', 'Lawrence Manor'],
 ['M7A', "Queen's Park", 'Not assigned'],
 ['M8A', 'Not assigned', 'Not assigned'],
 ['M9A', 'Etobicoke', 'Islington Avenue'],
 ['M1B', 'Scarborough', 'Rouge'],
 ['M1B', 'Scarborough', 'Malvern'],
 ['M2B', 'Not assigned', 'Not assigned'],
 ['M3B', 'North York', 'Don Mills North'],
 ['M4B', 'East York', 'Woodbine Gardens'],
 ['M4B', 'East York', 'Parkview Hill'],
 ['M5B', 'Downtown Toronto', 'Ryerson'],
 ['M5B', 'Downtown Toronto', 'Garden District'],
 ['M6B', 'North York', 'Glencairn'],
 ['M7B', 'Not assigned', 'Not assigned'],
 ['M8B', 'Not assigned', 'Not assigned'],
 ['M9B', 'Etobicoke', 'Cloverdale'],
 ['M9B', 'Etobicoke', 'Islington'],
 ['M

In [9]:
type(data)

list

### Importing data to pandas dataframe

In [10]:
df = pd.DataFrame(data, columns=columns).dropna()

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 288 entries, 1 to 288
Data columns (total 3 columns):
Postcode         288 non-null object
Borough          288 non-null object
Neighbourhood    288 non-null object
dtypes: object(3)
memory usage: 9.0+ KB


### Setting the filter

In [12]:
filter = (df['Borough']=='Not assigned')

In [13]:
df_filtered = df.loc[filter==False]
df_filtered

Unnamed: 0,Postcode,Borough,Neighbourhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights
8,M6A,North York,Lawrence Manor
9,M7A,Queen's Park,Not assigned
11,M9A,Etobicoke,Islington Avenue
12,M1B,Scarborough,Rouge
13,M1B,Scarborough,Malvern


In [14]:
df_filtered.loc[df_filtered.Neighbourhood == 'Not assigned','Neighbourhood'] = df.Borough

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [15]:
df_filtered.rename(index=str,columns={'Postcode':'PostalCode'},inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


In [16]:
list_pc = df_filtered['PostalCode'].unique()

In [17]:
list_br = df_filtered['Borough'].unique()

### Formatting data from column 'Neighbourhood'

In [18]:
lst_register = []
for pc in list_pc:
    register = []
    for br in list_br:
        nb = df_filtered.loc[(df_filtered['PostalCode']==pc) & (df_filtered['Borough']==br)]['Neighbourhood']
        if len(nb) > 0:
            lst_nb = []
            for item in nb:
                lst_nb.append(item)
            string_separated = ",".join(lst_nb)
    lst_register.append([pc,br,string_separated])    

In [19]:
lst_register

[['M3A', 'Mississauga', 'Parkwoods'],
 ['M4A', 'Mississauga', 'Victoria Village'],
 ['M5A', 'Mississauga', 'Harbourfront,Regent Park'],
 ['M6A', 'Mississauga', 'Lawrence Heights,Lawrence Manor'],
 ['M7A', 'Mississauga', "Queen's Park"],
 ['M9A', 'Mississauga', 'Islington Avenue'],
 ['M1B', 'Mississauga', 'Rouge,Malvern'],
 ['M3B', 'Mississauga', 'Don Mills North'],
 ['M4B', 'Mississauga', 'Woodbine Gardens,Parkview Hill'],
 ['M5B', 'Mississauga', 'Ryerson,Garden District'],
 ['M6B', 'Mississauga', 'Glencairn'],
 ['M9B',
  'Mississauga',
  'Cloverdale,Islington,Martin Grove,Princess Gardens,West Deane Park'],
 ['M1C', 'Mississauga', 'Highland Creek,Rouge Hill,Port Union'],
 ['M3C', 'Mississauga', 'Flemingdon Park,Don Mills South'],
 ['M4C', 'Mississauga', 'Woodbine Heights'],
 ['M5C', 'Mississauga', 'St. James Town'],
 ['M6C', 'Mississauga', 'Humewood-Cedarvale'],
 ['M9C',
  'Mississauga',
  'Bloordale Gardens,Eringate,Markland Wood,Old Burnhamthorpe'],
 ['M1E', 'Mississauga', 'Guildwoo

### Result output

In [20]:
df_result = pd.DataFrame(lst_register,columns=['PostalCode','Borough','Neighbourhood'])

In [21]:
df_result

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,Mississauga,Parkwoods
1,M4A,Mississauga,Victoria Village
2,M5A,Mississauga,"Harbourfront,Regent Park"
3,M6A,Mississauga,"Lawrence Heights,Lawrence Manor"
4,M7A,Mississauga,Queen's Park
5,M9A,Mississauga,Islington Avenue
6,M1B,Mississauga,"Rouge,Malvern"
7,M3B,Mississauga,Don Mills North
8,M4B,Mississauga,"Woodbine Gardens,Parkview Hill"
9,M5B,Mississauga,"Ryerson,Garden District"


In [22]:
df_result.shape

(103, 3)