# Segmenting and Clustering Neighborhoods in Toronto

#### Generate Dataframe

In [173]:
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Source page: Wikipedia's list of Canadian postal codes starting with "M".
wiki = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Bound the request with a timeout and fail loudly on an HTTP error status,
# so an error page is never handed to the HTML parser as if it were the table.
wikipedia_page = requests.get(wiki, timeout=30)
wikipedia_page.raise_for_status()

# read_html returns one DataFrame per <table> found; the first one is used.
# The markup is wrapped in StringIO because pandas documents file-like input
# as the supported way to parse literal HTML.
df_raw = pd.read_html(StringIO(wikipedia_page.text), header=0)[0]

df_raw.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


#### Remove rows whose Borough is not assigned

In [174]:
# Drop every row whose Borough is the placeholder 'Not assigned';
# all other rows are kept unchanged, with their original index labels.

df_new = df_raw.loc[
    df_raw['Borough'].ne('Not assigned')
]

df_new.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


#### Unassigned Neighborhoods take on the name of their Borough

In [175]:
# If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.
#
# A neighborhood is treated as missing when it is the literal 'Not assigned'
# or NaN (the raw table preview shows blank Neighborhood cells, and a NaN
# would break the string join performed when neighborhoods are combined).
# Series.mask replaces each flagged value with the Borough of the same row.
# The result is bound to a new frame via assign() instead of mutating the
# filtered frame in place, so no chained in-place assignment is involved and
# re-running the cell gives the same result.
df_new = df_new.assign(
    Neighborhood=df_new['Neighborhood'].mask(
        df_new['Neighborhood'].isna() | df_new['Neighborhood'].eq('Not assigned'),
        df_new['Borough'],
    )
)
df_new.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


#### Combining Neighborhood with the same Postal Code

In [176]:
# Combine the neighborhoods that share a Postal Code (and Borough) into a
# single comma-separated string, producing one row per group.
# reset_index() turns the group keys back into ordinary columns.
# No rename step is needed: the aggregated column is already named
# 'Neighborhood', because that is the column selected from the groupby.
df_toronto = (
    df_new
    .groupby(['Postal Code', 'Borough'])['Neighborhood']
    .agg(', '.join)
    .reset_index()
)
df_toronto.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


#### Shape of the df

In [177]:
# (rows, columns) of the combined frame; each row is one Postal Code / Borough group.
df_toronto.shape

(103, 3)

## Assumptions made:<br/>
1) Rows whose borough is not assigned are removed from the analysis<br/>
2) Neighborhoods that are not assigned take on the name of the borough<br/>
3) Neighborhoods with the same postal code are combined<br/>
<br/>
**There are 103 unique Postal Codes in Toronto**