# Segmenting and Clustering Neighborhoods in Toronto

### Scrape the table from Wikipedia and transform the data into a pandas DataFrame ###

In [111]:
import pandas as pd
import numpy as np

In [116]:
# Parse every HTML table on the Wikipedia page (first row of each used as the header)
# and keep the first table as the working frame.
WIKI_URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
wiki_tables = pd.read_html(WIKI_URL, header=0)
df_toronto = wiki_tables[0]
print(df_toronto.head())

  Postcode           Borough     Neighbourhood
0      M1A      Not assigned      Not assigned
1      M2A      Not assigned      Not assigned
2      M3A        North York         Parkwoods
3      M4A        North York  Victoria Village
4      M5A  Downtown Toronto      Harbourfront


### Replace "Not assigned" with NaN ###

In [117]:
# Turn the "Not assigned" placeholder into a real missing value (NaN) in every column.
df_toronto = df_toronto.replace(to_replace="Not assigned", value=np.nan)
df_toronto


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,,
1,M2A,,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,
9,M8A,,


### Drop the rows whose Borough is not assigned ###

In [118]:

# Keep only the rows that have a Borough, then renumber the index from 0
# so it has no gaps left by the removed rows.
df_toronto = (
    df_toronto
    .dropna(subset=["Borough"])
    .reset_index(drop=True)
)
df_toronto.head()


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


### Replace Neighbourhood with Borough where Neighbourhood is not assigned ###

In [120]:
# Fill each missing Neighbourhood with the Borough from the same row.
# fillna aligns the replacement Series on the index, so every NaN gets its own row's Borough.
# The result is assigned back to the column on purpose: a chained in-place call such as
# df_toronto["Neighbourhood"].replace(..., inplace=True) acts on a temporary Series and is
# not guaranteed to update df_toronto (it does not under pandas Copy-on-Write).
df_toronto["Neighbourhood"] = df_toronto["Neighbourhood"].fillna(df_toronto["Borough"])
print(df_toronto.head())


  Postcode           Borough     Neighbourhood
0      M3A        North York         Parkwoods
1      M4A        North York  Victoria Village
2      M5A  Downtown Toronto      Harbourfront
3      M5A  Downtown Toronto       Regent Park
4      M6A        North York  Lawrence Heights


### Combine rows that share a postal code into one row, with the neighbourhoods separated by commas ###

In [121]:
# Bare expression in the middle of a cell: it is evaluated but not displayed,
# because only a cell's last expression is rendered.
df_toronto.dtypes
# Cast Neighbourhood to str so every value can be passed to ", ".join in the next step.
df_toronto = df_toronto.astype({"Neighbourhood": "str"})

In [127]:

# One row per (Postcode, Borough) pair: the Neighbourhood names belonging to each pair
# are concatenated into a single comma-separated string, then the group keys are
# turned back into ordinary columns.
neighbourhoods_by_code = df_toronto.groupby(['Postcode', 'Borough'])['Neighbourhood']
df_toronto_clean = neighbourhoods_by_code.apply(", ".join).reset_index()
df_toronto_clean.head()


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [128]:
# Sanity check on the grouped frame: (number of rows, number of columns).
(len(df_toronto_clean), len(df_toronto_clean.columns))

(103, 3)