# Segmenting and Clustering Neighborhoods in Toronto

## I. Data munging

In [2]:
import pandas as pd
import numpy as np
import lxml.html as LH

### I.1 Fetch raw data

In [12]:
# Scrape the Wikipedia table of Canadian "M" postal codes into a dataframe.
url_wiki = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# pd.read_html returns a list with one dataframe per HTML table found on the
# page; keep the first one.
raw_data = pd.read_html(
    url_wiki,
    header=0,  # use the table's first row as the column names
)[0]
raw_data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [13]:
# Report the (rows, columns) dimensions of the scraped table.
print("raw data shape:", (len(raw_data.index), len(raw_data.columns)))

raw data shape: (288, 3)


### I.2 Clean raw data

#### Remove rows where Borough is not assigned

In [29]:
# Build the cleaned frame from raw_data without modifying raw_data itself:
# turn the "Not assigned" borough placeholder into NaN, then drop the rows
# that contain a missing value.
# NOTE: dropna(axis=0) removes a row if *any* of its columns is NaN, not only
# when Borough is.
cleaned_data = (
    raw_data
    .assign(Borough=lambda frame: frame["Borough"].replace({"Not assigned": np.nan}))
    .dropna(axis=0)
)
cleaned_data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


#### Replace Neighbourhood with Borough where Neighbourhood is not assigned

In [30]:
# Boolean selector for the rows whose neighbourhood is still the
# "Not assigned" placeholder.
mask = cleaned_data["Neighbourhood"] == "Not assigned"
print("Rows with neighbourhood is not assigned:\n")
print(cleaned_data.loc[mask])

# For the selected rows only, copy the borough name into the neighbourhood
# column, then show the same rows again to confirm the replacement.
cleaned_data.loc[mask, "Neighbourhood"] = cleaned_data.loc[mask, "Borough"]
print("\nAfter replace:\n")
print(cleaned_data.loc[mask])

Rows with neighbourhood is not assigned:

  Postcode       Borough Neighbourhood
8      M7A  Queen's Park  Not assigned

After replace:

  Postcode       Borough Neighbourhood
8      M7A  Queen's Park  Queen's Park


#### Regroup Neighbourhoods that have the same postal code

In [31]:
# Collapse the table to one row per (Postcode, Borough) pair; the neighbourhood
# names belonging to each pair are joined into a single ", "-separated string.
cleaned_data = (
    cleaned_data
    .groupby(["Postcode", "Borough"])["Neighbourhood"]
    .apply(", ".join)
    .reset_index()  # turn the group keys back into ordinary columns
)

print("shape of the dataframe after cleaning:", cleaned_data.shape)
cleaned_data.head()

shape of the dataframe after cleaning: (103, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
