## Segmenting and Clustering Neighborhoods in Toronto

## *Data Collection using BeautifulSoup4*

In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

# Download the Wikipedia list of Canadian postal codes starting with "M" and
# load the first <table> on the page into a DataFrame.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
response = requests.get(url, timeout=30)  # bounded wait instead of hanging on a stalled connection
response.raise_for_status()               # stop on 4xx/5xx rather than parsing an error page
html_content = response.text
soup = BeautifulSoup(html_content, "lxml")
table = soup.find("table")
if table is None:
    raise ValueError(f"No <table> element found at {url}")

# One list of cell texts per <tr>. Only <td> cells are collected, so a row
# without any <td> produces an empty list and therefore an all-empty DataFrame
# row; the pre-processing cell further down removes it.
output_rows = [
    [cell.text for cell in table_row.find_all('td')]
    for table_row in table.find_all('tr')
]

df = pd.DataFrame(output_rows, columns=['Postcode', 'Borough', 'Neighborhood'])
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,,,
1,M1A,Not assigned,Not assigned\n
2,M2A,Not assigned,Not assigned\n
3,M3A,North York,Parkwoods\n
4,M4A,North York,Victoria Village\n


In [2]:
# Size of the raw scraped frame; it still contains the empty first row and the 'Not assigned' rows.
df.shape

(288, 3)

## *Data pre-processing* 

In [3]:
# Clean the scraped table. Every step is idempotent, so re-running this cell
# leaves an already-cleaned frame unchanged (dropping "index 0" would instead
# delete a real row on the second run, because the index is reset below).
df = (
    df.dropna(how='all')                                         # remove the all-empty row
      .loc[lambda frame: frame.Borough != 'Not assigned']        # remove rows with unassigned Borough
      .assign(Neighborhood=lambda frame: frame.Neighborhood.str.strip())  # remove the trailing '\n'
      .reset_index(drop=True)                                    # renumber rows from 0
)

In [4]:
# Preview the frame after the pre-processing cell.
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor


In [5]:
# Row/column count after the pre-processing cell.
df.shape

(210, 3)

In [6]:
# Take an independent copy: a plain assignment would only bind a second name
# to the same DataFrame, so later edits to ungrouped_df would also alter df.
ungrouped_df = df.copy()

In [7]:
# ungrouped_df was created from df in the previous cell, so this should match df.shape.
ungrouped_df.shape

(210, 3)

In [8]:
# Preview ungrouped_df before the 'Not assigned' neighborhoods are handled.
ungrouped_df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor


Replace 'Not assigned' Neighborhood values with the name of the corresponding Borough

In [9]:
# Where no neighborhood name was assigned, fall back to the row's Borough.
# The column is assigned back explicitly instead of calling
# replace(..., inplace=True) on a selected column: under pandas Copy-on-Write
# an in-place method on a column selection does not update the parent frame.
ungrouped_df['Neighborhood'] = ungrouped_df['Neighborhood'].mask(
    ungrouped_df['Neighborhood'] == 'Not assigned',
    ungrouped_df['Borough'],
)

In [10]:
# Show the first 10 rows to inspect the result of the replacement in the previous cell.
ungrouped_df.head(10)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor
5,M7A,Queen's Park,Queen's Park
6,M9A,Downtown Toronto,Queen's Park
7,M1B,Scarborough,Rouge
8,M1B,Scarborough,Malvern
9,M3B,North York,Don Mills North


In [11]:
# Verify the claim before announcing it, so the message cannot be printed
# while placeholder values are still present in the column.
assert not (ungrouped_df['Neighborhood'] == 'Not assigned').any(), \
    "Some rows still have Neighborhood == 'Not assigned'"
print("All Neighborhood assigned, e.g. Queen's Park to Queen's Park ")

All Neighborhood assigned, e.g. Queen's Park to Queen's Park 


In [12]:
# Row/column count after the replacement step.
ungrouped_df.shape

(210, 3)

Group by Postcode and Borough, combining the Neighborhood names of each group into one comma-separated string

In [13]:
# Collapse the rows of each (Postcode, Borough) pair into a single entry whose
# value is that pair's neighborhood names joined with commas.
grouped = (
    ungrouped_df
    .groupby(["Postcode", "Borough"])["Neighborhood"]
    .agg(",".join)
)


In [14]:
# Display the grouped result with its index levels as columns; the return value is not stored.
grouped.reset_index()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village,Martin Grove Gardens,Richvie..."
101,M9V,Etobicoke,"Albion Gardens,Beaumond Heights,Humbergate,Jam..."


In [18]:
# Inspect the type of the grouped result.
type(grouped)

pandas.core.series.Series

In [15]:
# Turn the grouped Series into a DataFrame by moving its index levels back
# into ordinary columns.
grouped_df = grouped.reset_index()

In [16]:
# Preview the grouped DataFrame.
grouped_df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [17]:
# Number of (Postcode, Borough) groups and number of columns in the grouped DataFrame.
grouped_df.shape

(103, 3)