# Segmenting and Clustering Neighborhoods in Toronto - Part 1

* Importing Libraries

In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

I am going to use BeautifulSoup to scrape a list of postal codes fro ma Wikipedia Page

In [3]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
extracting_data = requests.get(url).text
wiki_data = BeautifulSoup(extracting_data, 'lxml')

Converting content of PostalCode HTML table to a dataframe

In [6]:
columns = ['Postalcode','Borough','Neighborhood']
df_toronto = pd.DataFrame(columns = columns)

content = wiki_data.find('div', class_='mw-parser-output')
table = content.table.tbody
postcode = 0
borough = 0
neighborhood = 0

for tr in table.find_all('tr'):
    i = 0
    for td in tr.find_all('td'):
        if i == 0:
            postcode = td.text
            i = i + 1
        elif i == 1:
            borough = td.text
            i = i + 1
        elif i == 2: 
            neighborhood = td.text.strip('\n').replace(']','')
    df_toronto = df_toronto.append({'Postalcode': postcode,'Borough': borough,'Neighborhood': neighborhood},ignore_index=True)

In [7]:
df_toronto = df_toronto[df_toronto.Borough!='Not assigned']
df_toronto = df_toronto[df_toronto.Borough!= 0]
df_toronto.reset_index(drop = True, inplace = True)
i = 0
for i in range(0,df_toronto.shape[0]):
    if df_toronto.iloc[i][2] == 'Not assigned':
        df_toronto.iloc[i][2] = df_toronto.iloc[i][1]
        i = i+1

In [8]:
df_toronto = df_toronto.groupby(['Postalcode','Borough'])['Neighborhood'].apply(', '.join).reset_index()

Data Cleaning:  Dropping None rows of df_toronto and row which contains 'Not assigned' value | All "Not assigned" will be replace to 'NaN'

In [12]:
df_toronto['Borough'] = df_toronto['Borough'].apply(lambda x: x.replace("\n",""))

df_toronto['Postalcode'] = df_toronto['Postalcode'].apply(lambda x: x.replace("\n",""))

df_toronto['Neighborhood'] = df_toronto['Neighborhood'].apply(lambda x: x.replace("\n",""))

In [15]:
df_toronto = df_toronto[df_toronto['Borough']!='Not assigned']

In [16]:
df_toronto

Unnamed: 0,Postalcode,Borough,Neighborhood
1,M1B,Scarborough,"Malvern, Rouge"
2,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
3,M1E,Scarborough,"Guildwood, Morningside, West Hill"
4,M1G,Scarborough,Woburn
5,M1H,Scarborough,Cedarbrae
...,...,...,...
170,M9N,York,Weston
171,M9P,Etobicoke,Westmount
172,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ..."
175,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."


In [18]:
df_toronto.isna().sum()

Postalcode      0
Borough         0
Neighborhood    0
dtype: int64

In [17]:
df_toronto.describe()

Unnamed: 0,Postalcode,Borough,Neighborhood
count,103,103,103
unique,103,10,99
top,M2P,North York,Downsview
freq,1,24,4


In [20]:
def neighborhood_list(grouped):    
    return ', '.join(sorted(grouped['Neighborhood'].tolist()))
                    
grp = df_toronto.groupby(['Postalcode', 'Borough'])
df_2 = grp.apply(neighborhood_list).reset_index(name='Neighborhood')

In [21]:
df_2

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ..."
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."


In [22]:
df_2.to_csv('Toronto-Danilo.csv', index=False)