# Segmenting and Clustering Neighborhoods in Toronto

## 1. Importing Python packages

In [41]:
# Third-party packages: numpy and pandas (data handling), requests (HTTP client),
# BeautifulSoup (HTML parsing).
import numpy  as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
# Progress marker: printed only if every import above succeeded.
print('1. python packages imported')

1. python packages imported


## 2. Fetching the "List of postal codes of Canada: M" page from Wikipedia

In [42]:
# Download the Wikipedia article and parse it into a BeautifulSoup document.
page_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps a stalled connection from hanging the kernel indefinitely.
response = requests.get(page_url, timeout=30)
# Fail loudly on 4xx/5xx instead of silently parsing an error page.
response.raise_for_status()
html_doc = response.text
soup_doc = BeautifulSoup(html_doc, 'lxml')
print('2. html page extracted')

2. html page extracted


## 3. Extracting and scraping the list of postal codes table from the HTML page

I have used the 'wikitable sortable' class to find and extract the list of postal codes from the Wikipedia HTML page. I have assumed that there is only one table labeled with this class.

In [43]:
# Locate the postal-code table by its class attribute; find() returns the first match.
neigh_html_table = soup_doc.find('table', {'class': 'wikitable sortable'})
if neigh_html_table is None:
    # find() yields None when nothing matches; report that directly rather than
    # letting the loop below fail with an AttributeError on None.
    raise ValueError("no table with class 'wikitable sortable' found in the page")
print('3.1. neighborhood table extracted')

# build neighborhoods dataframe.
neighborhoods = []
for row in neigh_html_table.find_all('tr'):
    cols = row.find_all('td')
    # Rows that do not have exactly three <td> cells are skipped.
    if len(cols) == 3:
        neighborhoods.append(tuple(col.text.strip() for col in cols))
# Passing the column names to the constructor keeps this working even when no
# rows matched (an empty frame with the three expected columns).
neigh_df = pd.DataFrame(neighborhoods, columns=['PostCode', 'Borough', 'Neighborhood'])
print('3.2. neighborhoods dataframe built.')


3.1. neighborhood table extracted
3.2. neighborhoods dataframe built.


## 4. Filtering out 'Not assigned' boroughs and replacing 'Not assigned' neighborhoods with the borough name

In [44]:
# filter out 'Not assigned' Boroughs.
valid_neighs = neigh_df.Borough != 'Not assigned'
# .copy() makes the filtered frame independent of the original, so the
# assignment below does not raise SettingWithCopyWarning.
neigh_df = neigh_df[valid_neighs].copy()

# Where the neighborhood is 'Not assigned', fall back to the borough name.
na_neighborhoods = neigh_df.Neighborhood == 'Not assigned'
neigh_df.loc[na_neighborhoods, 'Neighborhood'] = neigh_df.loc[na_neighborhoods, 'Borough']
print('4. Neighborhoods dataframe filtered out.')

4. Neighborhoods dataframe filtered out.


In [45]:
# Collapse rows sharing a (PostCode, Borough) pair into a single row whose
# Neighborhood is the comma-separated list of that pair's neighborhoods.
neigh_df_grouped = (
    neigh_df
    .groupby(['PostCode', 'Borough'])['Neighborhood']
    .agg(', '.join)
    .reset_index()  # turn the group keys back into ordinary columns
)
# Persist the grouped table without the positional index.
neigh_df_grouped.to_csv('list_of_postal_codes_of_canada.csv', index=False)

## 5. Printing number of rows and columns of neighborhood data frame.

In [46]:
# Report the (rows, columns) shape of the grouped dataframe.
grouped_shape = neigh_df_grouped.shape
print("The dimensions of neighborhood dataframe are : ", grouped_shape)

The dimensions of neighborhood dataframe are :  (103, 3)
