# Clustering neighbourhoods in Toronto
## Part I

#### The objective of this notebook is to scrape the postal-code table from the given Wikipedia page into a pandas DataFrame.

In [1]:
# Importing required libraries
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd

In [2]:
# Download the Wikipedia page with requests and parse it with Beautiful Soup
page_URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# A timeout keeps the cell from hanging indefinitely if the server never answers
wikipedia_page = requests.get(page_URL, timeout=30)
# Fail here on an HTTP error (4xx/5xx) instead of parsing an error page further down
wikipedia_page.raise_for_status()
soup = BeautifulSoup(wikipedia_page.text, 'html.parser')

In [3]:
# Extract data from the first <table> element of the parsed page
table_element = soup.find('table')
if table_element is None:
    # soup.find returns None when nothing matches; fail with a clear message
    raise ValueError("No <table> element found in the downloaded page")

# Save data to a list of (postal code, borough, neighbourhoods) tuples
data = []

# Each data entry is in a <tr> element
place_data = table_element.find_all('tr')[1:]            # Excluding table headers

for place in place_data:
    cell = place.find_all('td')

    # A row with fewer than three <td> cells cannot hold a full entry; skip it
    # instead of failing with an IndexError
    if len(cell) < 3:
        continue

    # Postal code, borough and neighbourhoods are the first three cells;
    # strip the trailing newline(s) from each
    postal_code, borough, neighbourhoods = (td.text.rstrip('\n') for td in cell[:3])

    data.append((postal_code, borough, neighbourhoods))

In [4]:
# Build the raw dataframe from the scraped (postal code, borough, neighbourhoods) tuples
column_names = ['PostalCode', 'Borough', 'Neighborhood']
df_toronto1 = pd.DataFrame(data, columns=column_names)
df_toronto1.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [5]:
# Keep only the rows whose Borough is something other than "Not assigned"
has_borough = df_toronto1["Borough"].ne("Not assigned")
toronto_df2 = df_toronto1.loc[has_borough].reset_index(drop=True)
toronto_df2.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [6]:
# Collapse rows sharing a postal code and borough, joining their neighbourhoods with ", "
group_keys = ["PostalCode", "Borough"]
toronto_df_grouped = toronto_df2.groupby(group_keys, as_index=False).agg(", ".join)
toronto_df_grouped.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [7]:
# For "Not assigned" neighbourhoods, make the value the same as Borough.
# Assigning to the row yielded by iterrows() is not guaranteed to write back to the
# dataframe (the row can be a copy), so the replacement uses a boolean mask with .loc.
not_assigned = toronto_df_grouped["Neighborhood"] == "Not assigned"
toronto_df_grouped.loc[not_assigned, "Neighborhood"] = toronto_df_grouped.loc[not_assigned, "Borough"]

# Reset index
toronto_df_grouped = toronto_df_grouped.reset_index(drop=True)
toronto_df_grouped.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [8]:
# Report the shape tuple of the grouped dataframe
grouped_shape = toronto_df_grouped.shape
print(f'The shape of datafame is: {grouped_shape}')

The shape of datafame is: (103, 3)


In [9]:
# Write the grouped dataframe to a CSV file under ./assets/Toronto
output_csv = "./assets/Toronto/toronto_base.csv"
toronto_df_grouped.to_csv(output_csv)