# Exploring and Clustering Toronto Neighborhoods

In this notebook we will explore and cluster Toronto neighborhoods, starting from the table of Toronto postal codes published on Wikipedia.

## Contents
1. Reading Data into DataFrame
1. Retrieving the table
1. Data Wrangling
1. Desired DataFrame Shape

### Reading Data into DataFrame

In [1]:
import pandas as pd
import numpy as np

# URL of the Wikipedia page that lists the Canadian postal codes beginning with "M"
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Read all the HTML tables from the page at `url` (needs network access);
# `tables` is indexed by position in a later cell to pick the one we need
tables = pd.read_html(url)


### Retrieving the table

The data is contained in the first table, so we ignore the other tables. Rows whose borough is `Not assigned` are dropped.

In [40]:
# Only the first table parsed from the page holds the postcode data
first_table = tables[0]

# Drop every row whose Borough is the "Not assigned" placeholder
# (the surviving rows keep their original index labels)
borough_is_assigned = first_table['Borough'] != 'Not assigned'
toronto_neighborhoods_df = first_table.loc[borough_is_assigned]

toronto_neighborhoods_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


### Data Wrangling

In [70]:
# Collapse the table to one row per unique Postcode: the Borough is carried over and
# every Neighbourhood sharing that Postcode is combined into one comma-separated string.
NOT_ASSIGNED = 'Not assigned'

# Swap the "Not assigned" placeholder for the row's Borough *before* aggregating, so the
# placeholder is also replaced when its Postcode has other, named neighbourhoods.
is_unnamed = toronto_neighborhoods_df['Neighbourhood'] == NOT_ASSIGNED
neighbourhood_filled = toronto_neighborhoods_df['Neighbourhood'].mask(
    is_unnamed, toronto_neighborhoods_df['Borough']
)

merged_toronto_neighbourhood_df = (
    toronto_neighborhoods_df
    .assign(Neighbourhood=neighbourhood_filled)
    # sort=False keeps the Postcodes in order of first appearance;
    # as_index=False keeps Postcode as a regular column with a fresh 0..n-1 index.
    .groupby('Postcode', sort=False, as_index=False)
    .agg({
        # A Postcode is expected to map to a single Borough; should several ever
        # appear, list them with a separator instead of gluing the names together.
        'Borough': lambda names: ', '.join(names.unique()),
        'Neighbourhood': ', '.join,
    })
)

# Sanity check: the merged frame must contain each Postcode exactly once
assert merged_toronto_neighbourhood_df['Postcode'].is_unique

# Preview the desired DataFrame
merged_toronto_neighbourhood_df.head(10)
    

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


### Desired DataFrame Shape

In [71]:
# (rows, columns) of the merged frame, which holds one row per unique Postcode
merged_toronto_neighbourhood_df.shape

(103, 3)