# Segmenting and Clustering Neighborhoods in Toronto

## Part 1: Create Dataframe of Toronto Neighborhoods

In [1]:
# Import necessary modules.

import numpy as np
import pandas as pd
import json 
import requests

In [None]:
# Install modules for parsing webpage.
!pip install beautifulsoup4
!pip install lxml

In [3]:
import bs4 as BeautifulSoup

Transfer the data from the postal-code table on the Wikipedia page into a dataframe.

In [5]:
# Download the Wikipedia page on Toronto postal codes and create a parse tree.
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as if it were data.
postal_response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
postal_response.raise_for_status()
postal_page = postal_response.text

# `BeautifulSoup` is the bs4 module alias from the import cell, hence the double name.
postal_soup = BeautifulSoup.BeautifulSoup(postal_page,'html.parser')

# Extract the first table on the page; fail with a clear message if there is none.
table_soup = postal_soup.find('table')
if table_soup is None:
    raise ValueError('No <table> element found on the postal code page.')

# Transfer the table data into a pandas dataframe.
# Each table row becomes a list of cleaned cell texts: header cells (<th>)
# first, then data cells (<td>), with newlines and surrounding whitespace removed.
table_data = []

for row in table_soup.find_all('tr'):
    cells = [*row.find_all('th'), *row.find_all('td')]
    table_data.append([cell.text.replace('\n','').strip() for cell in cells])

# The first row supplies the column names. The final row is excluded from the data.
# NOTE(review): presumably the last row is not a usable record - confirm against the page.
headers = table_data[0]
data = table_data[1:-1]

postal_df = pd.DataFrame(data,columns = headers)
postal_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


Clean up the dataframe.

In [6]:
# List the distinct values in the 'Borough' column to spot placeholder or null entries.
pd.unique(postal_df['Borough'])

array(['Not assigned', 'North York', 'Downtown Toronto', 'Etobicoke',
       'Scarborough', 'East York', 'York', 'East Toronto', 'West Toronto',
       'Central Toronto', 'Mississauga'], dtype=object)

In [7]:
# Keep only the rows whose borough is something other than the 'Not assigned' placeholder.
has_borough = postal_df['Borough'].ne('Not assigned')
postal_df = postal_df.loc[has_borough]
postal_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [8]:
# Reset the index so rows are numbered 0..n-1 after the filtering above.
# Rebinding the name (instead of inplace=True) avoids mutating the existing
# frame object and keeps the cell safe to re-run.
postal_df = postal_df.reset_index(drop = True)

# Check that the number of rows is the same as the number of unique postal
# codes, i.e., there are no duplicates. dropna=False counts a missing code as
# a value too, matching len(unique()).
unique_code_count = postal_df['Postal Code'].nunique(dropna = False)
print (postal_df.shape)
print (unique_code_count)
assert unique_code_count == len(postal_df), 'Duplicate postal codes found in postal_df.'

(103, 3)
103


In [9]:
# Check that there are no unassigned neighborhoods.
# `value in series` tests the index labels, not the column values, so the
# checks below inspect the values explicitly.
neighborhoods = postal_df['Neighborhood']

print (neighborhoods.unique())
# True if any neighborhood is the 'Not assigned' placeholder (any capitalization).
print (neighborhoods.str.strip().str.lower().eq('not assigned').any())
# True if any neighborhood is missing (NaN/None) or blank.
print ((neighborhoods.isna() | neighborhoods.str.strip().eq('')).any())

['Parkwoods' 'Victoria Village' 'Regent Park, Harbourfront'
 'Lawrence Manor, Lawrence Heights'
 "Queen's Park, Ontario Provincial Government" 'Islington Avenue'
 'Malvern, Rouge' 'Don Mills' 'Parkview Hill, Woodbine Gardens'
 'Garden District, Ryerson' 'Glencairn'
 'West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale'
 'Rouge Hill, Port Union, Highland Creek' 'Woodbine Heights'
 'St. James Town' 'Humewood-Cedarvale'
 'Eringate, Bloordale Gardens, Old Burnhamthorpe, Markland Wood'
 'Guildwood, Morningside, West Hill' 'The Beaches' 'Berczy Park'
 'Caledonia-Fairbanks' 'Woburn' 'Leaside' 'Central Bay Street' 'Christie'
 'Cedarbrae' 'Hillcrest Village'
 'Bathurst Manor, Wilson Heights, Downsview North' 'Thorncliffe Park'
 'Richmond, Adelaide, King' 'Dufferin, Dovercourt Village'
 'Scarborough Village' 'Fairview, Henry Farm, Oriole'
 'Northwood Park, York University' 'East Toronto'
 'Harbourfront East, Union Station, Toronto Islands'
 'Little Portugal, Trinity' 'Kennedy 

In [10]:
# Save the cleaned dataframe to a CSV file, leaving out the row index.
postal_df.to_csv('toronto_postal_data.csv', index=False)

In [11]:
# Preview the first five rows of the cleaned dataframe.
postal_df.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
