# Segmenting and Clustering Neighborhoods in Toronto

## 1. Create a DataFrame with PostalCode, Borough, and Neighborhood

In [1]:
#Import libraries
import pandas as pd
import numpy as np

### Scrape web for Toronto postal codes

In [2]:
# Scrape the Wikipedia list of Toronto ("M") postal codes.
# pd.read_html parses the HTML tables on the page into a list of DataFrames.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
scrape_of_postal_codes = pd.read_html(io=url)
scrape_of_postal_codes


[    Postal Code           Borough  \
 0           M1A      Not assigned   
 1           M2A      Not assigned   
 2           M3A        North York   
 3           M4A        North York   
 4           M5A  Downtown Toronto   
 5           M6A        North York   
 6           M7A  Downtown Toronto   
 7           M8A      Not assigned   
 8           M9A         Etobicoke   
 9           M1B       Scarborough   
 10          M2B      Not assigned   
 11          M3B        North York   
 12          M4B         East York   
 13          M5B  Downtown Toronto   
 14          M6B        North York   
 15          M7B      Not assigned   
 16          M8B      Not assigned   
 17          M9B         Etobicoke   
 18          M1C       Scarborough   
 19          M2C      Not assigned   
 20          M3C        North York   
 21          M4C         East York   
 22          M5C  Downtown Toronto   
 23          M6C              York   
 24          M7C      Not assigned   
 25         

In [3]:
# Count how many tables the scrape returned (scrape_of_postal_codes is a list, one entry per table)
len(scrape_of_postal_codes)

3

In [4]:
# Pick the postal-code table by its column names rather than by position:
# the scrape returns several tables and their order on the page may change.
expected_columns = {'Postal Code', 'Borough', 'Neighbourhood'}
matching_tables = [
    table for table in scrape_of_postal_codes
    if expected_columns.issubset(table.columns)
]
if not matching_tables:
    raise ValueError(
        'No scraped table has the columns: ' + ', '.join(sorted(expected_columns))
    )

# Work on a copy so later cleaning steps never alter the raw scrape
postal_codes = matching_tables[0].copy()

# Verify that postal_codes is a dataframe and show info about column data types
postal_codes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 3 columns):
Postal Code      180 non-null object
Borough          180 non-null object
Neighbourhood    180 non-null object
dtypes: object(3)
memory usage: 4.3+ KB


In [5]:
# Preview the top five rows of the postal code table
postal_codes.head(5)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### Drop all rows of the dataframe where the borough is unassigned (check that all neighbourhoods and boroughs are assigned)

In [6]:
# Turn every "Not assigned" placeholder into NaN so that the pandas
# missing-value tools (isnull, dropna) can detect those cells.
# numpy is already imported as np in the first cell, so it is not re-imported here.
# The result is bound to postal_codes as a new frame instead of being edited in place.
postal_codes = postal_codes.replace("Not assigned", np.nan)
postal_codes.head()



Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,,
1,M2A,,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [7]:
# Count the missing (NaN) entries in the Borough and Neighbourhood columns
missing_boroughs = postal_codes['Borough'].isnull().sum()
missing_neighbourhoods = postal_codes['Neighbourhood'].isnull().sum()

print('Number of postal codes with unassigned burrough', missing_boroughs)
print('Number of postal codes with unassigned neighbourhood', missing_neighbourhoods)


Number of postal codes with unassigned burrough 77
Number of postal codes with unassigned neighbourhood 77


In [8]:
# Keep only the rows that have a borough, then renumber the index from 0.
# The result is bound to postal_codes as a new frame rather than edited in place.
postal_codes = (
    postal_codes
    .dropna(subset=["Borough"])
    .reset_index(drop=True)
)

# Re-count the missing values. Borough is zero by construction after the drop;
# the Neighbourhood count is the actual check that nothing is left unassigned.
print('Number of postal codes with unassigned burrough', postal_codes['Borough'].isnull().sum())
print('Number of postal codes with unassigned neighbourhood', postal_codes['Neighbourhood'].isnull().sum())

postal_codes.head()

Number of postal codes with unassigned burrough 0
Number of postal codes with unassigned neighbourhood 0


Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


### Display shape of postal_codes dataframe

In [9]:
# (number of rows, number of columns) of the cleaned postal code table
postal_codes.shape

(103, 3)

## 2. Add latitude and longitude coordinates for each neighbourhood to dataframe

### Get latitude and longitude coordinates for postal codes from http://cocl.us/Geospatial_data

In [10]:
# Load the CSV of coordinates for each postal code and preview its first rows
postal_codes_lat_long = pd.read_csv(filepath_or_buffer='http://cocl.us/Geospatial_data')
postal_codes_lat_long.head()


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Create a dataframe, toronto_neighbourhoods, from postal_codes and postal_codes_lat_long (merge on 'Postal Code')

In [11]:
# Attach coordinates to each postal code: inner join on the shared 'Postal Code' column
toronto_neighbourhoods = postal_codes.merge(
    postal_codes_lat_long,
    how='inner',
    on='Postal Code',
)
toronto_neighbourhoods.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
