# Clustering Neighborhoods in Toronto
---

## *Gathering and Processing Neighborhood Data*

#### Import all the libraries that might be needed

In [96]:
import numpy as np

import pandas as pd 
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json

!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim 

import requests
from pandas.io.json import json_normalize 

import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes
import folium # map rendering library

print('Libraries imported.')

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.

Libraries imported.


#### Convert the table on the Wikipedia page into a pandas dataframe

In [108]:
# Wikipedia page whose first table lists the postal codes starting with "M".
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# read_html returns one DataFrame per HTML table found on the page;
# keep the list under its own name and pick the first table as the working frame.
wiki_tables = pd.read_html(url)
df = wiki_tables[0]
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


#### Remove all the Boroughs that are "Not assigned" from the dataframe

In [109]:
# Move 'Borough' into the index so rows can be dropped by borough label.
df = df.set_index('Borough')
df.head()

Unnamed: 0_level_0,Postal Code,Neighbourhood
Borough,Unnamed: 1_level_1,Unnamed: 2_level_1
Not assigned,M1A,Not assigned
Not assigned,M2A,Not assigned
North York,M3A,Parkwoods
North York,M4A,Victoria Village
Downtown Toronto,M5A,"Regent Park, Harbourfront"


In [110]:
# Remove every row whose Borough index label is 'Not assigned',
# then turn Borough back into an ordinary column with a fresh 0..n-1 index.
df = df.drop(index='Not assigned').reset_index()
df.head()

Unnamed: 0,Borough,Postal Code,Neighbourhood
0,North York,M3A,Parkwoods
1,North York,M4A,Victoria Village
2,Downtown Toronto,M5A,"Regent Park, Harbourfront"
3,North York,M6A,"Lawrence Manor, Lawrence Heights"
4,Downtown Toronto,M7A,"Queen's Park, Ontario Provincial Government"


#### Check to make sure there are no Neighbourhoods left that are "Not assigned"

In [111]:
# Explicit check: fail loudly if any neighbourhood is still 'Not assigned',
# instead of relying on scanning the sorted table by eye.
unassigned_mask = df['Neighbourhood'] == 'Not assigned'
assert not unassigned_mask.any(), (
    '{} neighbourhood(s) are still "Not assigned"'.format(int(unassigned_mask.sum()))
)

# Alphabetical view of the neighbourhoods as a visual sanity check.
df.sort_values('Neighbourhood')

Unnamed: 0,Borough,Postal Code,Neighbourhood
78,Scarborough,M1S,Agincourt
93,Etobicoke,M8W,"Alderwood, Long Branch"
28,North York,M3H,"Bathurst Manor, Wilson Heights, Downsview North"
39,North York,M2K,Bayview Village
55,North York,M5M,"Bedford Park, Lawrence Manor East"
20,Downtown Toronto,M5E,Berczy Park
58,Scarborough,M1N,"Birch Cliff, Cliffside West"
43,West Toronto,M6K,"Brockton, Parkdale Village, Exhibition Place"
100,East Toronto,M7Y,"Business reply mail Processing Centre, South C..."
87,Downtown Toronto,M5V,"CN Tower, King and Spadina, Railway Lands, Har..."


#### Rearrange and rename the columns to match the example given

In [112]:
# Put the columns in the target order and rename 'Postal Code' in one chained
# expression. Calling rename(..., inplace=True) on the result of a column
# selection can trigger pandas' SettingWithCopyWarning; the chained form
# returns a new frame and avoids mutating a selection in place.
df = (
    df[['Postal Code', 'Borough', 'Neighbourhood']]
    .rename(columns={'Postal Code': 'PostalCode'})
)
df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


#### Note the shape of the dataframe: 103 Rows, 3 Columns

In [113]:
# Display the (rows, columns) dimensions of the cleaned table.
row_count, column_count = df.shape
(row_count, column_count)

(103, 3)

## *Gathering Coordinates of the Postal Codes*

#### Try using geocoder

In [103]:
# Optional attempt to geocode one sample postal code with the `geocoder` package.
# The package may not be installed and the lookup may keep returning None, so the
# import is guarded and the retries are bounded; on failure the coordinates stay None.
MAX_GEOCODER_ATTEMPTS = 5   # upper bound on retries so the cell can never loop forever
postal_code = 'M5A'         # sample postal code used for the trial lookup

lat_lng_coords = None
latitude = None
longitude = None

try:
    import geocoder
except ImportError as err:
    # Report the missing package without aborting the rest of the notebook.
    print('{}: {}'.format(type(err).__name__, err))
else:
    for _ in range(MAX_GEOCODER_ATTEMPTS):
        g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
        lat_lng_coords = g.latlng
        if lat_lng_coords is not None:
            break

if lat_lng_coords is not None:
    latitude = lat_lng_coords[0]
    longitude = lat_lng_coords[1]

ModuleNotFoundError: No module named 'geocoder'

#### No 'geocoder' found, so will use the CSV file to import latitude and longitude data

In [114]:
# CSV with one latitude/longitude pair per postal code.
GEOSPATIAL_CSV_URL = 'https://cocl.us/Geospatial_data'

df_latlong = pd.read_csv(GEOSPATIAL_CSV_URL)
df_latlong.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


#### Now, affix the Latitude and Longitude columns to the Toronto neighbourhood dataframe made above

In [115]:
# Index the neighbourhood table by postal code, then attach it to the coordinate
# table by matching that index against the 'Postal Code' column of df_latlong.
neighbourhoods_by_code = df.set_index('PostalCode')
df = df_latlong.join(neighbourhoods_by_code, on='Postal Code')
df.head()

Unnamed: 0,Postal Code,Latitude,Longitude,Borough,Neighbourhood
0,M1B,43.806686,-79.194353,Scarborough,"Malvern, Rouge"
1,M1C,43.784535,-79.160497,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,43.763573,-79.188711,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,43.770992,-79.216917,Scarborough,Woburn
4,M1H,43.773136,-79.239476,Scarborough,Cedarbrae


In [116]:
# Old column name -> new column name for the joined table.
column_renames = {'Neighbourhood': 'Neighborhood', 'Postal Code': 'PostalCode'}
df = df.rename(columns=column_renames)
df.head()

Unnamed: 0,PostalCode,Latitude,Longitude,Borough,Neighborhood
0,M1B,43.806686,-79.194353,Scarborough,"Malvern, Rouge"
1,M1C,43.784535,-79.160497,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,43.763573,-79.188711,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,43.770992,-79.216917,Scarborough,Woburn
4,M1H,43.773136,-79.239476,Scarborough,Cedarbrae


In [117]:
# Final column layout: identifying fields first, coordinates last.
column_order = ['PostalCode', 'Borough', 'Neighborhood', 'Latitude', 'Longitude']
df = df.loc[:, column_order]
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
