# Segmenting and Clustering Neighborhoods in Toronto

###### Prepared by Enrique Puente for Coursera's Applied Data Science Capstone Project

## 1.0 Download and Import Libraries

In [9]:
# Import libraries
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analysis

# Disable pandas' display truncation so every column and row is rendered
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

# !conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests

# Transform a JSON file into a pandas dataframe.
# The top-level `pandas.json_normalize` is the supported location since pandas 1.0;
# `pandas.io.json.json_normalize` is the legacy path, kept only as a fallback so
# the notebook still imports on older installations.
try:
    from pandas import json_normalize
except ImportError:
    from pandas.io.json import json_normalize

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


## 2.0 Data Download

### 2.1 Downloading Data from Wikipedia

The following cell uses pandas' `read_html` function to pull the postal-code table from the Wikipedia page.

In [10]:
# Scrape the postal-code table from Wikipedia
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
df = (
    pd.read_html(url, header=0)[0]  # first table on the page, first row used as header
    .dropna(axis=0, how='any')      # discard rows that have any missing value
    .reset_index(drop=True)         # renumber the remaining rows from 0
)
df.head(5)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


The shape of the dataframe is:

In [11]:
# (number of rows, number of columns) of the scraped postal-code table
df.shape

(103, 3)

### 2.2 Downloading Coordinates data from Geocoder

Using Geocoder for getting coordinates for Postal Codes Dataframe (https://geocoder.readthedocs.io/api.html#installation)

Download and import Geocoder Package

In [1]:
# !conda install -c conda-forge geocoder --yes #install geocoder
# ! git clone https://github.com/DenisCarriere/geocoder
# import geocoder # import geocoder

In [41]:
# initialize your variable to None
# lat_lng_coords = None

# # create list with postal codes
# # postal_code =

# # loop until you get the coordinates
# while(lat_lng_coords is None):
#   g = geocoder.google('{}, Toronto, Ontario'.format(df['Neighborhood'][0]))
#   lat_lng_coords = g.latlng

# latitude = lat_lng_coords[0]
# longitude = lat_lng_coords[1]

KeyboardInterrupt: 

Since the geocoder package was not returning valid results, I will use the provided CSV file (https://cocl.us/Geospatial_data) for the latitude and longitude data.

In [45]:
# Load the CSV of coordinates that is joined to the postal-code table in the next cell
latlong = pd.read_csv(filepath_or_buffer='https://cocl.us/Geospatial_data')

In [55]:
# Left-join the coordinates onto the postal-code table: every row of df is kept,
# and the columns of latlong are attached where 'Postal Code' matches.
LatLongNeigh = df.merge(latlong, how='left', on='Postal Code')
LatLongNeigh.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


## 3.0 Data Exploration

### 3.1 Explore and cluster the neighborhoods in Toronto

Number of unique boroughs in the dataset:

In [64]:
# Count distinct Borough values (dropna=False so a missing value would be counted too)
LatLongNeigh['Borough'].nunique(dropna=False)

10

In [76]:
# set number of clusters
# NOTE(review): 10 equals the number of unique boroughs counted in the previous
# cell - presumably intentional; confirm.
kclusters = 10

# Cluster on geographic position only: the Latitude and Longitude columns.
# A list of labels is used so the selection is unambiguously "these two columns".
toronto_grouped_clustering = LatLongNeigh[['Latitude', 'Longitude']]

# run k-means clustering
# n_init is pinned explicitly: its default differs between scikit-learn releases,
# so relying on it makes the resulting labels depend on the installed version.
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for the first 10 rows in the dataframe
kmeans.labels_[0:10]

array([1, 1, 5, 0, 5, 9, 3, 4, 1, 5], dtype=int32)

### 3.2 Maps used for visualizing neighborhoods and how they cluster together

In [56]:
# Look up the coordinates of the city with the Nominatim geocoder
address = 'Toronto, Ontario'
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)

# geocode() returns None when no result is found; fail with a clear message
# instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto, Ontario are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto, Ontario are 43.6534817, -79.3839347.


In [62]:
# Base map centred on the latitude/longitude obtained from the geocoder
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# Draw one circle marker per row, with a "<neighborhood>, <borough>" popup
marker_columns = ['Latitude', 'Longitude', 'Borough', 'Neighborhood']
for lat, lng, borough, neighborhood in LatLongNeigh[marker_columns].itertuples(index=False, name=None):
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto

In [74]:
# Display only the coordinate columns of the merged table
LatLongNeigh[['Latitude', 'Longitude']]

Unnamed: 0,Latitude,Longitude
0,43.753259,-79.329656
1,43.725882,-79.315572
2,43.65426,-79.360636
3,43.718518,-79.464763
4,43.662301,-79.389494
5,43.667856,-79.532242
6,43.806686,-79.194353
7,43.745906,-79.352188
8,43.706397,-79.309937
9,43.657162,-79.378937
