# Segmenting and Clustering Neighborhoods in Toronto


### Load all the required libraries

In [45]:
import numpy as np
import pandas as pd
# Show every column and row when a dataframe is displayed.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json
import csv

#!conda install -c conda-forge geopy --yes # uncomment this line if the package is not installed
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

#!conda install -c conda-forge requests --yes # uncomment this line if the package is not installed
import requests # library to handle requests

# json_normalize lives in the top-level pandas namespace since pandas 1.0; the
# older pandas.io.json location was deprecated at that point and fails to
# import on current pandas releases.
from pandas import json_normalize # to transform a JSON file into a pandas dataframe

import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if the package is not installed
import folium # map rendering library

# BeautifulSoup for scraping the web
#!conda install -c conda-forge beautifulsoup4 --yes # uncomment this line if the package is not installed
from bs4 import BeautifulSoup
#!conda install -c conda-forge lxml --yes # uncomment this line if the package is not installed
import lxml # parser backend requested by name in the BeautifulSoup(...) call

print('Required Libraries are imported......')


Required Libraries are imported......


### Read the postal code web page and load it into a dataframe

In [46]:
# Download the Wikipedia page that lists the "M" postal codes and collect the
# text of every <tr> inside the first <tbody> found on the page.
sourceHTMLPage = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,  # requests has no default timeout; a stalled connection would hang the cell
)
# Fail here on a 4xx/5xx response instead of parsing an error page further down.
sourceHTMLPage.raise_for_status()
html_data = BeautifulSoup(sourceHTMLPage.content, 'lxml')

# find() returns None when the page has no <tbody>; report that explicitly
# rather than failing with an AttributeError on the next line.
table = html_data.find('tbody')
if table is None:
    raise ValueError('No <tbody> element found on the postal code page; '
                     'the page layout may have changed.')
rows = table.select('tr')
row = [r.get_text() for r in rows]

# Build the dataframe: one string per table row, split on newlines into
# columns. The first row supplies the column names and is then dropped.
df = pd.DataFrame(row)
df1 = df[0].str.split('\n', expand=True)
df2 = df1.rename(columns=df1.iloc[0])
df3 = df2.drop(df2.index[0])
df3.head()


Unnamed: 0,Unnamed: 1,Postal code,Unnamed: 3,Borough,Unnamed: 5,Neighborhood,Unnamed: 7
1,,M1A,,Not assigned,,,
2,,M2A,,Not assigned,,,
3,,M3A,,North York,,Parkwoods,
4,,M4A,,North York,,Victoria Village,
5,,M5A,,Downtown Toronto,,Regent Park / Harbourfront,


### Clean the postal code dataframe (e.g. remove the rows whose 'Borough' field is 'Not assigned')

In [47]:
# Keep only the rows whose Borough is not the placeholder 'Not assigned'.
df4 = df3[df3.Borough != 'Not assigned']

# Collapse rows that share a postal code and borough into a single row, joining
# their neighborhood names with commas. Only 'Neighborhood' is aggregated:
# joining every remaining column would also concatenate the empty-named filler
# columns that the newline split of the scraped rows leaves in df3.
df5 = (
    df4.groupby(['Postal code', 'Borough'], sort=False)['Neighborhood']
    .agg(','.join)
    .reset_index()
)

# Where the neighborhood is 'Not assigned', fall back to the borough's name.
df5['Neighborhood'] = np.where(df5['Neighborhood'] == 'Not assigned', df5['Borough'], df5['Neighborhood'])
df5.head()


Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park / Harbourfront
3,M6A,North York,Lawrence Manor / Lawrence Heights
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


In [55]:
# NOTE(review): df7 is first assigned by the merge cell further down, so on a
# fresh kernel (Restart & Run All) this line raises NameError. Move this cell
# below the merge.
df7.shape

(103, 5)

### Load the geospatial data and merge it into the postal code dataframe on the postal code

In [56]:
# Read the geospatial CSV and rename its key column so that it matches the
# 'Postal code' column of the postal code dataframe.
url = "http://cocl.us/Geospatial_data"
df6 = pd.read_csv(url).rename(columns={'Postal Code': 'Postal code'})

# Join the postal code dataframe with the geospatial dataframe on the shared key.
df7 = df5.merge(df6, on='Postal code')
df7.head()


Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Regent Park / Harbourfront,43.65426,-79.360636
3,M6A,North York,Lawrence Manor / Lawrence Heights,43.718518,-79.464763
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government,43.662301,-79.389494


### Filter the merged dataframe for Toronto data

In [57]:
# Boolean mask that is True for every row whose Borough contains 'Toronto'.
in_toronto = df7['Borough'].str.contains('Toronto')
Toronto_DF = df7.loc[in_toronto]
Toronto_DF.head()

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
2,M5A,Downtown Toronto,Regent Park / Harbourfront,43.65426,-79.360636
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government,43.662301,-79.389494
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031


### Map of the neighborhoods in Toronto

In [58]:
# Geocode the city name to get the coordinates the map is centred on.
address = 'Toronto'
geolocator = Nominatim(user_agent="Toronto_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match; stop with a clear
# message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude

Toronto_map = folium.Map(location=[latitude, longitude], zoom_start=10)

# Add one circle marker per row of Toronto_DF, with a
# "<neighborhood>, <borough>" popup.
for lat, lng, borough, neighborhood in zip(Toronto_DF['Latitude'], Toronto_DF['Longitude'],
                                           Toronto_DF['Borough'], Toronto_DF['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    # parse_html is a Popup option, not a CircleMarker option, so it is only
    # passed to the Popup above.
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(Toronto_map)

Toronto_map