# Segmenting and Clustering Neighborhoods in Toronto

## Part 1 - Data preprocessing

In [3]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
import requests
import matplotlib.cm as cm
import matplotlib.colors as colors
from geopy.geocoders import Nominatim
#import folium
from pandas.io.json import json_normalize

print("Libraries have been imported succesfully!")

Libraries have been imported succesfully!


In [4]:
# Scrape the Toronto postal-code table from Wikipedia into a pandas data frame.
# The URL carries an `oldid` revision parameter, so the page content should not drift between runs.
WIKI_URL = "https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&direction=next&oldid=942655364"
TABLE_OPEN_TAG = '<table class="wikitable sortable">'
TABLE_CLOSE_TAG = '</table>'

webPage = requests.get(WIKI_URL, timeout=30)                   #Download page (timeout so the cell cannot hang forever)
webPage.raise_for_status()                                     #Fail loudly on HTTP errors instead of parsing an error page
html = webPage.text                                            #Get HTML code

tableInit = html.find(TABLE_OPEN_TAG)                          #Locate index for beginning of the table
if tableInit == -1:
    raise ValueError("Postal code table not found in the downloaded page")

# Search for the closing tag only AFTER the opening tag; an unanchored search would
# match the end of any table that happens to precede the one we want.
tableFinal = html.find(TABLE_CLOSE_TAG, tableInit)             #Locate index for ending of the table
if tableFinal == -1:
    raise ValueError("Postal code table is not terminated in the downloaded page")

htmlTable = html[tableInit:tableFinal + len(TABLE_CLOSE_TAG)]  #HTML table extracted (closing tag included)
table = pd.read_html(htmlTable, header = 0)[0]                 #From HTML to Pandas data frame

print("Table has been downloaded succesfully!")
table.head()

Table has been downloaded succesfully!


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [5]:
# Clean the scraped postcode table into `df` (one row per postcode) WITHOUT modifying
# `table`, so the scraped frame stays intact and this cell can be re-run safely.
assigned = (
    table
    .assign(Borough = table["Borough"].replace({"Not assigned": np.nan}))  #Convert "Not assigned" from Borough to NaN
    .dropna()                                                              #Remove all rows that have NaN
)

# A neighbourhood that is "Not assigned" falls back to the name of its borough
assigned = assigned.assign(
    Neighbourhood = assigned["Neighbourhood"].where(
        assigned["Neighbourhood"] != "Not assigned", assigned["Borough"]
    )
)

# One comma-separated string of neighbourhoods per postcode (Series indexed by Postcode)
joinedRows = assigned.groupby("Postcode")["Neighbourhood"].apply(", ".join)

df = (
    assigned
    .drop(columns = ["Neighbourhood"])     #The per-row neighbourhood is replaced by the joined one
    .drop_duplicates(["Postcode"])         #Keep the first row of every postcode (original row order)
    .join(joinedRows, on = "Postcode")     #Attach the joined neighbourhood names
    .reset_index(drop = True)              #Reset index
)

assert df["Postcode"].is_unique, "every postcode must appear exactly once"

print("Data preprocessed!. Shape of table: "+ str(df.shape))
df.head(10)

Data preprocessed!. Shape of table: (103, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


## Part 2- Request latitude and longitude from neighbourhoods

In [6]:
# geocoder didn't work, so the coordinates come from a downloaded .csv file instead.
GEO_URL = "http://cocl.us/Geospatial_data"
GEO_CSV = "Geospatial_Coordinates.csv"

# Download with requests (already imported) rather than `!wget`, so the cell also runs
# where wget is not installed and stops with a clear error if the download fails.
geoResponse = requests.get(GEO_URL, timeout=30)
geoResponse.raise_for_status()
with open(GEO_CSV, "wb") as csvFile:
    csvFile.write(geoResponse.content)

geo = pd.read_csv(GEO_CSV, index_col = 0)   #Read csv file; first column becomes the index used by the join
df_final = df.join(geo, on = "Postcode")    #Left join: match df["Postcode"] against the index of geo
df_final.head(12)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937


## Part 3- Segmentation and Analysis

#### Use the geopy library to get the latitude and longitude values of Toronto.

In [7]:
# Install folium into the conda environment, pinned to version 0.5.0 from the conda-forge
# channel; --yes answers conda's confirmation prompt automatically.
!conda install -c conda-forge folium=0.5.0 --yes
# folium is imported here (it is commented out in the imports cell) so the import runs after the install.
import folium
print("folium installed")

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    python_abi-3.6             |          1_cp36m           4 KB  conda-forge
    branca-0.4.1               |             py_0          26 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    altair-4.1.0               |             py_1         614 KB  conda-forge
    ca-certificates-2020.6.20  |       hecda079_0         145 KB  conda-forge
    certifi-2020.6.20          |   py36h9f0ad1d_0         151 KB  conda-forge
    openssl-1.1.1g             |       h516909a_0         2.1 MB  conda-forge
    ------------------------------------------------------------
                       

In [8]:
# Look up the coordinates of Toronto; they are used to centre the maps.
address = 'Toronto'

# Pass an explicit user_agent: calling Nominatim() without one is deprecated
# and newer geopy releases refuse to run with the default agent.
geolocator = Nominatim(user_agent="toronto_neighbourhood_explorer")
location = geolocator.geocode(address)

# geocode() returns None when nothing matches; stop here with a clear message
# instead of an AttributeError on the next line.
if location is None:
    raise ValueError("Could not geocode address: {}".format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

  from ipykernel import kernelapp as app


The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


### Create a map of Toronto with neighborhoods superimposed on top

In [9]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add one circle marker per row of df_final, labelled "<neighbourhood>, <borough>"
for lat, lng, borough, neighborhood in zip(df_final['Latitude'], df_final['Longitude'], df_final['Borough'], df_final['Neighbourhood']):
    # parse_html is an option of Popup, not of CircleMarker, so it is passed only here.
    # NOTE(review): presumably set so names containing quotes (e.g. Queen's Park) render safely - confirm.
    popup = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_toronto)

# last expression of the cell: renders the map
map_toronto

In [10]:
# Keep only the rows whose borough name contains "Toronto", renumbering the index from 0
is_toronto_borough = df_final['Borough'].str.contains('Toronto')
xToronto_data = df_final.loc[is_toronto_borough].reset_index(drop=True)
xToronto_data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
1,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494
2,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031


In [11]:
# create map of xToronto (rows of xToronto_data) using latitude and longitude values
map_xtoronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add one circle marker per row of xToronto_data, labelled with the neighbourhood name(s)
for lat, lng, neighbourhood in zip(xToronto_data['Latitude'], xToronto_data['Longitude'], xToronto_data['Neighbourhood']):
    # Use a separate name for the Popup instead of rebinding the loop variable.
    # parse_html is an option of Popup, not of CircleMarker, so it is passed only here.
    popup = folium.Popup(neighbourhood, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_xtoronto)

# last expression of the cell: renders the map
map_xtoronto