# Segmenting and Clustering Neighborhoods in Toronto

In [1]:
#import libraries

import pandas as pd
import numpy as np
import json
from geopy.geocoders import Nominatim, ArcGIS
import requests 
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium

print('All libraries loaded!')

All libraries loaded!


### Getting the data from Wikipedia

    BeautifulSoup is the tool of choice for getting the information we need, because the table's HTML format does not allow pandas to read it directly with pd.read_html().
    The main steps are:
    
    1. Send a GET request to get the HTML code 
    2. Create a BeautifulSoup instance to simplify the extraction 
    3. Get the relevant table from the HTML body code
    4. Get the information inside the rows
    5. Create a dataframe with the gathered information
    6. Preprocess the dataframe to remove irrelevant data

In [2]:
from bs4 import BeautifulSoup
import requests

# Download the Wikipedia page that lists the postal codes starting with "M".
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
response = requests.get(url, timeout=30)  # bounded wait so the cell cannot hang forever
response.raise_for_status()  # fail loudly on an HTTP error instead of parsing an error page
html_content = response.text

# Parse the markup and collect every <table> element found on the page.
soup = BeautifulSoup(html_content, 'lxml')
tables = soup.find_all("table")

if not tables:
    raise ValueError(f'No <table> element found at {url}; the page layout may have changed.')

data_table = tables[0] # The first table is the one containing the info we want

In [3]:
# Every <tr> except the first one (presumably the header row).
rows = data_table.find_all('tr')[1:]

columns = ['PostalCode','Borough', 'Neighborhood']

# One list per table row, holding the right-stripped text of each <td> cell.
table_records = [
    [cell.text.rstrip() for cell in tr.find_all('td')]
    for tr in rows
]

toronto_df = pd.DataFrame(table_records, columns=columns)

# Mark blank neighborhood names as missing. The result is assigned back to the
# column: calling replace(..., inplace=True) on `toronto_df['Neighborhood']`
# is a chained in-place update, which does not modify the frame when pandas
# Copy-on-Write is active.
toronto_df['Neighborhood'] = toronto_df['Neighborhood'].replace('', np.nan)

# Drop rows with missing values and renumber the index from 0.
toronto_df = toronto_df.dropna().reset_index(drop=True)

In [4]:
# Preview the first rows to sanity-check the scraped table.
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [5]:
# (rows, columns) of the dataframe at this point in the notebook.
toronto_df.shape

(103, 3)

### Adding Location Data

In [8]:
# Value handed to the geocoder as its user_agent setting.
user_agent = "toronto_n"
geolocator = ArcGIS(user_agent=user_agent) # Unlike Google's geocoder, the ArcGIS geolocator doesn't need any credentials


def get_ll(postal_code, geocoder=None):
    """Geocode a postal code into a (latitude, longitude) pair.

    Parameters
    ----------
    postal_code
        Value interpolated into the query "<postal_code>, Toronto, Ontario".
    geocoder : optional
        Any object exposing ``geocode(address)``. When omitted, the
        notebook-level ``geolocator`` is used.

    Returns
    -------
    tuple
        ``(latitude, longitude)`` taken from the geocoder's result, or
        ``(nan, nan)`` when the geocoder returns ``None`` (no match), so that
        a single unresolved code does not abort a whole ``apply`` run.
    """
    if geocoder is None:
        geocoder = geolocator

    address = f'{postal_code}, Toronto, Ontario'
    location = geocoder.geocode(address)

    if location is None:
        # No result for this address: report missing coordinates instead of
        # failing with AttributeError on `None.latitude`.
        missing = float('nan')
        return missing, missing

    return location.latitude, location.longitude

In [9]:
# Run get_ll once per postal code and keep the returned pair in a temporary column.
toronto_df['location'] = toronto_df['PostalCode'].apply(get_ll)

In [11]:
def lat(location):
    """Return the first element of ``location`` (the latitude of a (lat, long) pair)."""
    latitude = location[0]
    return latitude
    
def long(location):
    """Return the second element of ``location`` (the longitude of a (lat, long) pair)."""
    longitude = location[1]
    return longitude
    
# Split each stored (latitude, longitude) pair into two separate columns.
coordinate_pairs = toronto_df['location']
toronto_df['Latitude'] = coordinate_pairs.apply(lat)
toronto_df['Longitude'] = coordinate_pairs.apply(long)

# The combined column is redundant once both coordinates have their own column.
toronto_df.drop(columns='location', inplace=True)

In [12]:
# Preview the first rows to check the Latitude and Longitude columns.
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.752935,-79.335641
1,M4A,North York,Victoria Village,43.728102,-79.31189
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.650964,-79.353041
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.723265,-79.451211
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66179,-79.38939
