# Segmenting And Clustering Neighborhoods in Toronto

Notebook for the Toronto neighborhoods assignment in the capstone course of the IBM Data Science certificate.

## Part 1 - Preparing the neighborhood dataframe

In [1]:
# Our needed imports.
import folium
import ibm_boto3
import json
import numpy as np
import pandas as pd
import requests
import types
from botocore.client import Config
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
from sklearn.cluster import KMeans

In [2]:
# Start from an empty dataframe that already carries the three target columns.
neighborhood_df = pd.DataFrame(
    columns=['PostalCode', 'Borough', 'Neighborhood'],
)

In [3]:
# Load our Wikipedia page containing postal codes in Toronto.
crawl_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# requests has no default timeout, so set one to keep this cell from hanging
# forever on a stalled connection.
r = requests.get(crawl_url, timeout=30)
# Fail here on an HTTP error (4xx/5xx) instead of parsing an error page later.
r.raise_for_status()

In [4]:
# Parse the postal code table and build the neighborhood dataframe from it.
soup = BeautifulSoup(r.text, 'html.parser')
postal_codes_table = soup.find('table', {'class': 'wikitable'})
if postal_codes_table is None:
    # soup.find returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on the next line.
    raise ValueError('No table with class "wikitable" found in the fetched page.')
postal_codes_rows = postal_codes_table.find_all('tr')

# Collect the rows in a plain list and build the dataframe once at the end.
# Growing a DataFrame row by row copies it on every iteration, and
# DataFrame.append was removed in pandas 2.0.
neighborhood_records = []

# First row is header, skip it.
for postal_code in postal_codes_rows[1:]:
    cells = postal_code.find_all('td')

    # Skip rows that do not have the three data cells we need
    # (avoids an IndexError on malformed or spacer rows).
    if len(cells) < 3:
        continue

    # Prepare our data.
    postal = cells[0].text.strip()
    borough = cells[1].text.strip()
    neighborhood = cells[2].text.strip()

    # Skip any postal code without an assigned borough.
    if borough == 'Not assigned':
        continue

    neighborhood_records.append((postal, borough, neighborhood))

# Rebuilding the frame from scratch makes this cell safe to re-run:
# it no longer appends duplicate rows to an already-filled dataframe.
neighborhood_df = pd.DataFrame(
    neighborhood_records,
    columns=['PostalCode', 'Borough', 'Neighborhood'],
)

In [5]:
# Show the first five rows of the neighborhood dataframe.
neighborhood_df.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [6]:
# Report the (rows, columns) dimensions of the dataframe.
row_count, column_count = neighborhood_df.shape
print('Shape:', (row_count, column_count))

Shape: (103, 3)


## Part 2 - Adding latitude and longitude

Load the CSV data from IBM's storage into the notebook.

In [7]:
# The code was removed by Watson Studio for sharing.
# NOTE: it must define `geospatial_csv`, which the following cell passes to pd.read_csv.

In [8]:
# Read the CSV data into a DataFrame and rename 'Postal Code' to 'PostalCode'
# so the column name matches the one used in neighborhood_df.
geospatial_df = (
    pd.read_csv(geospatial_csv)
    .rename(columns={'Postal Code': 'PostalCode'})
)

In [9]:
# Combine the neighborhood rows with their coordinates. With no explicit key,
# pandas performs an inner join on the columns the two frames have in common.
neighborhood_df = pd.merge(neighborhood_df, geospatial_df)

In [10]:
# Show the first five rows of the dataframe after the merge.
neighborhood_df.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
