# Segmenting and Clustering Neighborhoods in Toronto

First step is to instal and import the required Libraries.

In [1]:
# Import all neccesary libraries
import requests # library to handle requests
import pandas as pd # library for data analsysis
import numpy as np # library to handle data in a vectorized manner
import matplotlib.pyplot as plt # library for visualisation
import random # library for random number
from bs4 import BeautifulSoup # library for scrapping data

!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 

from IPython.display import display_html
    
# tranforming json file into a pandas dataframe library
from pandas.io.json import json_normalize

!pip install -U imbalanced-learn
!pip install -U scipy
!pip install -U imbalanced-learn
!pip install lxml
!pip install folium
import folium # plotting library

from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

print('Libraries sucessfully imported.')

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.

Requirement already up-to-date: imbalanced-learn in c:\programdata\anaconda3\lib\site-packages (0.8.0)
Requirement already up-to-date: scipy in c:\programdata\anaconda3\lib\site-packages (1.6.3)
Requirement already up-to-date: imbalanced-learn in c:\programdata\anaconda3\lib\site-packages (0.8.0)
Libraries sucessfully imported.


## PART 1

### Scrape the Toronto Neighborhood Information using the BeautifulSoup and requests libraries

In [2]:
url = 'https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&oldid=1011037969'
website = requests.get(url).text
soup = BeautifulSoup(website, 'html.parser')

table_contents=[]
table = soup.find('table').find('tbody').find_all('tr')

for row in table[1:]:
    cell = {}
    table_data = row.find_all('td')
    if row.text == 'Not assigned':
        pass
    else:
        cell['PostalCode'] = table_data[0].text.split('\n')[0]
        cell['Borough'] = table_data[1].text.split('\n')[0]
        cell['Neighborhood'] = table_data[2].text.split('\n')[0]
        table_contents.append(cell)
        

### Import the scrapped data into pandas as a DataFrame

In [3]:
toronto_df = pd.DataFrame(table_contents)
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


# PART 2

### The next step is to clean the extracted dataset before going ahead to fetch the Geospatial data

In [4]:
# Dropping the rows where Borough or Neighborhood is 'Not assigned'
toronto_df = toronto_df[(toronto_df['Borough'] != 'Not assigned')| (toronto_df['Neighborhood'] != 'Not assigned')]
print(toronto_df.shape)
toronto_df.head()

(103, 3)


Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


The 'Not assigned' values has been taken care of and the record of data has now dropped from 180 to 103

In [5]:
# To confirm that each postal code values are unique
toronto_df['PostalCode'].nunique() #  the total counts of the unique values should be equal to 103

103

### <justify> Given that this Geospatial package can be very unreliable, it will be tried for obtaining the Geo location of the Neigborhood address on the dataframe but the alternative means (getting the data from the GeoSpatial Dataset) will be used</justify>

In [6]:
try:
    location_data = []
    for name in toronto_df['Neighborhood']:
        geo_local = {}
        geolocator = Nominatim(timeout=10, user_agent = "myGeolocator")
        location = geolocator.geocode(name)
        geo_local['Latitude'] = location.latitude
        geo_local['Longitude'] = location.longitude
        location_data.append(geo_local)

    # Adding the generated data to Toronto dataframe
    toronto_df.assign(column_name = location_data)
except:
    #Importing the csv file conatining the latitudes and longitudes for various neighbourhoods in Canada
    geo_df = pd.read_csv('https://cocl.us/Geospatial_data')
    geo_df.head()
    # Merging the  toronto_df and geo_df to get the Latitudes and Longitudes for various neighbourhoods in Canada
    geo_df.rename(columns={'Postal Code':'PostalCode'}, inplace=True) # Renaming the Postal code to match the dataframe to merge
    toronto_df = pd.merge(toronto_df, geo_df, on='PostalCode')

    
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


# PART 3

### To explore and cluster the neighborhoods in Toronto, only Boroughs that contain the word Toronto will be used and will be replicated for the New York City data

In [10]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(toronto_df['Borough'].unique()),
        toronto_df.shape[0]))

The dataframe has 11 boroughs and 103 neighborhoods.


#### Spliting data into two clusters  namely <i>toronto</i> and <i>newyork</i>

In [11]:
toronto = toronto_df[toronto_df['Borough'].str.contains('Toronto', regex=False)]
newyork = toronto_df[toronto_df['Borough'].str.contains('North York', regex=False)]

### Getting the the logitude and latitude for New York

In [19]:
address = 'New York'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
Latitude = location.latitude
Longitude = location.longitude
print('The geograpical coordinate of Manhattan are {}, {}.'.format(Latitude, Longitude))

The geograpical coordinate of Manhattan are 40.7127281, -74.0060152.


In [14]:
# create map of New York using latitude and longitude values
map_newyork = folium.Map(location=[Latitude, Longitude], zoom_start=10)

# add markers to map
for lat, lng, label in zip(newyork['Latitude'], newyork['Longitude'], newyork['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_newyork)  
    
map_newyork

### Getting the the logitude and latitude for Toronto

In [15]:
address = 'Toronto'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Manhattan are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Manhattan are 43.6534817, -79.3839347.


In [16]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

# add markers to map
for lat, lng, label in zip(toronto['Latitude'], toronto['Longitude'], toronto['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [17]:
k = 4
toronto_clustering = toronto.drop(['PostalCode','Borough','Neighborhood'], axis= 1)
kmeans = KMeans(n_clusters = k, random_state=0).fit(toronto_clustering)
kmeans.labels_
toronto.insert(0, 'Cluster Label', kmeans.labels_)

In [18]:
# create map of Manhattan using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

# add markers to map
for lat, lng, label, cluster in zip(toronto['Latitude'], toronto['Longitude'], toronto['Neighborhood'], toronto['Cluster Label']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto