# Canada Clustering analysis (question 3)

In [2]:
# Core data-handling packages
import pandas as pd
import numpy as np

# Matplotlib colormap and colour-conversion modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# folium draws the interactive maps. If it is not installed, the line below
# shows the conda command (pinned to 0.5.0) that was used to install it.
#!conda install -c conda-forge folium=0.5.0 --yes
import folium # map rendering library


print('Libraries imported.')

Libraries imported.


### Step 1: Data Acquisition via WebScraping
Scrape the following Wikipedia page, https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M, in order to obtain the data that is in the table of postal codes and to transform the data into a pandas dataframe

In [3]:

# Wikipedia page holding the table of "M" postal codes with their borough
# and neighborhood.
link = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# read_html returns one DataFrame per HTML table found on the page; index 0
# selects the first table, which is the postal-code table shown below.
# header=0 uses the table's first row as the column names.
# The URL is passed via `link` so it is defined in exactly one place.
df_raw = pd.read_html(link, header=0)[0]
df_raw.head(4)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village


### Step 2: Preprocessing

In [4]:
# Keep only the postal codes that have a borough: rows whose Borough is the
# literal placeholder "Not assigned" are filtered out.
df_ca = df_raw.loc[df_raw["Borough"].ne("Not assigned")]
df_ca.head(n=11)

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"
11,M3B,North York,Don Mills
12,M4B,East York,"Parkview Hill, Woodbine Gardens"
13,M5B,Downtown Toronto,"Garden District, Ryerson"


In [5]:
# Sanity check: collect any rows that have a borough but whose neighborhood
# is still the "Not assigned" placeholder. An empty frame means there are none.
df_ca_check = df_ca.loc[df_ca["Neighborhood"].eq("Not assigned")]
df_ca_check

Unnamed: 0,Postal Code,Borough,Neighborhood


In [6]:
# Display the dataframe's dimensions as a (rows, columns) tuple;
# the first element is the number of rows.
df_ca.shape

(103, 3)

### Step 3: Feature transformation
In order to utilize the Foursquare location data, we need to get the latitude and the longitude coordinates of each neighborhood.
We build a dataframe of the postal code of each neighborhood along with the borough name and neighborhood name.

### NOTE: the geocoder package did not work on 2020-05-29. Luckily, we have a pre-filled CSV file to use instead.

In [7]:
# Alternative to the geocoder package: read the latitude/longitude of each
# postal code from the IBM-provided CSV file.
df_geo_raw = pd.read_csv(filepath_or_buffer="./Data/Geospatial_Coordinates.csv")
print(df_geo_raw.shape)
df_geo_raw.head(n=3)

(103, 3)


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711


In [8]:
# Attach the coordinates to each postal code by joining on the single shared
# key column "Postal Code" (listing the same key twice is redundant).
# how="inner" is the pandas default, spelled out here because it matters:
# a postal code missing from either frame is dropped from the result.
df_geo = pd.merge(df_ca, df_geo_raw, on="Postal Code", how="inner")
df_geo.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


## Step 4: Data Visualization

In [9]:
# create map of Toronto using latitude and longitude values
# (the centre point and zoom level are hard-coded)
map_TO = folium.Map(location=[43.739514, -79.363101], zoom_start=10)
neighborhoods = df_geo.copy()

# add one circle marker per row of the merged frame to the map
for lat, lng, borough, neighborhood in zip(neighborhoods['Latitude'], neighborhoods['Longitude'], neighborhoods['Borough'], neighborhoods['Neighborhood']):
    # Popup text is "<neighborhood>, <borough>". parse_html is an option of
    # folium.Popup, so it is set here and only here.
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    # CircleMarker has no parse_html parameter; passing it would only add a
    # meaningless entry to the marker's keyword options, so it is omitted.
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_TO)

map_TO