## Importing Libraries

In [2]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analysis
# None removes the display limits, so frames are shown without column/row truncation
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
# NOTE(review): json_normalize is not referenced in the cells visible here -- confirm it is still needed
from pandas.io.json import json_normalize # transform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

from bs4 import BeautifulSoup # HTML parser

# NOTE(review): requests is already imported above; this repeated import has no additional effect
import requests

print('Libraries imported.')

Libraries imported.


## Getting the HTML file via BeautifulSoup

In [3]:
# Download the Wikipedia page listing the "M" postal codes and parse it.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,  # fail instead of hanging indefinitely if the server never answers
)
# Stop here on an HTTP error instead of silently parsing an error page below.
response.raise_for_status()
source = response.text

soup = BeautifulSoup(source, 'lxml')


## Retrieving the table from my HTML File

In [4]:
# Locate the table by its CSS classes and collect all of its <tr> rows.
table = soup.find('table', class_='wikitable sortable')
if table is None:
    # find() returns None when nothing matches; without this guard the next line
    # fails with an uninformative AttributeError on NoneType.
    raise ValueError("No table with class 'wikitable sortable' was found in the page")
table_rows = table.find_all('tr')


## Creating my dataframe from my html table

In [5]:
# Build the dataframe from the HTML rows: each <tr> becomes one record holding the
# text of its <td> cells. A <tr> without any <td> produces an empty record, which is
# kept on purpose (it becomes a row of missing values that a later cell removes).
my_list = [[cell.text for cell in tr.find_all('td')] for tr in table_rows]
df = pd.DataFrame(my_list, columns=["Postcode", "Borough", "Neighbourhood"])

## Deleting the 1st row from my df

In [6]:
# Remove the row labelled 0; drop() returns a new frame, df itself is left untouched.
new_df = df.drop(index=0)

## Deleting the '\n' from last column

In [7]:
# Remove newline characters from the Neighbourhood text.
# The result is assigned back to the column: calling replace(..., inplace=True) on
# new_df['Neighbourhood'] mutates an intermediate Series, which is not guaranteed to
# write through to new_df (and is flagged as chained in-place mutation by recent pandas).
new_df['Neighbourhood'] = new_df['Neighbourhood'].replace(to_replace=r'\n', value=r'', regex=True)

## Replacing the "not assigned" value in Borough column with N/A in order to delete it

In [8]:
# Turn the placeholder text 'Not assigned' into a missing value (NaN) in Borough.
new_df['Borough'] = new_df['Borough'].replace({'Not assigned': np.nan})

## Deleting all rows which have N/A value in Borough column

In [9]:
# Drop every row that holds a missing value in any column (not only in Borough).
new_df = new_df.dropna(axis=0, how='any')

## Replacing 'Not assigned' values in the Neighbourhood column with the value of the equivalent Borough column

In [10]:
# Where Neighbourhood is 'Not assigned', take the Borough value of the same row.
# mask(cond, other) replaces the entries where cond is True with the index-aligned
# values of `other`; this avoids handing a whole Series to replace() as the
# replacement value, which replace() does not document as a supported input.
new_df['Neighbourhood'] = new_df['Neighbourhood'].mask(
    new_df['Neighbourhood'] == 'Not assigned',
    new_df['Borough'],
)

## Merging rows which have the same Postcode (and Borough), joining their neighbourhoods with commas

In [11]:
# Collapse all rows sharing the same (Postcode, Borough) pair into a single row whose
# Neighbourhood is the ', '-separated concatenation of the grouped values.
final_df = (
    new_df
    .groupby(['Postcode', 'Borough'])['Neighbourhood']
    .agg(', '.join)
    .reset_index()
)

## Using the .shape attribute to print the number of rows of the dataframe

In [12]:
# (number of rows, number of columns) of the grouped dataframe
final_df.shape

(103, 3)

In [13]:
# Preview the first rows only: display.max_rows is set to None in the first cell, so
# a bare `final_df` would render the entire frame into the notebook output.
final_df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


# 2ND PART OF THE ASSIGNMENT: ADDING COORDINATES TO THE DATAFRAME

## Creating a dataframe with the Geospatial_Coordinates

In [14]:
# Read the coordinates file; the path is relative to the notebook's working directory.
my_cor = pd.read_csv(filepath_or_buffer='Geospatial_Coordinates.csv')

In [15]:
# Show the first five rows of the coordinates dataframe
my_cor.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


## Renaming the 'Postal Code' column to 'Postcode' in order to achieve the merging of the final_df and my_cor dataframes

In [16]:
# Give the key column the same name as in final_df so both frames can be merged on it.
my_cor = my_cor.rename({'Postal Code': 'Postcode'}, axis='columns')

## Merging my final_df with the my_cor dataframe to create the desired result

In [17]:
# Inner join on Postcode: only postcodes present in both frames are kept.
complete_df = final_df.merge(my_cor, on='Postcode', how='inner')

## Our complete dataframe with postcode, borough, neighbourhood and their equivalent coordinates

In [18]:
# Show the first five rows of the merged dataframe
complete_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


# 3RD PART OF THE ASSIGNMENT: Clustering

## Let's see how many unique Boroughs we have

In [54]:
# Number of distinct values in the Borough column
complete_df['Borough'].nunique()

11

## Let's see on the map all the neighborhoods in Toronto

In [55]:
address = 'Toronto, TO'

# Nominatim must be given an identifying user agent; recent geopy releases refuse to
# construct the geocoder when it is left at the library default.
geolocator = Nominatim(user_agent='toronto_neighbourhood_explorer')
location = geolocator.geocode(address)
if location is None:
    # geocode() returns None when the service finds no match for the address
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Toronto are {}, {}.'.format(latitude, longitude))



The geograpical coordinates of Toronto are 43.638093, -79.4665843.


## Function that is saving the folium map in HTML so that JupyterLab can load it

In [82]:
def embed_map1(m, filename='index1.html'):
    """Save a map to an HTML file and return an IFrame that displays that file.

    Parameters
    ----------
    m : object with a ``save(path)`` method
        The map to render (a folium map in this notebook).
    filename : str, optional
        Path of the HTML file to write. Defaults to 'index1.html', the file name
        this function always used before the parameter existed.

    Returns
    -------
    IPython.display.IFrame
        Frame pointing at ``filename``, 100% wide and 750px high.
    """
    from IPython.display import IFrame

    m.save(filename)
    return IFrame(filename, width='100%', height='750px')

In [83]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# One blue circle per row of complete_df, with a "<neighbourhood>, <postcode>" popup.
marker_rows = complete_df[['Latitude', 'Longitude', 'Postcode', 'Neighbourhood']]
for lat, lng, postcode, neighborhood in marker_rows.itertuples(index=False, name=None):
    popup_text = '{}, {}'.format(neighborhood, postcode)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(map_toronto)

map_toronto

# MAP DOESN'T DISPLAY on github. Check Pic1.jpg for map visualization

In [85]:
# Save the map to index1.html and display that file through an IFrame
embed_map1(map_toronto)

## Let's cluster those neighborhoods depending on which Borough they belong to

In [86]:
# One indicator column per borough; the empty prefix/separator keeps the plain
# borough names as column names.
toronto_onehot = pd.get_dummies(complete_df[['Borough']], prefix="", prefix_sep="")

# Re-attach the original Borough labels (appended as the last column) ...
toronto_onehot['Borough'] = complete_df['Borough']

# ... then reorder so that Borough comes first, followed by the indicator columns.
fixed_columns = ['Borough'] + [name for name in toronto_onehot.columns if name != 'Borough']
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()


Unnamed: 0,Borough,Central Toronto,Downtown Toronto,East Toronto,East York,Etobicoke,Mississauga,North York,Queen's Park,Scarborough,West Toronto,York
0,Scarborough,0,0,0,0,0,0,0,0,1,0,0
1,Scarborough,0,0,0,0,0,0,0,0,1,0,0
2,Scarborough,0,0,0,0,0,0,0,0,1,0,0
3,Scarborough,0,0,0,0,0,0,0,0,1,0,0
4,Scarborough,0,0,0,0,0,0,0,0,1,0,0


In [87]:
# Mean of every indicator column per borough; as_index=False keeps Borough as a
# regular column instead of moving it into the index.
toronto_grouped = toronto_onehot.groupby('Borough', as_index=False).mean()
toronto_grouped

Unnamed: 0,Borough,Central Toronto,Downtown Toronto,East Toronto,East York,Etobicoke,Mississauga,North York,Queen's Park,Scarborough,West Toronto,York
0,Central Toronto,1,0,0,0,0,0,0,0,0,0,0
1,Downtown Toronto,0,1,0,0,0,0,0,0,0,0,0
2,East Toronto,0,0,1,0,0,0,0,0,0,0,0
3,East York,0,0,0,1,0,0,0,0,0,0,0
4,Etobicoke,0,0,0,0,1,0,0,0,0,0,0
5,Mississauga,0,0,0,0,0,1,0,0,0,0,0
6,North York,0,0,0,0,0,0,1,0,0,0,0
7,Queen's Park,0,0,0,0,0,0,0,1,0,0,0
8,Scarborough,0,0,0,0,0,0,0,0,1,0,0
9,West Toronto,0,0,0,0,0,0,0,0,0,1,0


## Since we have 11 different boroughs, we will create 11 clusters of neighborhoods

In [88]:
kclusters = 11
# Cluster on the indicator columns only. The label column is dropped by keyword:
# the positional-axis form drop('Borough', 1) is rejected by pandas 2.x.
toronto_grouped_clustering = toronto_grouped.drop(columns='Borough')
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]


array([ 9, 10,  7,  2,  3,  0,  8,  6,  4,  1])

In [89]:
# Work on a copy: a plain `toronto_merged = toronto_grouped` makes both names refer to
# the same frame, so adding the label column below would also modify toronto_grouped
# (and re-running the clustering cell would then feed the labels in as a feature).
toronto_merged = toronto_grouped.copy()
# add clustering labels
toronto_merged['Cluster Labels'] = kmeans.labels_

# join with complete_df on Borough to add postcode, neighbourhood and latitude/longitude
toronto_merged = toronto_merged.join(complete_df.set_index('Borough'), on='Borough')

toronto_merged.head() # check the last columns!



Unnamed: 0,Borough,Central Toronto,Downtown Toronto,East Toronto,East York,Etobicoke,Mississauga,North York,Queen's Park,Scarborough,West Toronto,York,Cluster Labels,Postcode,Neighbourhood,Latitude,Longitude
0,Central Toronto,1,0,0,0,0,0,0,0,0,0,0,9,M4N,Lawrence Park,43.72802,-79.38879
0,Central Toronto,1,0,0,0,0,0,0,0,0,0,0,9,M4P,Davisville North,43.712751,-79.390197
0,Central Toronto,1,0,0,0,0,0,0,0,0,0,0,9,M4R,North Toronto West,43.715383,-79.405678
0,Central Toronto,1,0,0,0,0,0,0,0,0,0,0,9,M4S,Davisville,43.704324,-79.38879
0,Central Toronto,1,0,0,0,0,0,0,0,0,0,0,9,M4T,"Moore Park, Summerhill East",43.689574,-79.38316


In [90]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: kclusters evenly spaced rainbow colours as hex
# strings (the previous x / ys helpers only served to supply this count)
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, bor, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Borough'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(bor) + ' Cluster ' + str(cluster), parse_html=True)
    # Cluster labels start at 0, so label 0 resolves to rainbow[-1] (the last colour)
    # through negative indexing; every cluster still gets its own distinct colour.
    marker_color = rainbow[cluster-1]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

# MAP DOESN'T DISPLAY on github. Check Pic2.jpg for clustering visualization

In [91]:
def embed_map2(m, filename='index2.html'):
    """Save a map to an HTML file and return an IFrame that displays that file.

    Parameters
    ----------
    m : object with a ``save(path)`` method
        The map to render (a folium map in this notebook).
    filename : str, optional
        Path of the HTML file to write. Defaults to 'index2.html', the file name
        this function always used before the parameter existed.

    Returns
    -------
    IPython.display.IFrame
        Frame pointing at ``filename``, 100% wide and 750px high.
    """
    from IPython.display import IFrame

    m.save(filename)
    return IFrame(filename, width='100%', height='750px')

In [92]:
# Save the cluster map to index2.html and display that file through an IFrame
embed_map2(map_clusters)