# Import all libraries for this exercise

In [317]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
# import folium # map rendering library

print('Libraries imported.')

Libraries imported.


# Question 1: Create dataframe of Toronto neighbourhoods

## Scraping the Wikipedia page

In [435]:
# open the URL containing the dataset and get the html of the page.
from urllib.request import urlopen
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
html = urlopen(url)

# create a Beautiful Soup object
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
type(soup)

bs4.BeautifulSoup

In [436]:
# look at the title of webpage
title = soup.title
print(title)

<title>List of postal codes of Canada: M - Wikipedia</title>


In [437]:
# The code below generates an empty list, extract text in between html tags for each row, and append it to the assigned list.
import re
list_rows = []
for row in rows:
    cells = row.find_all('td')
    str_cells = str(cells)
    clean = re.compile('<.*?>')
    clean2 = (re.sub(clean, '',str_cells))
    list_rows.append(clean2)
#print(clean2)
type(clean2)

str

In [438]:
# convert the list into a dataframe
df = pd.DataFrame(list_rows)
df.head(5)

Unnamed: 0,0
0,[]
1,"[M1A, Not assigned, Not assigned\n]"
2,"[M2A, Not assigned, Not assigned\n]"
3,"[M3A, North York, Parkwoods\n]"
4,"[M4A, North York, Victoria Village\n]"


## Processing and cleaning up the initial dataframe

In [439]:
# clean the data
df1 = df[0].str.split(',', expand=True)
df1[0] = df1[0].str.strip('[')
df1[2] = df1[2].str.strip('\n]')
df1.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30
0,],,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,M1A,Not assigned,Not assigned,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,M2A,Not assigned,Not assigned,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,M3A,North York,Parkwoods,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,M4A,North York,Victoria Village,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [442]:
# obtain and clean the column headers
col_labels = soup.find_all('th')
all_header = []
col_str = str(col_labels)
cleantext2 = BeautifulSoup(col_str, "lxml").get_text()
all_header.append(cleantext2)
print(all_header)

['[Postcode, Borough, Neighbourhood\n, Canadian postal codes\n]']


In [443]:
# further clean up of headers
df2 = pd.DataFrame(all_header)
df3 = df2[0].str.split(',', expand=True)
df3[0] = df3[0].str.strip('[')
df3[2] = df3[2].str.strip('\n')
df3.head()

Unnamed: 0,0,1,2,3
0,Postcode,Borough,Neighbourhood,Canadian postal codes\n]


In [444]:
# merge headers with the main table
frames = [df3, df1]
df4 = pd.concat(frames)

In [445]:
# put the column headings in place and get rid of the replicated first row
df5 = df4.rename(columns=df4.iloc[0])
df6 = df5.drop(df5.index[0])

In [446]:
# keep only the columns we need
df7 = df6[df6.columns[0:3]]
# strip leading and trailing spaces
df7.rename(columns={' Borough': 'Borough'}, inplace=True)
df7.rename(columns={' Neighbourhood': 'Neighbourhood'}, inplace=True)
df7['Postcode'] = df7['Postcode'].str.strip()
df7['Borough'] = df7['Borough'].str.strip()
df7['Neighbourhood'] = df7['Neighbourhood'].str.strip()
# Get rid of the rows if Borough not assigned
indexNames = df7[ df7['Borough'] == 'Not assigned' ].index
df7.drop(indexNames , inplace=True)
# Replace Neighbourhood name with Borough name if Neighbourhood name not assigned
df7.Neighbourhood.replace('Not assigned',df7.Borough,inplace=True)
df7.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,Postcode,Borough,Neighbourhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights
8,M6A,North York,Lawrence Manor
9,M7A,Queen's Park,Queen's Park
11,M9A,Etobicoke,Islington Avenue
12,M1B,Scarborough,Rouge
13,M1B,Scarborough,Malvern


In [447]:
df7.tail(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
283,M8Z,Etobicoke,Kingsway Park South West
284,M8Z,Etobicoke,Mimico NW
285,M8Z,Etobicoke,The Queensway West
286,M8Z,Etobicoke,Royal York South West
287,M8Z,Etobicoke,South of Bloor
289,],,
290,,],
291,NL\n\nNS\n\nPE\n\nNB\n\nQC\n\nON\n\nMB\n\nSK\n...,NL,NS
292,NL,NS,PE
293,A,B,C


In [448]:
# drop the last 5 rows having bad data
df7.drop(df7.tail(5).index,inplace=True)
df7.shape

(211, 3)

## Grouping multiple neighbourhoods within a post code zone

In [449]:
# Group by post code and merge the neighbourhood names
df8 = df7.groupby(['Postcode', 'Borough'])['Neighbourhood'].apply(', '.join)
df8.shape

(103,)

In [450]:
# export to csv
df8.to_csv('toronto.csv', header=None, sep='?')

  from ipykernel import kernelapp as app


## The final dataframe

In [451]:
# import from csv
df9 = pd.read_csv('toronto.csv', header=None, sep='?')
df9.columns = ['Postcode', 'Borough', 'Neighbourhood']
df9.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [452]:
# get the shape of the final dataframe
df9.shape

(103, 3)

 # Question 2: Get the geographical coordinates of the neighborhoods 

In [453]:
# import geospatial data from csv
url_csv = 'https://cocl.us/Geospatial_data/Geospatial_Coordinates.csv'
df10 = pd.read_csv(url_csv)
df10.rename(columns={'Postal Code':'Postcode'}, inplace=True)
df10.shape

(103, 3)

In [454]:
# merge the neighbourhood data with the geospatial data
df11 = pd.merge(df9, df10, on='Postcode', how='left')
df11.shape

(103, 5)

In [456]:
# check the first five rows of the merged dataframe
df11.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


# Question 3: Explore and cluster the neighborhoods in Toronto