# Peer-graded Assignment: Segmenting and Clustering Neighborhoods in Toronto

### 1. Import libraries

In [3]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

from bs4 import BeautifulSoup # for scraping html docs
import urllib.request # for opening url 

print('Libraries imported.')

Solving environment: done

## Package Plan ##

  environment location: /home/jupyterlab/conda

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    openssl-1.0.2p             |       h470a237_1         3.1 MB  conda-forge
    certifi-2018.10.15         |        py36_1000         138 KB  conda-forge
    geopy-1.17.0               |             py_0          49 KB  conda-forge
    ca-certificates-2018.10.15 |       ha4d7672_0         135 KB  conda-forge
    conda-4.5.11               |        py36_1000         651 KB  conda-forge
    geographiclib-1.49         |             py_0          32 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         4.1 MB

The following NEW packages will be INSTALLED:

    geographiclib:   1.49-py_0            conda-forge
    geopy:           

### 2. Obtain and scrape data from the Wikipedia page

First we get a html object from the wiki url using urllib.request then parse it in the BeautifulSoup

In [4]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
file = urllib.request.urlopen(url)

In [5]:
html_doc = file
soup = BeautifulSoup(html_doc, 'html.parser')

In [6]:
# define the dataframe columns
column_names = ['Postalcode', 'Borough', 'Neighborhood'] 

# instantiate the dataframe
neighborhoods = pd.DataFrame(columns=column_names)

Goes through the table and save the data into the Dataframe we've created

In [7]:
for i in range(1,len(soup.table.find_all('tr'))):
    row = soup.table.find_all('tr')[i]
    postcode = row.find_all('td')[0].get_text()
    borough = row.find_all('td')[1].get_text()
    neighborhood = row.find_all('td')[2].get_text().strip('\n')
    neighborhoods = neighborhoods.append({'Postalcode': postcode,'Borough': borough,'Neighborhood': neighborhood}, ignore_index=True)

Arrange 'Not assigned' columns

In [8]:
# Drop 'Not assigned' Boroughs
neighborhoods = neighborhoods[neighborhoods.Borough != 'Not assigned']
neighborhoods.reset_index(drop=True, inplace = True)

# Assign 'Not assigned' neighborhoods to its Borough
for i in range(neighborhoods.shape[0]):
    if neighborhoods.iloc[i,2]=='Not assigned':
        neighborhoods.iloc[i,2] = neighborhoods.iloc[i,1]

neighborhoods.head(20)


Unnamed: 0,Postalcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Queen's Park
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


Grouping Neighborhoods and merging with Borough

In [9]:
# Group neighborhoods
temp = neighborhoods
temp = temp.groupby(['Postalcode']).agg({'Neighborhood':(', '.join)})

# Joining Borough
neighborhoods.set_index('Postalcode', inplace=True)
temp = temp.join(neighborhoods[['Borough']])

# Rearrange columns
temp.reset_index(inplace=True)
result = temp[['Postalcode','Borough','Neighborhood' ]]

Drop duplicates and arrange things around 

In [10]:
# Drop duplicates
result.drop_duplicates(subset=None, keep='first', inplace=True)
result.reset_index(drop=True, inplace = True)
result.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [11]:
print(result.shape)

(103, 3)


### 3. Obtain latitudes and longitudes of each postal code

Let's try to get single latitude and longitude first.

In [30]:
postalcode_data = pd.read_csv("Geospatial_Coordinates.csv")

In [37]:
print(postalcode_data.shape)
postalcode_data.head()

(103, 3)


Unnamed: 0,Postalcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [35]:
postalcode_data.rename(columns={'Postal Code': 'Postalcode'}, inplace=True)

In [38]:
result = result.merge(postalcode_data, on='Postalcode')

In [39]:
result.head()

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
