<h2>Segmenting and Clustering Neighborhoods in Toronto</h2>
<h3>Applied Data Science Capstone - Week 3</h3>

<h5>Part 1: Getting and cleaning the data</h5>

In [1]:
import json
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as bsoup

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
# Fetch the Wikipedia page listing Canadian postal codes that start with "M".
# The timeout keeps the notebook from hanging on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of letting a later
# cell try to parse an error page.
req = requests.get(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,
)
req.raise_for_status()

In [3]:
# Parse the downloaded page and turn its first <table> into a DataFrame.
soup = bsoup(req.content, 'lxml')
table = soup.find('table')
if table is None:
    # Fail with a clear message rather than an IndexError on an empty result.
    raise ValueError("no <table> element found in the downloaded page")

# read_html returns a list of DataFrames (one per table in the markup).
# Wrapping the markup in StringIO avoids passing a literal HTML string, which
# recent pandas versions deprecate.
df = pd.read_html(StringIO(str(table)))
neighborhood = df[0]

In [4]:
# Preview the first five rows of the scraped table.
neighborhood.head(n=5)

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


<h4>Note: rows whose Borough is "Not assigned" are excluded from the analysis.</h4>

In [5]:
# Keep only rows whose Borough is actually assigned.
# A missing Borough is a real NaN value, which never equals the string 'NaN',
# so missing entries are excluded with notna() instead of a string comparison.
selct_neigh = neighborhood[
    (neighborhood['Borough'] != 'Not assigned') & neighborhood['Borough'].notna()
]

<h5>Clean up data</h5>

In [6]:
# Neighborhood names are separated by " /"; rewrite that separator as ",".
# assign() builds a new frame, so the filtered selection is never modified in
# place, and regex=False makes " /" a literal match regardless of the pandas
# version's default for the regex argument.
selct_neigh = selct_neigh.assign(
    Neighborhood=selct_neigh['Neighborhood'].str.replace(" /", ",", regex=False)
)
selct_neigh.head(11)

Unnamed: 0,Postal code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
8,M9A,Etobicoke,Islington Avenue
9,M1B,Scarborough,"Malvern, Rouge"
11,M3B,North York,Don Mills
12,M4B,East York,"Parkview Hill, Woodbine Gardens"
13,M5B,Downtown Toronto,"Garden District, Ryerson"


In [7]:
# Show (rows, columns) of the filtered and cleaned frame as a quick size check.
selct_neigh.shape

(103, 3)

<h3>Part 2: Get Neighborhoods' Geographical Coordinates</h3>

In [8]:
# %pip installs into the environment of the running kernel, whereas !pip runs
# whichever pip the shell finds first, which may belong to another environment.
%pip install -q geocoder
import geocoder

In [9]:
# Postal codes of the selected rows, in the frame's row order.
post_codes = selct_neigh['Postal code'].tolist()

# Empty containers: one for latitudes, one for longitudes.
lats, lngs = [], []

In [10]:
# Commented-out geocoder-based coordinate lookup, kept for reference only.
# NOTE(review): as written, the inner call looks up post_codes[0] rather than
# pc, and lat_lng_coords is never reset to None between postal codes, so both
# would need fixing before this loop is re-enabled.

# initialize coordinate variable to None
#lat_lng_coords = None

# loop through post_codes
#for pc in post_codes:
    # loop until you get the coordinates
#    while(lat_lng_coords is None):
#        g = geocoder.google(post_codes[0])
#        lat_lng_coords = g.latlng
    # split lat_lng_coords & append to respective list
#    [lat, lng] = lat_lng_coords
#    lats.append(lat)
#    lngs.append(lng)


<h5>As geocoder did not work for me, I used the provided CSV of geographical coordinates as a workaround.</h5>

In [11]:
# Download the postal-code coordinates CSV with requests instead of shelling
# out to wget: the cell then works where wget is not installed, and it stops
# with an error when the download fails instead of printing the success
# message unconditionally.
# The file name is kept exactly as-is because the next cell reads this path.
geo_response = requests.get("https://cocl.us/Geospatial_data", timeout=30)
geo_response.raise_for_status()
with open('geographical_coordiantes.csv', 'wb') as geo_file:
    geo_file.write(geo_response.content)
print("got geographical_coordiantes.csv")

got geographical_coordiantes.csv


In [12]:
# Read the downloaded coordinates file into a DataFrame.
geo_coords_df = pd.read_csv(
    filepath_or_buffer='geographical_coordiantes.csv',
)
# Preview the first five rows.
geo_coords_df.head(n=5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [13]:
# extract geo-coords
# Look up every postal code with one indexed selection instead of filtering
# the whole coordinates table once per code. lats/lngs are rebuilt from
# scratch, so re-running this cell cannot append a second copy of the
# coordinates to lists that are already filled.
coords_by_code = (
    geo_coords_df
    .drop_duplicates(subset='Postal Code')  # first match wins, like the old loop
    .set_index('Postal Code')
)
# Selecting with a list of labels raises KeyError if any postal code has no
# entry in the coordinates table, so a missing code cannot pass silently.
matched_coords = coords_by_code.loc[post_codes, ['Latitude', 'Longitude']]
lats = matched_coords['Latitude'].tolist()
lngs = matched_coords['Longitude'].tolist()

In [14]:
# Report how many coordinates were collected, then enforce the invariant the
# printout is meant to confirm: one latitude and one longitude per row.
print("I got", len(lats), "latitudes.")
print("I got", len(lngs), "longitudes.")
print("I should have", selct_neigh.shape[0], "of each.")
# Fail here with a clear message rather than relying on a visual comparison.
assert len(lats) == len(lngs) == selct_neigh.shape[0], (
    "latitude/longitude counts do not match the number of neighborhood rows"
)

I got 103 latitudes.
I got 103 longitudes.
I should have 103 of each.


In [15]:
# Attach the collected coordinates to the frame as two columns.
for column_name, column_values in (('Latitude', lats), ('Longitude', lngs)):
    selct_neigh[column_name] = column_values

In [16]:
# Preview the first eleven rows, now including the coordinate columns.
selct_neigh.head(n=11)

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
2,M3A,North York,Parkwoods,43.753259,-79.329656
3,M4A,North York,Victoria Village,43.725882,-79.315572
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
5,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
8,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
9,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
11,M3B,North York,Don Mills,43.745906,-79.352188
12,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
13,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
