## Applied Data Science Capstone - Week 3 Assignment: Segmenting and Clustering Neighborhoods in Toronto

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [27]:
# Location of the postal-code table (a CSV kept in my Watson cloud object storage)
file_url = (
    "https://cloud-object-storage-ul-cos-standard-97j.s3.us-east.cloud-object-storage.appdomain.cloud"
    "/Postal_codes_of_Canada.csv"
)

# Load the CSV into a DataFrame and preview its first ten rows
df_orig = pd.read_csv(filepath_or_buffer=file_url)
df_orig.head(n=10)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
7,M8A,Not assigned,Not assigned
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"


In [14]:
# Dimensions of the raw table as a (rows, columns) tuple
df_orig.shape

(180, 3)

In [4]:
# Keep only the rows whose Borough is something other than the 'Not assigned'
# placeholder, then renumber the remaining rows from 0
has_borough = df_orig['Borough'] != 'Not assigned'
df = df_orig.loc[has_borough].reset_index(drop=True)
df.head(n=10)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [6]:
# Dimensions of df after dropping the rows with an unassigned Borough
df.shape

(103, 3)

In [7]:
# Raw cell values of df as a NumPy array (one inner array per row)
df.to_numpy()

array([['M3A', 'North York', 'Parkwoods'],
       ['M4A', 'North York', 'Victoria Village'],
       ['M5A', 'Downtown Toronto', 'Regent Park, Harbourfront'],
       ['M6A', 'North York', 'Lawrence Manor, Lawrence Heights'],
       ['M7A', 'Downtown Toronto',
        "Queen's Park, Ontario Provincial Government"],
       ['M9A', 'Etobicoke', 'Islington Avenue, Humber Valley Village'],
       ['M1B', 'Scarborough', 'Malvern, Rouge'],
       ['M3B', 'North York', 'Don Mills'],
       ['M4B', 'East York', 'Parkview Hill, Woodbine Gardens'],
       ['M5B', 'Downtown Toronto', 'Garden District, Ryerson'],
       ['M6B', 'North York', 'Glencairn'],
       ['M9B', 'Etobicoke',
        'West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale'],
       ['M1C', 'Scarborough', 'Rouge Hill, Port Union, Highland Creek'],
       ['M3C', 'North York', 'Don Mills'],
       ['M4C', 'East York', 'Woodbine Heights'],
       ['M5C', 'Downtown Toronto', 'St. James Town'],
       ['M6C', 'York

In [5]:
# Combine the Neighbourhood values of rows that share a postal code into one
# comma-separated string.
# - The result is assigned back to df: groupby/agg returns a new DataFrame and
#   does not modify the frame it was called on.
# - 'Borough' is part of the grouping key so that the column is kept in the
#   result (agg only returns the key columns plus the aggregated ones).
# - sort=False keeps the groups in their order of first appearance instead of
#   sorting them by key.
df = (
    df.groupby(['Postal Code', 'Borough'], as_index=False, sort=False)
      .agg({'Neighbourhood': ', '.join})
)
df

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [17]:
# Dimensions of df after the groupby cell
df.shape

(103, 3)

#### Check whether any row has 'Not assigned' in the Neighbourhood column

In [6]:
# Select the rows whose Neighbourhood still holds the 'Not assigned' placeholder;
# the first number of the shape is how many such rows exist
is_unassigned = df['Neighbourhood'] == 'Not assigned'
df_na = df.loc[is_unassigned]
df_na.shape

(0, 3)

#### Since no row has 'Not assigned' in the Neighbourhood column, we do not need to run the code below.

In [84]:
# Kept for reference only; every line in this cell is commented out, so nothing runs.
# If Neighbourhood value is 'Not assigned', replace it with the value of Borough column
# df.loc[df['Neighbourhood'] == 'Not assigned', 'Neighbourhood'] = df['Borough']
# df.values

In [7]:
# Re-check the dimensions of df before looking up coordinates
df.shape

(103, 3)

#### Get the location coordinates for each Postal Code in the dataset

In [None]:
!pip install geocoder  # Install Geocoder package

In [8]:
import geocoder

In [9]:
# Look up the latitude/longitude of every postal code in df with geocoder and
# print the result.
# Each postal code is tried at most MAX_GEOCODE_ATTEMPTS times. An empty or
# missing g.latlng is treated as a failed lookup, so retrying without a limit
# would never finish if the service keeps returning nothing.
MAX_GEOCODE_ATTEMPTS = 5

for postal_code in df['Postal Code']:
    print("Looking up a coordinate for {}...".format(postal_code))

    # stays None if every attempt fails
    lat_lng_coords = None
    for _ in range(MAX_GEOCODE_ATTEMPTS):
        g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
        lat_lng_coords = g.latlng
        if lat_lng_coords:
            break

    if not lat_lng_coords:
        # give up on this postal code instead of blocking the whole notebook
        print("{} - no coordinates after {} attempts, skipping".format(
            postal_code, MAX_GEOCODE_ATTEMPTS))
        continue

    latitude, longitude = lat_lng_coords[0], lat_lng_coords[1]

    print("{} - ({}, {})".format(postal_code, latitude, longitude))

Looking up a coordinate for M3A...


KeyboardInterrupt: 

### The geocoder lookup above was interrupted before returning any coordinates, so use the Geospatial_data file instead.
[Geospatial_data](https://cocl.us/Geospatial_data)

In [28]:
# Location of the coordinates table (a CSV kept in the Watson cloud object storage)
geospatial_url = (
    "https://cloud-object-storage-ul-cos-standard-97j.s3.us-east.cloud-object-storage.appdomain.cloud"
    "/Geospatial_Coordinates.csv"
)

# Load the CSV into a DataFrame and preview its first ten rows
df_geo = pd.read_csv(filepath_or_buffer=geospatial_url)
df_geo.head(n=10)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


In [30]:
# Left-join the coordinates onto the neighbourhood table, matching rows on the
# shared 'Postal Code' column; every row of df is kept
df_merged = pd.merge(df, df_geo, on='Postal Code', how='left')
df_merged.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
