# Project 2   
## Segmenting and Clustering Neighborhoods in Toronto
This notebook is for Coursera training "IBM Data Science", Course 9, "Applied Data Science Capstone", Project 2


In [1]:
import numpy as np
import pandas as pd

Read DataFrame from Wikipedia webpage :

In [2]:
df1 = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')[0]  # Read from Wikipedia. list[0] is df
df1.head()


Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


#### Remove all rows that 'Borough' = 'Not assigned': (assuming only first letter is Uppercase)

In [3]:
df1 = df1[df1['Borough']!='Not assigned']
df1.head()

Unnamed: 0,Postcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


In [4]:
# For all rows with the same 'Postcode','Borough', combine their 'Neighborhood' into 1 line
df1 = df1.groupby(['Postcode','Borough'])['Neighborhood'].apply(', '.join).reset_index()
df1.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [5]:
# For those rows with ['Neighborhood'] =='Not assigned , replace by ['Borough']
df1['Neighborhood'].loc[df1['Neighborhood'] =='Not assigned'] = df1['Borough']
df1[df1['Neighborhood'] =='Not assigned']   # No rows  ==> this is done.


Unnamed: 0,Postcode,Borough,Neighborhood


In [6]:
df1.shape

(103, 3)

### Find Latitude and Longitude:

In [7]:
%pip install geocoder

Note: you may need to restart the kernel to use updated packages.


In [8]:
import geocoder # import geocoder

In [9]:
# This Code does not work:

# initialize your variable to None

# df1.head()
# # loop until you get the coordinates
# for idx, row in df1.iterrows():
#     postal_code = row['Postcode']
#     lat_lng_coords = None
#     while(lat_lng_coords is None):
#         g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
#         lat_lng_coords = g.latlng

#     row['Latitude'] = lat_lng_coords[0]
#     row['Longtitude'] =  lat_lng_coords[1]


In [10]:
df2=pd.read_csv("Geospatial_Coordinates.csv")
df2.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [11]:
df_3 =  pd.merge( df1, df2, how='left', left_on=['Postcode'], right_on = ['Postal Code'])


In [12]:
df_3.drop('Postal Code', axis = 1)
df_3.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Postal Code,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",M1B,43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",M1C,43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",M1E,43.763573,-79.188711
3,M1G,Scarborough,Woburn,M1G,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,M1H,43.773136,-79.239476


### Clustering

In [13]:
# import k-means from clustering stage
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 5

toronto_grouped_clustering = df_3[['Latitude','Longitude']]

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)
# check cluster labels generated for each row in the dataframe
df_3['Cluster Labels'] = kmeans.labels_



In [14]:
#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import matplotlib.cm as cm
import matplotlib.colors as colors

# # add clustering labels
# neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# toronto_merged = toronto_data

# # merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
# toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

# toronto_merged.head() # check the last columns!

address = 'Toronto, Ontario, Canada'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(df_3['Latitude'], df_3['Longitude'], df_3['Neighborhood'], df_3['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters
