# Toronto Neighborhood Clustering - IBM Data Science Capstone
#### by fil.coutinho

## 1. Dataframe Preparation

In [71]:
# The code was removed by Watson Studio for sharing.

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [72]:
# Use a key column name without a space so it matches the other frames
df_toronto.rename(columns={'Postal Code': 'PostalCode'}, inplace=True)

# Remove every row whose borough is 'Not assigned'
unassigned_rows = df_toronto[df_toronto['Borough'] == 'Not assigned'].index
df_toronto.drop(index=unassigned_rows, inplace=True)
df_toronto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [73]:
# Number of non-null entries per postal code, for every remaining column
count_postal = df_toronto.groupby('PostalCode').count()
count_postal.head()

Unnamed: 0_level_0,Borough,Neighborhood
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,1,1
M1C,1,1
M1E,1,1
M1G,1,1
M1H,1,1


In [74]:
# Postal codes that appear on more than one row; an empty result means none are duplicated
count_postal[count_postal['Neighborhood'] > 1]

Unnamed: 0_level_0,Borough,Neighborhood
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1


In [75]:
# Rows that still have a borough but whose neighborhood is 'Not assigned';
# an empty result means no neighborhood needs to be back-filled
df_toronto[df_toronto['Neighborhood'] == 'Not assigned']

Unnamed: 0,PostalCode,Borough,Neighborhood


In [76]:
# (rows, columns) of the frame after the 'Not assigned' boroughs were dropped
df_toronto.shape

(103, 3)

## 2. Adding Geospatial Coordinates

In [77]:
# Preview the cleaned postal-code frame before coordinates are attached
df_toronto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [78]:

# Fetch 'Geospatial_Coordinates.csv' from the object-storage bucket and keep its 'Body' stream.
# NOTE(review): `client_61bf4a96af25446cb70fbd99af7c40a1` is not defined in any visible cell —
# presumably created in the hidden Watson Studio cell at the top; confirm before re-running.
body = client_61bf4a96af25446cb70fbd99af7c40a1.get_object(Bucket='ibmdatasciencecapstone-donotdelete-pr-butz1ybluhym17',Key='Geospatial_Coordinates.csv')['Body']
# add missing __iter__ method, so pandas accepts body as file-like object
# NOTE(review): `types` and the bare `__iter__` function are not imported/defined in any visible
# cell — presumably they also come from the hidden cell; confirm.
if not hasattr(body, "__iter__"): body.__iter__ = types.MethodType( __iter__, body )

# If you are reading an Excel file into a pandas DataFrame, replace `read_csv` by `read_excel` in the next statement.
# Parse the streamed CSV into a DataFrame and preview the first rows
df_geo = pd.read_csv(body)
df_geo.head()


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [79]:
# df_geo holds the latitude/longitude per postal code, read from the coordinates CSV

# Align the key column name with df_toronto, then join the coordinates onto each postal code
df_geo.rename(columns={'Postal Code': 'PostalCode'}, inplace=True)
df_toronto_geo = df_toronto.merge(df_geo, on='PostalCode')
df_toronto_geo.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


## 3. Neighborhood Clustering (using K-means)

In [80]:
#Import necessary libraries
from sklearn.cluster import KMeans
#!pip install folium
import folium # map rendering library


In [81]:
# Initialise K-Means
kclusters = 5
# n_init is pinned because the library default changed from 10 to 'auto' in newer
# scikit-learn releases; a fixed value keeps the clustering reproducible across versions.
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0)

# Select the clustering features explicitly instead of dropping the non-feature columns:
# once 'Cluster Labels' has been added to df_toronto_geo, a drop-based selection would
# silently feed the previous labels back in as a feature when this cell is re-run.
toronto_cluster = df_toronto_geo[['Latitude', 'Longitude']]
toronto_cluster.head()
 

Unnamed: 0,Latitude,Longitude
0,43.753259,-79.329656
1,43.725882,-79.315572
2,43.65426,-79.360636
3,43.718518,-79.464763
4,43.662301,-79.389494


In [82]:
# Cluster the postal codes on their coordinates only (latitude / longitude)
kmeans.fit(toronto_cluster)

# Peek at the cluster label assigned to each of the first ten rows
kmeans.labels_[:10]

array([4, 4, 2, 3, 2, 1, 0, 4, 4, 2], dtype=int32)

In [83]:
# Add the clustering labels as a column.
# Plain assignment is idempotent: re-running this cell overwrites the column instead of
# appending a second 'Cluster Labels' column (which insert(..., allow_duplicates=True) did),
# and it does not depend on the frame having exactly five columns.
df_toronto_geo['Cluster Labels'] = kmeans.labels_
df_toronto_geo.head()


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels
0,M3A,North York,Parkwoods,43.753259,-79.329656,4
1,M4A,North York,Victoria Village,43.725882,-79.315572,4
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,2
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,3
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,2


In [87]:
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors

# Coordinates the map is centred on
latitude = 43.651070
longitude = -79.347015

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one hex colour per cluster,
# sampled evenly across the rainbow colormap
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add markers to the map: one circle per row, coloured by its cluster
for lat, lon, poi, cluster in zip(df_toronto_geo['Latitude'], df_toronto_geo['Longitude'], df_toronto_geo['Neighborhood'], df_toronto_geo['Cluster Labels']):
    # Labels run from 0 to kclusters - 1, so they index the palette directly
    # (the previous `cluster - 1` only worked because index -1 wraps around for cluster 0).
    marker_color = rainbow[int(cluster)]
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters