## Coursera Capstone Project

#### 1. We read a table from Wikipedia that describes postal codes and neighborhoods in Toronto, Canada

In [1]:
import pandas as pd
import numpy as np
import folium
from geopy.geocoders import Nominatim
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors



# Download the first table found on the Wikipedia page.
wiki_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
raw_table = pd.read_html(wiki_url)[0]
# The first row is used as the column header; only the rows below it are kept as data.
df = raw_table.iloc[1:].copy()
df.columns = raw_table.iloc[0]

#### 2. We drop all rows whose Borough is labeled "Not assigned"

In [2]:
# Keep only the rows whose borough is something other than "Not assigned".
has_borough = df['Borough'].ne("Not assigned")
df = df.loc[has_borough].copy()

#### 3. If a row has a borough but a "Not assigned" neighborhood value, then the neighborhood will be assigned the borough

In [3]:
# Rows with a "Not assigned" neighborhood take the borough name of the same row.
missing_hood = df['Neighborhood'] == "Not assigned"
df.loc[missing_hood, 'Neighborhood'] = df.loc[missing_hood, 'Borough']

#### 4.  We combine the cells of the "Neighborhood" column if they have the same Postal Code

In [4]:
# Collapse rows sharing a postal code into one row whose neighborhood names
# are joined into a single comma-separated string.
neighborhoods_by_code = df.groupby('Postal Code')['Neighborhood'].agg(', '.join)
df = neighborhoods_by_code.reset_index()

#### 5. Let's check the shape of our cleaned-up DataFrame

In [5]:
# (rows, columns) of the cleaned frame
df.shape

(103, 2)

#### 6. For mapping the neighborhoods with the folium package we need the corresponding coordinates for each one.

In [6]:
# Preview the first five rows of the cleaned frame.
df.head(n=5)

Unnamed: 0,Postal Code,Neighborhood
0,M1B,"Malvern, Rouge"
1,M1C,"Rouge Hill, Port Union, Highland Creek"
2,M1E,"Guildwood, Morningside, West Hill"
3,M1G,Woburn
4,M1H,Cedarbrae


In [7]:
# CSV with one latitude/longitude pair per postal code.
geo_file = 'https://cocl.us/Geospatial_data'
geo_df = pd.read_csv(filepath_or_buffer=geo_file)

geo_df.head(n=5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [8]:
# Attach coordinates to each postal code; the inner join keeps only
# postal codes that appear in both frames.
df = pd.merge(df, geo_df, on='Postal Code', how='inner')

#### 7. We find the coordinates of Toronto, create a map and fill it with our neighborhoods.

In [9]:
# Look up the coordinates of Toronto; they are used as the center of the maps below.
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match; fail with a clear
# message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [10]:
# Base map centered on the geocoded Toronto coordinates.
mapToronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# Styling shared by every neighborhood marker.
marker_style = dict(
    radius=5,
    color='green',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.8,
    parse_html=False,
)

# One circle per postal code; clicking it shows the neighborhood name(s).
for point_lat, point_lon, hood_name in zip(df['Latitude'], df['Longitude'], df['Neighborhood']):
    marker = folium.CircleMarker(
        [point_lat, point_lon],
        popup=folium.Popup(hood_name, parse_html=True),
        **marker_style)
    marker.add_to(mapToronto)

In [11]:
 # Evaluate the map object as the cell's last expression so the notebook renders it.
 mapToronto

#### 8. To cluster the neighborhoods we use k-means and split them into 5 clusters

In [12]:
# Feature matrix for clustering: the longitude and latitude columns of df.
toronto_points = df.loc[:, ['Longitude', 'Latitude']]
# 5 clusters, k-means++ seeding, 5 restarts.
k_means = KMeans(n_clusters=5, init="k-means++", n_init=5)
k_means.fit(X=toronto_points)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=5, n_init=5, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [13]:
# Cluster the postal codes into 5 groups by their coordinates.
# random_state fixes the centroid initialization so the cluster labels
# are the same on every run of the notebook.
kmeans = KMeans(n_clusters=5, init='k-means++', random_state=0)
# fit_predict fits the model and returns each row's label in one call,
# so no separate fit() is needed beforehand.
df['Cluster'] = kmeans.fit_predict(df[["Longitude", "Latitude"]])
centers = kmeans.cluster_centers_

In [14]:
# Preview the first twelve rows, now including the Cluster column.
df.head(n=12)

Unnamed: 0,Postal Code,Neighborhood,Latitude,Longitude,Cluster
0,M1B,"Malvern, Rouge",43.806686,-79.194353,1
1,M1C,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497,1
2,M1E,"Guildwood, Morningside, West Hill",43.763573,-79.188711,1
3,M1G,Woburn,43.770992,-79.216917,1
4,M1H,Cedarbrae,43.773136,-79.239476,1
5,M1J,Scarborough Village,43.744734,-79.239476,1
6,M1K,"Kennedy Park, Ionview, East Birchmount Park",43.727929,-79.262029,1
7,M1L,"Golden Mile, Clairlea, Oakridge",43.711112,-79.284577,3
8,M1M,"Cliffside, Cliffcrest, Scarborough Village West",43.716316,-79.239476,1
9,M1N,"Birch Cliff, Cliffside West",43.692657,-79.264848,3


In [15]:
# Fresh base map so the cluster-colored markers are not drawn over the earlier ones.
mapToronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# Number of palette entries; must cover every label in df['Cluster'].
kclusters = 5

# One evenly spaced color from the rainbow colormap per cluster, as hex strings.
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# One circle per postal code, colored by its cluster label.
for lat, lon, label, cluster in zip(df['Latitude'], df['Longitude'], df['Neighborhood'], df['Cluster']):
    label = folium.Popup(label, parse_html=True)
    # KMeans labels start at 0, so they index the palette directly
    # (no reliance on negative-index wraparound for label 0).
    cluster_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(mapToronto)

In [16]:
# Evaluate the map object as the cell's last expression so the notebook renders it.
mapToronto