# IBM Data Science Certificate - Capstone Project - Segmenting and Clustering 3/3

## Table of content:
* [1. Scrap toronto neighborhoods from wikipedia](#first-part)
* [2. Add geo coordinates to Toronto neighborhoods](#second-part)
* [3. Exploration and clustering of the Toronto  neighborhoods](#third-part)

## 1. Scrap toronto neighborhoods from wikipedia <a class="anchor" id="first-part">

In [117]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from sklearn.cluster import KMeans
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors

In [63]:
# read the wikipedia page containing postal codes for canada satrting with letter M
wikipedia_canada_post_codes_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
wikipedia_canada_post_codes_page =  requests.get(wikipedia_canada_post_codes_url)

In [64]:
# extract data from the html table containing the postal codes and put them in
# a pandas dat frame : df
soup_post_codes = BeautifulSoup(wikipedia_canada_post_codes_page .content, 'html.parser')
table_post_codes = soup_post_codes.find("table", {"class": "wikitable"})
df=pd.read_html(str(table_post_codes))[0] #read_html returns a list of dataframes, we take the first (and only) one
print(df)

    Postcode           Borough           Neighborhood
0        M1A      Not assigned           Not assigned
1        M2A      Not assigned           Not assigned
2        M3A        North York              Parkwoods
3        M4A        North York       Victoria Village
4        M5A  Downtown Toronto           Harbourfront
..       ...               ...                    ...
282      M8Z         Etobicoke              Mimico NW
283      M8Z         Etobicoke     The Queensway West
284      M8Z         Etobicoke  Royal York South West
285      M8Z         Etobicoke         South of Bloor
286      M9Z      Not assigned           Not assigned

[287 rows x 3 columns]


In [65]:
# remove rows where Borough is 'Not assigned'
df = df[df.Borough != 'Not assigned']
df.reset_index(drop=True, inplace=True)
# if a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.
df['Neighborhood'] = np.where(df['Neighborhood'] == 'Not assigned', df['Borough'], df['Neighborhood'])
df

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor
...,...,...,...
205,M8Z,Etobicoke,Kingsway Park South West
206,M8Z,Etobicoke,Mimico NW
207,M8Z,Etobicoke,The Queensway West
208,M8Z,Etobicoke,Royal York South West


In [66]:
df.shape

(210, 3)

## 2. Add geo coordinates to Toronto neighborhoods <a class="anchor" id="second-part">

In [67]:
#read csv file containing geo coordinates
df_geo = pd.read_csv('./data/Geospatial_data.csv')
#make sure that the column containing postal codes has the same name in df and df_geo datframes
df_geo=df_geo.rename(columns={'Postal Code':'Postcode' })
df_geo

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


In [68]:
#make an inner join between df and df_geo
df=pd.merge(df, df_geo)
df

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.654260,-79.360636
3,M6A,North York,Lawrence Heights,43.718518,-79.464763
4,M6A,North York,Lawrence Manor,43.718518,-79.464763
...,...,...,...,...,...
205,M8Z,Etobicoke,Kingsway Park South West,43.628841,-79.520999
206,M8Z,Etobicoke,Mimico NW,43.628841,-79.520999
207,M8Z,Etobicoke,The Queensway West,43.628841,-79.520999
208,M8Z,Etobicoke,Royal York South West,43.628841,-79.520999


## 3. Exploration, analysis and clustering of the Toronto  neighborhoods<a class="anchor" id="third-part">

### 3.1 Exploration of the Toronto  neighborhoods

In [76]:
#Foursquare credentials
CLIENT_ID = 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'
CLIENT_SECRET = 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'
VERSION = '20180605' 
LIMIT = 100 

**We'll study only the boroughs which contain the name 'Toronto' :**

In [77]:
df_toronto=df[df['Borough'].str.contains('Toronto', regex=False)]
df_toronto

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
2,M5A,Downtown Toronto,Harbourfront,43.654260,-79.360636
6,M9A,Downtown Toronto,Queen's Park,43.667856,-79.532242
12,M5B,Downtown Toronto,Ryerson,43.657162,-79.378937
13,M5B,Downtown Toronto,Garden District,43.657162,-79.378937
26,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
...,...,...,...,...,...
189,M4X,Downtown Toronto,St. James Town,43.667967,-79.367675
190,M5X,Downtown Toronto,First Canadian Place,43.648429,-79.382280
191,M5X,Downtown Toronto,Underground city,43.648429,-79.382280
195,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160


**Define a method to get venues for each neighborhood:**

In [78]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

**Apply this method to the toronto data frame :** 

In [79]:
toronto_venues = getNearbyVenues(names=df_toronto['Neighborhood'],
                                   latitudes=df_toronto['Latitude'],
                                   longitudes=df_toronto['Longitude']
                                  )

Harbourfront
Queen's Park
Ryerson
Garden District
St. James Town
The Beaches
Berczy Park
Central Bay Street
Christie
Adelaide
King
Richmond
Dovercourt Village
Dufferin
Harbourfront East
Toronto Islands
Union Station
Little Portugal
Trinity
The Danforth West
Riverdale
Design Exchange
Toronto Dominion Centre
Brockton
Exhibition Place
Parkdale Village
The Beaches West
India Bazaar
Commerce Court
Victoria Hotel
Studio District
Lawrence Park
Roselawn
Davisville North
Forest Hill North
Forest Hill West
High Park
The Junction South
North Toronto West
The Annex
North Midtown
Yorkville
Parkdale
Roncesvalles
Davisville
Harbord
University of Toronto
Runnymede
Swansea
Moore Park
Summerhill East
Chinatown
Grange Park
Kensington Market
Deer Park
Forest Hill SE
Rathnelly
South Hill
Summerhill West
CN Tower
Bathurst Quay
Island airport
Harbourfront West
King and Spadina
Railway Lands
South Niagara
Rosedale
Stn A PO Boxes 25 The Esplanade
Cabbagetown
St. James Town
First Canadian Place
Underground city

**Size of the resulting data frame :**

In [84]:
print(toronto_venues.shape)
toronto_venues.head()

(3199, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Harbourfront,43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,Harbourfront,43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,Harbourfront,43.65426,-79.360636,Cooper Koo Family YMCA,43.653191,-79.357947,Gym / Fitness Center
3,Harbourfront,43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
4,Harbourfront,43.65426,-79.360636,Impact Kitchen,43.656369,-79.35698,Restaurant


**Number of venues returned for each neighborhood:**

In [88]:
toronto_venues.groupby('Neighborhood').count()[['Venue']]

Unnamed: 0_level_0,Venue
Neighborhood,Unnamed: 1_level_1
Adelaide,100
Bathurst Quay,16
Berczy Park,57
Brockton,22
Business Reply Mail Processing Centre 969 Eastern,19
...,...
Underground city,100
Union Station,100
University of Toronto,35
Victoria Hotel,100


**Number of categories from al the returned venues:**

In [89]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 233 uniques categories.


### 3.2 Analysis of the Toronto  neighborhoods

In [162]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


**New dataframe size :**

In [163]:
toronto_onehot.shape

(3199, 233)

**Group rows by neighborhood and take the mean of the frequency of occurrence of each category :**

In [164]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint
0,Adelaide,0.000000,0.0,0.0000,0.0000,0.0000,0.000,0.000,0.000,0.020000,...,0.0,0.0,0.0,0.00,0.020000,0.000000,0.0,0.01,0.0,0.0
1,Bathurst Quay,0.000000,0.0,0.0625,0.0625,0.0625,0.125,0.125,0.125,0.000000,...,0.0,0.0,0.0,0.00,0.000000,0.000000,0.0,0.00,0.0,0.0
2,Berczy Park,0.000000,0.0,0.0000,0.0000,0.0000,0.000,0.000,0.000,0.000000,...,0.0,0.0,0.0,0.00,0.017544,0.000000,0.0,0.00,0.0,0.0
3,Brockton,0.000000,0.0,0.0000,0.0000,0.0000,0.000,0.000,0.000,0.000000,...,0.0,0.0,0.0,0.00,0.000000,0.000000,0.0,0.00,0.0,0.0
4,Business Reply Mail Processing Centre 969 Eastern,0.052632,0.0,0.0000,0.0000,0.0000,0.000,0.000,0.000,0.000000,...,0.0,0.0,0.0,0.00,0.000000,0.000000,0.0,0.00,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,Underground city,0.000000,0.0,0.0000,0.0000,0.0000,0.000,0.000,0.000,0.020000,...,0.0,0.0,0.0,0.01,0.010000,0.000000,0.0,0.01,0.0,0.0
68,Union Station,0.000000,0.0,0.0000,0.0000,0.0000,0.000,0.000,0.000,0.000000,...,0.0,0.0,0.0,0.01,0.010000,0.000000,0.0,0.01,0.0,0.0
69,University of Toronto,0.000000,0.0,0.0000,0.0000,0.0000,0.000,0.000,0.000,0.000000,...,0.0,0.0,0.0,0.00,0.000000,0.028571,0.0,0.00,0.0,0.0
70,Victoria Hotel,0.000000,0.0,0.0000,0.0000,0.0000,0.000,0.000,0.000,0.030000,...,0.0,0.0,0.0,0.00,0.010000,0.000000,0.0,0.01,0.0,0.0


**Data frame containing the top 10 most common venues for each enighborhood:**

In [165]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [166]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Adelaide,Coffee Shop,Café,Steakhouse,Thai Restaurant,Salad Place,Burger Joint,Bar,Bakery,Sushi Restaurant,Asian Restaurant
1,Bathurst Quay,Airport Terminal,Airport Lounge,Airport Service,Boutique,Rental Car Location,Coffee Shop,Boat or Ferry,Sculpture Garden,Bar,Airport Gate
2,Berczy Park,Coffee Shop,Cocktail Bar,Steakhouse,Cheese Shop,Beer Bar,Farmers Market,Bakery,Café,Seafood Restaurant,Irish Pub
3,Brockton,Café,Breakfast Spot,Coffee Shop,Furniture / Home Store,Burrito Place,Italian Restaurant,Stadium,Intersection,Restaurant,Bar
4,Business Reply Mail Processing Centre 969 Eastern,Light Rail Station,Gym / Fitness Center,Garden Center,Skate Park,Restaurant,Recording Studio,Pizza Place,Park,Garden,Spa


### 3.2 Toronto  neighborhoods clustering

**Run k-means to cluster the neighborhood into 5 clusters :**

In [167]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([1, 3, 1, 1, 1, 3, 1, 1, 1, 1], dtype=int32)

**New dataframe that includes the cluster as well as the top 10 venues for each neighborhood :**

In [168]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = df_toronto

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

# remove rows with NA values
toronto_merged.dropna(inplace=True)

# convert 'Cluster Labels' column type from float to int: 
convert_dict = {'Cluster Labels': int} 
toronto_merged = toronto_merged.astype(convert_dict) 

toronto_merged.head() # check the last columns!

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636,1,Coffee Shop,Bakery,Pub,Park,Mexican Restaurant,Breakfast Spot,Café,Restaurant,Cosmetics Shop,Shoe Store
12,M5B,Downtown Toronto,Ryerson,43.657162,-79.378937,1,Coffee Shop,Clothing Store,Cosmetics Shop,Middle Eastern Restaurant,Café,Restaurant,Fast Food Restaurant,Lingerie Store,Bubble Tea Shop,Pizza Place
13,M5B,Downtown Toronto,Garden District,43.657162,-79.378937,1,Coffee Shop,Clothing Store,Cosmetics Shop,Middle Eastern Restaurant,Café,Restaurant,Fast Food Restaurant,Lingerie Store,Bubble Tea Shop,Pizza Place
26,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,1,Coffee Shop,Café,Restaurant,Italian Restaurant,Bakery,Hotel,Clothing Store,Breakfast Spot,Diner,Park
35,M4E,East Toronto,The Beaches,43.676357,-79.293031,1,Asian Restaurant,Pub,Health Food Store,Trail,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Dumpling Restaurant


**Visualization of the clustering:**

In [170]:
toronto_latitude = 43.7001100
tornto_longitude = -79.4163000
# create map
map_clusters = folium.Map(location=[toronto_latitude, tornto_longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters