## Importing Libraries

In [109]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes
import folium # map rendering library

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

## Importing table from webpage to a Dataframe

In [2]:
# Download the Wikipedia page listing the postal codes that start with "M".
page = requests.get(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,  # requests applies no timeout by default; a stalled connection would hang the cell
)
# Stop here on an HTTP error instead of parsing an error page further down.
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

# soup.find returns the first <table> of the page, or None when there is none.
table = soup.find('table')
if table is None:
    raise ValueError("No <table> element found in the downloaded page")
table_rows = table.find_all('tr')

# Column names are the <th> cells of the first row, with newlines stripped.
headers = table_rows[0].find_all('th')
columns = [i.text.replace('\n', '') for i in headers]

# One list of cell texts per remaining <tr>; the first row is skipped because it
# only holds the headers (it has no <td> cells).
rows = [
    [cell.text.replace('\n', '') for cell in tr.find_all('td')]
    for tr in table_rows[1:]
]

# importing the raw data into a dataframe
data_frame = pd.DataFrame(rows, columns=columns)

data_frame.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


## Cleaning Data

In [13]:
# Keep only postal codes that have a borough, renumber the rows from 0, and use ','
# instead of ' /' to separate neighborhoods that share a postal code.
# Written as one chained expression that produces a new frame: assigning through
# chained indexing (frame[col][i] = value) on a filtered frame triggers pandas'
# SettingWithCopyWarning and is not guaranteed to modify the frame.
data_frame = (
    data_frame[data_frame['Borough'] != 'Not assigned']
    .reset_index(drop=True)
    .assign(
        # regex=False: ' /' is replaced as a literal substring, like str.replace
        Neighborhood=lambda df: df['Neighborhood'].str.replace(' /', ',', regex=False)
    )
)

print("Number of rows in the data_frame = {}".format(data_frame.shape[0]))
data_frame.head(10)

Number of rows in the data_frame = 103


Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


## Getting the geographical coordinates for each postal code in Toronto

In [142]:
# The coordinates are read from a local CSV file because the geocoder package was not working.
file = 'Geospatial_Coordinates.csv'
lat_lon = pd.read_csv(file)

# Left join on the postal code: every row of the Wikipedia table is kept and gets the
# columns of the CSV; the CSV's own key column is then dropped as it duplicates 'Postal code'.
toronto_neigh = pd.merge(
    data_frame,
    lat_lon,
    how='left',
    left_on='Postal code',
    right_on='Postal Code',
).drop(columns='Postal Code')

## Retrieving data from Foursquare

### Setting up

In [44]:
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

import os
import warnings

# Foursquare credentials are read from the environment so that they are never stored in
# the notebook. Export FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before starting
# the kernel.
# NOTE(review): any credentials that were committed with an earlier version of this cell
# should be regenerated, since they remain visible in the repository history.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')  # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')  # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

if not (CLIENT_ID and CLIENT_SECRET):
    # Warn right away rather than letting the API requests fail later with a less obvious error.
    warnings.warn(
        "FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; "
        "Foursquare requests will be rejected."
    )

### Retrieving venue data for all neighborhoods

In [70]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, LIMIT = 100):
    """Fetch the venues around each location from the Foursquare ``venues/explore`` endpoint.

    Parameters
    ----------
    names, latitudes, longitudes : iterable
        Label and coordinates of each location. They are walked in parallel with
        ``zip``, so iteration stops at the shortest of the three.
    radius : default 500
        Sent as the ``radius`` query parameter of every request.
    LIMIT : default 100
        Sent as the ``limit`` query parameter of every request.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Neighborhood', 'Neighborhood Latitude',
        'Neighborhood Longitude', 'Venue', 'Venue Latitude', 'Venue Longitude' and
        'Venue Category'. The frame has these columns even when no venue was returned.

    Raises
    ------
    requests.HTTPError
        When a request is answered with an HTTP error status.
    """
    venue_columns = ['Neighborhood',
                     'Neighborhood Latitude',
                     'Neighborhood Longitude',
                     'Venue',
                     'Venue Latitude',
                     'Venue Longitude',
                     'Venue Category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # Let requests build and escape the query string instead of formatting the
        # credentials into the URL by hand.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }

        # make the GET request; the timeout keeps a stalled connection from hanging the loop
        response = requests.get(endpoint, params=params, timeout=30)
        # surface HTTP errors here instead of as a KeyError on the JSON payload below
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            categories = venue['categories']
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # an empty category list would raise IndexError on [0]; store a missing value instead
                categories[0]['name'] if categories else None))

    # Passing the column names to the constructor also covers the case of zero venues,
    # where assigning seven names to a column-less frame would raise ValueError.
    nearby_venues = pd.DataFrame(venue_rows, columns=venue_columns)

    return(nearby_venues)

In [194]:
# One Foursquare query per row of toronto_neigh, with the radius raised from the
# function's default of 500 to 1500.
toronto_venues = getNearbyVenues(
    names=toronto_neigh['Neighborhood'],
    latitudes=toronto_neigh['Latitude'],
    longitudes=toronto_neigh['Longitude'],
    radius=1500,
)
toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Allwyn's Bakery,43.75984,-79.324719,Caribbean Restaurant
1,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
2,Parkwoods,43.753259,-79.329656,Donalda Golf & Country Club,43.752816,-79.342741,Golf Course
3,Parkwoods,43.753259,-79.329656,LCBO,43.757774,-79.314257,Liquor Store
4,Parkwoods,43.753259,-79.329656,Tim Hortons,43.760668,-79.326368,Café


## Analyzing the Neighborhoods

### One-hot encoding the venue categories

In [195]:
# one hot encoding: one indicator column per venue category, named after the category
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# A venue category may itself be called 'Neighborhood'. Writing the neighborhood names
# into a column of that name would silently overwrite the category's indicator column
# (and leave it in the middle of the frame instead of at the end), so the category
# column is renamed first. rename is a no-op when no such category exists.
toronto_onehot = toronto_onehot.rename(columns={'Neighborhood': 'Neighborhood (Venue Category)'})

# add the neighborhood name as the first column; insert() raises if the name is already taken
toronto_onehot.insert(0, 'Neighborhood', toronto_venues['Neighborhood'])

# group rows by neighborhood, taking the mean of the indicators, i.e. the share of the
# neighborhood's venues that fall in each category
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped.head()

Unnamed: 0,Neighborhood,Zoo Exhibit,ATM,Accessories Store,Afghan Restaurant,Airport,American Restaurant,Amphitheater,Animal Shelter,Antique Shop,...,Volleyball Court,Warehouse Store,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Xinjiang Restaurant,Yoga Studio,Zoo
0,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.020833,0.0,0.0,0.0,0.0
2,"Bathurst Manor, Wilson Heights, Downsview North",0.0,0.0,0.0,0.0,0.0,0.02439,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Bedford Park, Lawrence Manor East",0.0,0.0,0.0,0.0,0.0,0.013333,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.013333,0.0,0.0,0.0,0.0


### Putting data into a dataframe 

In [196]:
def return_most_common_venues(row, num_top_venues):
    """Return the index labels of the highest-valued entries of ``row``.

    The first entry of ``row`` is skipped, the remaining entries are sorted from
    highest to lowest value, and the index labels of the first ``num_top_venues``
    of them are returned as an array.
    """
    ranked_labels = row.iloc[1:].sort_values(ascending=False).index.values
    return ranked_labels[:num_top_venues]

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# Column names: 'Neighborhood', then '1st Most Common Venue', '2nd ...', '3rd ...', '4th ...', ...
# The suffix is chosen with an explicit bounds check instead of a bare `except:`, which
# would also hide unrelated errors (including KeyboardInterrupt).
columns = ['Neighborhood']
for ind in range(num_top_venues):
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind + 1, suffix))

# One record per row of toronto_grouped: the neighborhood name followed by its top
# categories. The frame is built once from the records rather than filled row by row.
top_venue_rows = [
    [grouped_row['Neighborhood'], *return_most_common_venues(grouped_row, num_top_venues)]
    for _, grouped_row in toronto_grouped.iterrows()
]
neighborhoods_venues_sorted = pd.DataFrame(top_venue_rows, columns=columns)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,Chinese Restaurant,Coffee Shop,Cantonese Restaurant,Asian Restaurant,Department Store,Caribbean Restaurant,Shopping Mall,Bakery,Breakfast Spot,Gym / Fitness Center
1,"Alderwood, Long Branch",Park,Pharmacy,Clothing Store,Toy / Game Store,Café,Burger Joint,Pizza Place,Bank,Restaurant,Coffee Shop
2,"Bathurst Manor, Wilson Heights, Downsview North",Park,Coffee Shop,Bank,Pizza Place,Gas Station,Restaurant,Supermarket,Ski Chalet,Baseball Field,Sushi Restaurant
3,Bayview Village,Gas Station,Trail,Park,Bank,Restaurant,Café,Grocery Store,Chinese Restaurant,Japanese Restaurant,Skating Rink
4,"Bedford Park, Lawrence Manor East",Bakery,Coffee Shop,Italian Restaurant,Sushi Restaurant,Bagel Shop,Pub,Asian Restaurant,Café,Fast Food Restaurant,Restaurant


## Clustering Neighborhoods and Plotting in the Map

### Clustering

In [201]:
# set number of clusters
kclusters = 4

# Feature matrix for k-means: every column of toronto_grouped except the neighborhood name.
# `columns=` is used because the positional axis argument (drop(label, 1)) is rejected
# by pandas 2.x.
toronto_grouped_clustering = toronto_grouped.drop(columns='Neighborhood')

# run k-means clustering
# n_init is given explicitly: its default differs between scikit-learn versions, which
# would otherwise change the labels for the same random_state.
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(toronto_grouped_clustering)

# add clustering labels; neighborhoods_venues_sorted has one row per row of toronto_grouped,
# in the same order, so the labels line up positionally
neighborhoods_venues_sorted['Cluster Labels'] = kmeans.labels_

# attach the top venues and the cluster label to every postal-code row of toronto_neigh,
# which carries the latitude/longitude; the join key is named explicitly
toronto_merged = pd.merge(toronto_neigh, neighborhoods_venues_sorted, on='Neighborhood', how='inner')

# number of postal-code rows per cluster
toronto_merged.groupby('Cluster Labels').count()

Unnamed: 0_level_0,Postal code,Borough,Neighborhood,Latitude,Longitude,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
Cluster Labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12
1,48,48,48,48,48,48,48,48,48,48,48,48,48,48,48
2,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42
3,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1


### Plotting

In [205]:
# create map, centred on a fixed point (the same coordinates as the Parkwoods neighborhood)
latitude = 43.753259
longitude = -79.329656

map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one hex colour per cluster label, evenly spaced
# along matplotlib's rainbow colormap
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add one circle marker per row of toronto_merged
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Labels run from 0 to kclusters-1, so they index the palette directly. Indexing with
    # cluster-1 only worked for label 0 because -1 wraps around to the last colour.
    marker_color = rainbow[int(cluster)]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

The map does not render on GitHub, so I uploaded a picture of it at this link:

https://github.com/gustavosalgado86/Coursera_Capstone/blob/master/Toronto%20Clustered.PNG