<H1> Capstone Project Notebook </H1>

In [2]:
import os
from io import StringIO

import numpy as np
import pandas as pd

In [3]:
#Print a greeting message
print("Hello Capstone Project Course!")

Hello Capstone Project Course!


<B>Part 1</B>

In [4]:
#!conda install -c conda-forge lxml --yes #Install LXML package to use with Pandas HTML parser.
#!conda install -c conda-forge folium=0.5.0 --yes #Install Folium to draw maps

In [5]:
from bs4 import BeautifulSoup 
pd.options.mode.chained_assignment = None #disable chained-assignment warnings for Pandas
pd.set_option('display.max_columns', None) #show every column when a frame is displayed
pd.set_option('display.max_rows', None) #show every row when a frame is displayed

import requests # library to handle requests
# NOTE(review): pandas.io.json.json_normalize is deprecated in favour of pandas.json_normalize;
# the name is not used in the visible cells, so confirm whether this import is still needed.
from pandas.io.json import json_normalize # transform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans #import k-means from clustering stage

import folium # map rendering library

In [6]:
#Data sources
WIKI_URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
GEODATA_CSV = 'https://cocl.us/Geospatial_data'

#Toronto coordinates (used as the centre of the folium maps)
TORONTO_LAT = 43.70011
TORONTO_LONG = -79.4163

#Foursquare parameters
LIMIT = 100 #maximum number of venues requested per location
RADIUS = 500 #search radius around each location
#Credentials are read from the environment so that secrets are never stored in the notebook.
#Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before starting the kernel.
#NOTE(review): keys that were previously committed in this cell should be rotated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
if not (CLIENT_ID and CLIENT_SECRET):
    print('Warning: FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; Foursquare requests will fail.')
VERSION = '20190425'
#Foursquare API URL templates
#Placeholders, in order: client id, client secret, version, latitude, longitude, radius, limit
FS_EXPLORE_URL = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'


In [7]:
#Fetch the Wikipedia page of Toronto postal codes (URL comes from the WIKI_URL config constant)
req = requests.get(WIKI_URL, timeout=30)
req.raise_for_status() #stop on an HTTP error instead of parsing an error page

soup = BeautifulSoup(req.content,'lxml')

#Take the first <table> on the page (assumed to be the postal-code table)
table = soup.find_all('table')[0]

#read_html returns a list of dataframes; the markup is wrapped in StringIO
#because newer pandas versions deprecate passing raw HTML strings
df = pd.read_html(StringIO(str(table)))

neighborhood = df[0]

In [8]:
#Keep only the rows that have a borough assigned
has_borough = neighborhood['Borough'] != 'Not assigned'
neighborhood_data = neighborhood.loc[has_borough].reset_index(drop=True)

#Where the neighbourhood is not assigned, fall back to the borough name
unnamed = neighborhood_data['Neighbourhood'] == 'Not assigned'
neighborhood_data.loc[unnamed, 'Neighbourhood'] = neighborhood_data['Borough']

#One row per (postcode, borough) with its neighbourhoods joined by commas
toronto_neighborhood = (
    neighborhood_data
    .groupby(['Postcode', 'Borough'])['Neighbourhood']
    .apply(','.join)
    .reset_index()
)
toronto_neighborhood

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


In [9]:
#Show the (rows, columns) dimensions of the grouped frame
toronto_neighborhood.shape

(103, 3)

<B>Part 2</B>

In [10]:
#Read the postal-code coordinates from the CSV source
geo_data = pd.read_csv(GEODATA_CSV)

#Report how many rows were loaded and which columns are available
row_count = len(geo_data)
print('Data loaded. Rows: ', row_count)
print('Columns: ', geo_data.columns)

Data loaded. Rows:  103
Columns:  Index(['Postal Code', 'Latitude', 'Longitude'], dtype='object')


Now, we can join two frames and check the result. Luckily, there is a merge() method in Pandas, which is very much SQL-like:

In [11]:
#Attach coordinates to each postcode: inner-join on the postal code,
#then drop the duplicate key column that comes from the right-hand frame
geo_zipcodes = (
    toronto_neighborhood
    .merge(geo_data, how='inner', left_on='Postcode', right_on='Postal Code')
    .drop(columns='Postal Code')
)

Let's have a look at the combined frame:

In [12]:
#Preview the first rows of the merged frame
geo_zipcodes.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


<b>Part 3</b>

Let's create a map of Toronto boroughs:

In [13]:
#Base map centred on Toronto
toronto_map = folium.Map(location=[TORONTO_LAT, TORONTO_LONG], zoom_start=10)

#Add one circle marker per postcode, with the borough and its neighbourhoods in the popup
for lat, lng, borough, neighbourhood in zip(geo_zipcodes['Latitude'], geo_zipcodes['Longitude'], geo_zipcodes['Borough'], geo_zipcodes['Neighbourhood']):
    label_text = 'Borough: {}. Neigbourhoods: {}'.format(borough, neighbourhood)
    #parse_html=True makes folium escape the text so special characters cannot break the popup markup
    popup = folium.Popup(label_text, parse_html=True)
    #parse_html is a Popup option, not a CircleMarker one, so it is not passed to the marker
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(toronto_map)

toronto_map

Next, we prepare a couple of functions which will help us later:
* a function that gets the nearby venues for a list of neighbourhoods — `getNearbyVenues()`
* a function that returns a neighbourhood's most common venue categories, sorted in descending order of frequency — `return_most_common_venues()`

In [14]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Collect the venues Foursquare reports around each given location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Label and coordinates of every location to query; they are consumed in parallel.
    radius : int, optional
        Search radius passed to the Foursquare explore endpoint (default 500).

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Neighbourhood', 'Neighbourhood Latitude',
        'Neighbourhood Longitude', 'Venue', 'Venue Latitude', 'Venue Longitude' and
        'Venue Category'. The frame is empty (but has these columns) when no venue is found.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status (for example invalid credentials).
    """
    venue_columns = ['Neighbourhood', 'Neighbourhood Latitude', 'Neighbourhood Longitude', 'Venue', 'Venue Latitude', 'Venue Longitude', 'Venue Category']
    venue_rows = []
    processed_count = 0
    for name, lat, lng in zip(names, latitudes, longitudes):
        url = FS_EXPLORE_URL.format(CLIENT_ID, CLIENT_SECRET, VERSION, lat, lng, radius, LIMIT) #create the API request URL
        response = requests.get(url, timeout=30) #make the GET request
        response.raise_for_status() #surface API failures instead of a confusing KeyError below
        items = response.json()['response']['groups'][0]['items']
        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            #a venue without categories gets None instead of raising IndexError
            category = categories[0]['name'] if categories else None
            venue_rows.append((name, lat, lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                category))
        processed_count += 1
    #passing the columns to the constructor keeps the schema even when no venue was collected
    nearby_venues = pd.DataFrame(venue_rows, columns=venue_columns)
    print('Processed {} neighbourhoods.'.format(processed_count))
    return nearby_venues

In [15]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest entries of `row`, skipping its first element.

    The first element of `row` is excluded, the remaining values are sorted in
    descending order, and the index labels of the first `num_top_venues` of them
    are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [16]:
#Fetch venues around every postcode centroid; the radius comes from the RADIUS config constant
#instead of silently relying on the function's own default
toronto_venues = getNearbyVenues(names=geo_zipcodes['Neighbourhood'],
                                   latitudes=geo_zipcodes['Latitude'],
                                   longitudes=geo_zipcodes['Longitude'],
                                   radius=RADIUS)

Processed 103 neighbourhoods.


Before further actions we need to prepare data and convert all string category values to numbers using one-hot encoding:

In [17]:
#One indicator column per venue category (no prefix, so columns carry the bare category name)
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

#Append the neighbourhood label, then rotate the column order so the last column comes first
toronto_onehot['Neighbourhood'] = toronto_venues['Neighbourhood']
fixed_columns = list(toronto_onehot.columns)
fixed_columns.insert(0, fixed_columns.pop())
toronto_onehot = toronto_onehot[fixed_columns]

In [18]:
#Average the one-hot rows per neighbourhood, so each value is the mean of that category's indicator within the neighbourhood
toronto_grouped = toronto_onehot.groupby('Neighbourhood').mean().reset_index()

All this data should go into its own dataframe:

In [19]:
num_top_venues = 10 #only using 10 most frequent categories
indicators = ['st', 'nd', 'rd'] #numeric suffixes to beautify column names

#Create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    #explicit bounds check instead of a bare except: positions past the third use 'th'
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

#Rank the categories of every neighbourhood, then build the frame in one step
#rather than filling it row by row
top_venues = [
    return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)
    for ind in range(toronto_grouped.shape[0])
]
neighborhoods_venues_sorted = pd.DataFrame(top_venues, columns=columns[1:], index=toronto_grouped.index)
neighborhoods_venues_sorted.insert(0, 'Neighbourhood', toronto_grouped['Neighbourhood'])

We're ready to perform some clustering magic and combine the clustering results with our dataframe:

In [20]:
kclusters = 5 #Set number of clusters

#Feature matrix for clustering: every column except the neighbourhood label.
#The keyword form is used because pandas 2.x no longer accepts the positional axis argument.
toronto_grouped_clustering = toronto_grouped.drop(columns='Neighbourhood')

#Run k-means clustering; n_init is pinned so the result does not depend on the scikit-learn default
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(toronto_grouped_clustering)

#Add clustering labels; a label column left over from a previous run of this cell is dropped first,
#so re-running the cell does not fail with "already exists"
neighborhoods_venues_sorted = neighborhoods_venues_sorted.drop(columns='Cluster Labels', errors='ignore')
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

#Join the cluster labels and top venues onto geo_zipcodes so every row has latitude/longitude;
#the inner join keeps only neighbourhoods that are present in both frames
toronto_merged = geo_zipcodes.join(neighborhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood', how='inner')

And, finally, we have a map of our neighbourhoods clustered by venues:

In [21]:
map_clusters = folium.Map(location=[TORONTO_LAT, TORONTO_LONG], zoom_start=11) #Create a map of Toronto

#Set color scheme for the clusters: one evenly spaced rainbow colour per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

#Add markers to the map
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    cluster = int(cluster) #plain int so the label can index the colour list
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    #labels run from 0 to kclusters-1, so they index the palette directly (no "- 1" offset)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7,
    ).add_to(map_clusters)

map_clusters

<H2>Thanks a lot!!!</H2>