In [2]:
import requests
import pandas as pd
import numpy as np
import random
from IPython.display import Image 
from IPython.core.display import HTML 
from pandas.io.json import json_normalize
from bs4 import BeautifulSoup
import urllib.request

# Scraping webpage

In [3]:
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
table=pd.read_html(url)
table
res = requests.get(" https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
soup = BeautifulSoup(res.content,'lxml')
table = soup.find_all('table')[0]
df = pd.read_html(str(table))
data = df[0]

In [4]:
data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


## Ignoring rows that has 'Not assigned' values

In [5]:
df2 = data[data.Borough != 'Not assigned']
df2 = df2.sort_values(by = ['Postcode', 'Borough'])

df2.reset_index(inplace=True)
df2.drop('index',axis=1,inplace=True)

df2.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,Rouge
1,M1B,Scarborough,Malvern
2,M1C,Scarborough,Highland Creek
3,M1C,Scarborough,Rouge Hill
4,M1C,Scarborough,Port Union


## Joining duplicate values

In [12]:
df2 = df2.groupby(['Postcode','Borough'],as_index=False, sort=False).agg(lambda x: ', '.join(x))
df2['Neighbourhood']=set(df2['Neighbourhood'])
df2.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Emery,Humberlea"
1,M1C,Scarborough,"First Canadian Place,Underground city"
2,M1E,Scarborough,"Fairview,Henry Farm,Oriole"
3,M1G,Scarborough,Cedarbrae
4,M1H,Scarborough,"Moore Park,Summerhill East"


## Shape

In [7]:
df2.shape

(103, 3)

# Geospatial data

In [8]:
geospatial = pd.read_csv('http://cocl.us/Geospatial_data')
geospatial.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [9]:
geospatial.rename(columns={'Postal Code': 'Postcode'}, inplace=True)
geospatial.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


# Merging and final Dataframe

In [10]:
final_df = pd.merge(df2, geospatial, on='Postcode')
final_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Emery,Humberlea",43.806686,-79.194353
1,M1C,Scarborough,"First Canadian Place,Underground city",43.784535,-79.160497
2,M1E,Scarborough,"Fairview,Henry Farm,Oriole",43.763573,-79.188711
3,M1G,Scarborough,Cedarbrae,43.770992,-79.216917
4,M1H,Scarborough,"Moore Park,Summerhill East",43.773136,-79.239476


In [11]:
final_df.shape

(103, 5)

# Part-4. Data Analysis and Visualization

In [14]:
!conda install -c conda-forge folium=0.5.0 --yes

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    certifi-2019.9.11          |           py36_0         147 KB  conda-forge
    altair-3.2.0               |           py36_0         770 KB  conda-forge
    openssl-1.1.1d             |       h516909a_0         2.1 MB  conda-forge
    ca-certificates-2019.9.11  |       hecc5488_0         144 KB  conda-forge
    branca-0.3.1               |             py_0          25 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         3.2 MB

The following NEW packages will be 

In [15]:
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium

In [17]:
#Toronto coordinates
TORONTO_LAT = 43.70011
TORONTO_LONG = -79.4163

#Foursquare parameters
LIMIT = 100
RADIUS = 500

#Foursquare API URL templates
FS_EXPLORE_URL = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'

In [18]:
# Sensitive codes
CLIENT_ID = 'T2F2DEHKDLPTJGZWQSKHGYU0J1FLIJEGXHML5FUL4QMSY3JF'
CLIENT_SECRET = 'KKRFFC3UCKR00LBGF3EVUHPDKSORULHLTBMLTA02XIYOH4SZ'
VERSION = '20180604'

In [19]:
toronto_map = folium.Map(location=[TORONTO_LAT, TORONTO_LONG], zoom_start=11)

#add boroughs markers to the map
for lat, lng, borough, neighbourhood in zip(final_df['Latitude'], final_df['Longitude'], final_df['Borough'], final_df['Neighbourhood']):
    label = 'Borough: {}. Neigbourhoods: {}'.format(borough, neighbourhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker([lat, lng], radius=5, popup=label, color='blue', fill=True, fill_color='#3186cc', fill_opacity=0.7, parse_html=False).add_to(toronto_map)

In [20]:
toronto_map

### Create functions
    1. get_nearby_venues()
    2. most_common_venues()

In [21]:
def get_nearby_venues(names, latitudes, longitudes, radius=500):
    processed_hoods = []
    venues_list = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        url = FS_EXPLORE_URL.format(CLIENT_ID, CLIENT_SECRET, VERSION, lat, lng, radius, LIMIT) #create the API request URL
        results = requests.get(url).json()["response"]['groups'][0]['items'] #make the GET request        
        # return only relevant information for each nearby venue
        venues_list.append([(name, lat, lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])
        processed_hoods.append(name)
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 'Neighbourhood Latitude', 'Neighbourhood Longitude', 'Venue', 'Venue Latitude', 'Venue Longitude', 'Venue Category']
    print('Processed {} neighbourhoods.'.format(len(processed_hoods), processed_hoods))
    return(nearby_venues)


In [22]:
def most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)    
    return row_categories_sorted.index.values[0:num_top_venues]

### Get nearby venues for Toronto neighborhoods

In [24]:
toronto_venues = get_nearby_venues(names=final_df['Neighbourhood'],
                                   latitudes=final_df['Latitude'],
                                   longitudes=final_df['Longitude'])

Processed 103 neighbourhoods.


In [27]:
toronto_venues.head()

Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Emery,Humberlea",43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
1,"First Canadian Place,Underground city",43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
2,"First Canadian Place,Underground city",43.784535,-79.160497,Scarborough Historical Society,43.788755,-79.162438,History Museum
3,"Fairview,Henry Farm,Oriole",43.763573,-79.188711,Swiss Chalet Rotisserie & Grill,43.767697,-79.189914,Pizza Place
4,"Fairview,Henry Farm,Oriole",43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store


### One-hot encoding

In [28]:
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")
toronto_onehot['Neighbourhood'] = toronto_venues['Neighbourhood'] 
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

In [29]:
toronto_onehot.head()

Unnamed: 0,Neighbourhood,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,"Emery,Humberlea",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"First Canadian Place,Underground city",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"First Canadian Place,Underground city",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Fairview,Henry Farm,Oriole",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Fairview,Henry Farm,Oriole",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
toronto_grouped = toronto_onehot.groupby('Neighbourhood').mean().reset_index()

In [31]:
toronto_grouped.head()

Unnamed: 0,Neighbourhood,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,"Adelaide,King,Richmond",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,...,0.0,0.01,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0
2,"Agincourt North,L'Amoreaux East,Milliken,Steel...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Albion Gardens,Beaumond Heights,Humbergate,Jam...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Alderwood,Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
num_top_venues = 10 #only using 10 most frequent categories
indicators = ['st', 'nd', 'rd'] #numeric suffixes to beautify column names

#Create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))
        
#Create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighbourhood'] = toronto_grouped['Neighbourhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

In [34]:
neighborhoods_venues_sorted.head()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide,King,Richmond",Sandwich Place,Dessert Shop,Gym,Sushi Restaurant,Café,Coffee Shop,Italian Restaurant,Pizza Place,Brewery,Farmers Market
1,Agincourt,Coffee Shop,Hotel,Café,Restaurant,American Restaurant,Deli / Bodega,Italian Restaurant,Seafood Restaurant,Steakhouse,Bakery
2,"Agincourt North,L'Amoreaux East,Milliken,Steel...",Rental Car Location,Drugstore,Yoga Studio,Dog Run,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Doner Restaurant
3,"Albion Gardens,Beaumond Heights,Humbergate,Jam...",Cafeteria,Doner Restaurant,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Donut Shop,College Gym
4,"Alderwood,Long Branch",Pizza Place,Fast Food Restaurant,Pharmacy,Athletics & Sports,Pet Store,Gym / Fitness Center,Breakfast Spot,Gastropub,Intersection,Bank


# Clustering and Map

In [37]:
kclusters = 5 #Set number of clusters

toronto_grouped_clustering = toronto_grouped.drop('Neighbourhood', 1)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering) #Run k-means clustering
toronto_merged = final_df

#Merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood', how='inner')

In [38]:
map_clusters = folium.Map(location=[TORONTO_LAT, TORONTO_LONG], zoom_start=11) #Create a map of Toronto

#Set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

#Add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker([lat, lon], radius=5, popup=label, color=rainbow[cluster - 1], fill=True, fill_color=rainbow[cluster - 1], fill_opacity=0.7).add_to(map_clusters)
       
map_clusters