## Read data from wikilink to dataframe

In [2]:
import pandas as pd
import ssl

ssl._create_default_https_context = ssl._create_unverified_context

In [3]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [4]:
df = pd.read_html(url, header = 0)

In [5]:
raw_data = df[0]
raw_data.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [6]:
#drop boroughs = Not assigned
processed_data = raw_data[(raw_data['Borough'] == "Not assigned") == False] 

#reset index after dropping rows
pr_data = processed_data.reset_index() 
pr_data.drop("index", axis = 1, inplace = True)
pr_data.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [7]:
pr_data.shape

(103, 3)

## Import geospatial data and merge with location list

In [8]:
#import geospatial data
url2 = 'https://cocl.us/Geospatial_data'
df = pd.read_csv(url2)
df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [9]:
#merge data
result = pr_data.set_index('Postal Code').join(df.set_index('Postal Code'))
result.head()

Unnamed: 0_level_0,Borough,Neighbourhood,Latitude,Longitude
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M3A,North York,Parkwoods,43.753259,-79.329656
M4A,North York,Victoria Village,43.725882,-79.315572
M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [10]:
df = result.reset_index()

In [26]:
df

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


In [12]:
import folium # map rendering library

In [21]:
# Create map for the postcodes
lat = 43.65
lng = -79.38
map_canada = folium.Map(location = [lat, lng], zoom_start = 10)
# add markers to map
for lat, lng, borough in zip(df['Latitude'], df['Longitude'], df['Borough']):
    label = '{}'.format(borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_canada)  
map_canada

In [22]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [23]:
import json
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
CLIENT_ID  = 'GDEDXCBEOU3DQVA20TMIETH1I4IKSBLXFKULTJ4OSDBNVLZ3'
CLIENT_SECRET = 'REGYWYREW5GSCFDGRWPHAOQF40GQKWIEQCRFJ3RGQE0L01W5'
VERSION = '20180605'

In [27]:
columns = ['name', 'categories', 'lat', 'lng', 'Postal Code']
ndf = pd.DataFrame( columns=columns) #create empty data frame
ndf
for code, lat, lng in zip (df['Postal Code'], df['Latitude'], df['Longitude']): #loop through data in the df
    radius = 500
    limit = 100
    
    URL = 'https://api.foursquare.com/v2/venues/explore/?&client_id={}&client_secret={}&v={}&ll={},{}&radius{}&limit{}'.format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        lat,
        lng,
        radius,
        limit
    ) #create URL
    
    result = requests.get(URL).json()
    venues = result['response']['groups'][0]['items']
    nearby_venues = json_normalize(venues)
    nearby_venues = nearby_venues[['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']]
    nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis = 1)

    nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]
    nearby_venues['Postal Code'] = code 
    ndf = ndf.append(nearby_venues)# append dataframs for each iterations



In [69]:
ndf.head(45)

Unnamed: 0,name,categories,lat,lng,Postal Code
0,Allwyn's Bakery,Caribbean Restaurant,43.75984,-79.324719,M3A
1,Donalda Golf & Country Club,Golf Course,43.752816,-79.342741,M3A
2,Island Foods,Caribbean Restaurant,43.745866,-79.346035,M3A
3,Graydon Hall Manor,Event Space,43.763923,-79.342961,M3A
4,Galleria Supermarket,Supermarket,43.75352,-79.349518,M3A
5,LA Fitness,Gym,43.747665,-79.347077,M3A
6,Darband Restaurant,Middle Eastern Restaurant,43.755194,-79.348498,M3A
7,Tim Hortons,Café,43.760668,-79.326368,M3A
8,Starbucks Reserve Bar,Coffee Shop,43.735764,-79.344156,M3A
9,Naan & Kabob Halal,Middle Eastern Restaurant,43.742903,-79.305148,M3A


In [67]:
df_new = ndf.reset_index()

In [29]:
df_new.drop("index", axis = 1, inplace = True)

In [30]:
df_new.groupby('Postal Code').count()

Unnamed: 0_level_0,name,categories,lat,lng
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M1B,30,30,30,30
M1C,30,30,30,30
M1E,30,30,30,30
M1G,30,30,30,30
M1H,30,30,30,30
...,...,...,...,...
M9N,30,30,30,30
M9P,30,30,30,30
M9R,30,30,30,30
M9V,30,30,30,30


In [31]:
print('Number of unique categories:{}'.format(len(df_new['categories'].unique())))

Number of unique categories:259


In [32]:
#create one hot encoding
can_onehot = pd.get_dummies(df_new['categories'])


# add post code column back to dataframe
can_onehot['Postal Code'] = df_new['Postal Code'] 

# move neighborhood column to the first column
fixed_columns = [can_onehot.columns[-1]] + list(can_onehot.columns[:-1])
can_onehot = can_onehot[fixed_columns]

can_onehot.head()

Unnamed: 0,Postal Code,Afghan Restaurant,Airport,American Restaurant,Amphitheater,Antique Shop,Aquarium,Art Gallery,Art Museum,Arts & Crafts Store,...,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Xinjiang Restaurant,Yoga Studio,Zoo,Zoo Exhibit
0,M3A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M3A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M3A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M3A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M3A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
can_onehot.shape

(3090, 260)

In [34]:
can_onehot_grouped = can_onehot.groupby('Postal Code').mean().reset_index()

In [53]:
can_onehot_grouped

Unnamed: 0,Postal Code,Afghan Restaurant,Airport,American Restaurant,Amphitheater,Antique Shop,Aquarium,Art Gallery,Art Museum,Arts & Crafts Store,...,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Xinjiang Restaurant,Yoga Studio,Zoo,Zoo Exhibit
0,M1B,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.066667,0.3
1,M1C,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.066667,0.1
2,M1E,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.033333,0.0,0.000000,0.0
3,M1G,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.033333,0.0,0.000000,0.0
4,M1H,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.033333,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98,M9N,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.100000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0
99,M9P,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.066667,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0
100,M9R,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0
101,M9V,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0


## Create Clustering

In [44]:
# import k-means from clustering stage
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 5

canada_grouped_clustering = can_onehot_grouped.drop('Postal Code', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(canada_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([3, 2, 2, 2, 4, 2, 2, 2, 2, 2, 0, 0, 4, 4, 4, 4, 3, 4, 4, 4, 4, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 4, 0, 4, 0, 0, 4, 0, 4, 2, 4, 1, 4, 2, 0,
       1, 1, 1, 0, 2, 1, 2, 1, 4, 2, 1, 1, 1, 0, 1, 1, 1, 1, 4, 1, 1, 1,
       1, 1, 2, 1, 1, 4, 4, 0, 0, 1, 1, 4, 4, 4, 1, 1, 1, 2, 1, 0, 4, 2,
       2, 4, 1, 2, 4, 1, 1, 4, 4, 4, 4, 4, 4, 0, 4], dtype=int32)

In [58]:
# add clustering labels to postcode
can_onehot_grouped.insert(0, 'Cluster Labels', kmeans.labels_)




In [70]:
cand_grouped = can_onehot_grouped[['Postal Code', 'Cluster Labels']]
cand_grouped.head()

Unnamed: 0,Postal Code,Cluster Labels
0,M1B,3
1,M1C,2
2,M1E,2
3,M1G,2
4,M1H,4


In [63]:
#join to bring back lat long and other details
cand_grouped = cand_grouped.join(df.set_index('Postal Code') , on = 'Postal Code')

In [65]:
cand_grouped

Unnamed: 0,Postal Code,Cluster Labels,Borough,Neighbourhood,Latitude,Longitude
0,M1B,3,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,2,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,2,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,2,Scarborough,Woburn,43.770992,-79.216917
4,M1H,4,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...,...
98,M9N,4,York,Weston,43.706876,-79.518188
99,M9P,4,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,4,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ...",43.688905,-79.554724
101,M9V,0,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437


In [66]:
import numpy as np # library to handle data in a vectorized manner
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors


# create map
lat = 43.65
lng = -79.38
map_clusters = folium.Map(location=[lat, lng], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(cand_grouped['Latitude'], cand_grouped['Longitude'], cand_grouped['Postal Code'], cand_grouped['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters