# The process:
### 1. Get geo data
### 2. Get venues data
### 3. Clustering
### 4. Visualization

# Part1: Get the geo data

In [106]:
import geocoder
import re
import requests
from bs4 import BeautifulSoup
import pandas as pd
# to read the wiki information about zip code
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')

table = soup.find(soup.find("table",{"class":"wikitable sortable jquery-tablesorter"}))
table_body = table.find('tbody')
rows = table_body.find_all('tr')

PostalCode = list()
Borough = list()
Neighborhood = list()
for row in rows:
    column = list() 
    cols = row.find_all('td')
    for col in cols:
        column.append(col.get_text(strip=True))

    if len(column) != 0:
        if column[1] != "Not assigned":
            PostalCode.append(column[0])
            Borough.append(column[1])
            Neighborhood.append(column[2])
df = pd.DataFrame({"PostalCode": PostalCode, "Borough": Borough, "Neighborhood": Neighborhood}) 
df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,Business reply mail Processing Centre
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [107]:
# check our dataframe's shape
print(df.shape)

(103, 3)


In [28]:
# read the geo data about latitude and longitude
coords = pd.read_csv("Geospatial_coordinates.csv")
print(coords.shape)

# concate the postal codes and geo data
df = pd.concat([df, coords], axis=1)
df


(103, 3)


Unnamed: 0,PostalCode,Borough,Neighborhood,Postal Code,Latitude,Longitude
0,M3A,North York,Parkwoods,M1B,43.806686,-79.194353
1,M4A,North York,Victoria Village,M1C,43.784535,-79.160497
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",M1E,43.763573,-79.188711
3,M6A,North York,"Lawrence Manor, Lawrence Heights",M1G,43.770992,-79.216917
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",M1H,43.773136,-79.239476
...,...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",M9N,43.706876,-79.518188
99,M4Y,Downtown Toronto,Church and Wellesley,M9P,43.696319,-79.532242
100,M7Y,East Toronto,Business reply mail Processing Centre,M9R,43.688905,-79.554724
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",M9V,43.739416,-79.588437


# Part2: Get venues data

In [29]:
import requests # library to handle requests
import numpy as np

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

#!conda install -c conda-forge folium=0.5.0 --yes
import folium # map rendering library

In [34]:
# enter fourquare account data
CLIENT_ID = 'NWKSSSQR2ELNRHQMJX02G3WH2LXN2WL2KFZ0GHGA54IEL2C0' # your Foursquare ID
CLIENT_SECRET = 'FZDQ35DIDHUYOGEO5DYU1MFPXXUJ5ZIEXMMKVZDVLHRPAZW1' # your Foursquare Secret
VERSION = '20180605'
LIMIT = 30
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: NWKSSSQR2ELNRHQMJX02G3WH2LXN2WL2KFZ0GHGA54IEL2C0
CLIENT_SECRET:FZDQ35DIDHUYOGEO5DYU1MFPXXUJ5ZIEXMMKVZDVLHRPAZW1


In [35]:
# search venues information according to geo data
names=df['Neighborhood']
latitudes=df['Latitude']
longitudes=df['Longitude']
radius=1000
LIMIT=100
venues_list=[]
for name, lat, lng in zip(names, latitudes, longitudes):
    print(name)
            
    # create the API request URL
    url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
        CLIENT_ID, 
        CLIENT_SECRET, 
        VERSION, 
        lat, 
        lng, 
        radius, 
        LIMIT)
            
    # make the GET request
    results = requests.get(url).json()["response"]['groups'][0]['items']
        
    # return only relevant information for each nearby venue
    venues_list.append([(
        name, 
        lat, 
        lng, 
        v['venue']['name'], 
        v['venue']['location']['lat'], 
        v['venue']['location']['lng'],  
        v['venue']['categories'][0]['name']) for v in results])

nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']

Parkwoods
Victoria Village
Regent Park, Harbourfront
Lawrence Manor, Lawrence Heights
Queen's Park, Ontario Provincial Government
Islington Avenue
Malvern, Rouge
Don Mills
Parkview Hill, Woodbine Gardens
Garden District, Ryerson
Glencairn
West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale
Rouge Hill, Port Union, Highland Creek
Don Mills
Woodbine Heights
St. James Town
Humewood-Cedarvale
Eringate, Bloordale Gardens, Old Burnhamthorpe, Markland Wood
Guildwood, Morningside, West Hill
The Beaches
Berczy Park
Caledonia-Fairbanks
Woburn
Leaside
Central Bay Street
Christie
Cedarbrae
Hillcrest Village
Bathurst Manor, Wilson Heights, Downsview North
Thorncliffe Park
Richmond, Adelaide, King
Dufferin, Dovercourt Village
Scarborough Village
Fairview, Henry Farm, Oriole
Northwood Park, York University
East Toronto
Harbourfront East, Union Station, Toronto Islands
Little Portugal, Trinity
Kennedy Park, Ionview, East Birchmount Park
Bayview Village
Downsview
The Danforth West, Ri

In [36]:
# view the nearby venues
nearby_venues

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.806686,-79.194353,Harvey's,43.800020,-79.198307,Restaurant
1,Parkwoods,43.806686,-79.194353,Wendy's,43.802008,-79.198080,Fast Food Restaurant
2,Parkwoods,43.806686,-79.194353,Wendy’s,43.807448,-79.199056,Fast Food Restaurant
3,Parkwoods,43.806686,-79.194353,RBC Royal Bank,43.798782,-79.197090,Bank
4,Parkwoods,43.806686,-79.194353,Caribbean Wave,43.798558,-79.195777,Caribbean Restaurant
...,...,...,...,...,...,...,...
4907,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.739416,-79.588437,Panorama Park,43.747021,-79.583497,Park
4908,"Mimico NW, The Queensway West, South of Bloor,...",43.706748,-79.594054,Tim Hortons,43.714657,-79.593716,Coffee Shop
4909,"Mimico NW, The Queensway West, South of Bloor,...",43.706748,-79.594054,Saand Rexdale,43.705072,-79.598725,Drugstore
4910,"Mimico NW, The Queensway West, South of Bloor,...",43.706748,-79.594054,Toronto Pearson International Airport Pet Park,43.704901,-79.604441,Dog Run


In [38]:
# check out how many unique categories are there
print('There are {} uniques categories.'.format(len(nearby_venues['Venue Category'].unique())))

There are 333 uniques categories.


In [110]:
# one hot encoding
toronto_onehot = pd.get_dummies(nearby_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = nearby_venues['Neighborhood'] 

# move neighborhood column to the first column
toronto_onehot = toronto_onehot[['Neighborhood'] + [col for col in toronto_onehot.columns if col != 'Neighborhood']]

toronto_onehot.head()

Unnamed: 0,Neighborhood,Venue Category_Accessories Store,Venue Category_Afghan Restaurant,Venue Category_African Restaurant,Venue Category_Airport,Venue Category_American Restaurant,Venue Category_Amphitheater,Venue Category_Animal Shelter,Venue Category_Antique Shop,Venue Category_Aquarium,...,Venue Category_Video Store,Venue Category_Vietnamese Restaurant,Venue Category_Warehouse Store,Venue Category_Whisky Bar,Venue Category_Wine Bar,Venue Category_Wine Shop,Venue Category_Wings Joint,Venue Category_Women's Store,Venue Category_Yoga Studio,Venue Category_Zoo
0,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [43]:
toronto_onehot.shape

(4912, 333)

In [47]:
# group using the mean
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Accessories Store,Afghan Restaurant,African Restaurant,Airport,American Restaurant,Amphitheater,Animal Shelter,Antique Shop,Aquarium,...,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio,Zoo
0,Agincourt,0.01,0.000000,0.0,0.0,0.01,0.0,0.0,0.0,0.0,...,0.000000,0.00,0.000000,0.0,0.00,0.0,0.00,0.000000,0.000000,0.0
1,"Alderwood, Long Branch",0.00,0.000000,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,0.000000,0.00,0.000000,0.0,0.00,0.0,0.00,0.000000,0.000000,0.0
2,"Bathurst Manor, Wilson Heights, Downsview North",0.00,0.000000,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,0.032258,0.00,0.000000,0.0,0.00,0.0,0.00,0.000000,0.000000,0.0
3,Bayview Village,0.00,0.039216,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,0.000000,0.00,0.019608,0.0,0.00,0.0,0.00,0.019608,0.019608,0.0
4,"Bedford Park, Lawrence Manor East",0.00,0.000000,0.0,0.0,0.02,0.0,0.0,0.0,0.0,...,0.000000,0.00,0.000000,0.0,0.00,0.0,0.00,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92,"Willowdale, Newtonbrook",0.00,0.000000,0.0,0.0,0.01,0.0,0.0,0.0,0.0,...,0.010000,0.00,0.000000,0.0,0.00,0.0,0.01,0.010000,0.020000,0.0
93,Woburn,0.00,0.000000,0.0,0.0,0.01,0.0,0.0,0.0,0.0,...,0.000000,0.01,0.000000,0.0,0.00,0.0,0.00,0.000000,0.010000,0.0
94,Woodbine Heights,0.00,0.000000,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,0.000000,0.00,0.000000,0.0,0.00,0.0,0.00,0.000000,0.000000,0.0
95,York Mills West,0.00,0.000000,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,0.000000,0.01,0.000000,0.0,0.00,0.0,0.00,0.000000,0.010000,0.0


In [45]:
toronto_grouped.shape

(97, 333)

In [48]:
# create a function to sort the venues
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [78]:
# create the new dataframe
num_top_venues = 5

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

    
print(neighborhoods_venues_sorted.shape)
neighborhoods_venues_sorted.head()

(97, 6)


Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Agincourt,Café,Coffee Shop,Restaurant,Bar,Furniture / Home Store
1,"Alderwood, Long Branch",Pharmacy,Convenience Store,Park,Bank,Bakery
2,"Bathurst Manor, Wilson Heights, Downsview North",Coffee Shop,Bank,Convenience Store,Diner,Chinese Restaurant
3,Bayview Village,Coffee Shop,Indian Restaurant,Grocery Store,Gym,Bank
4,"Bedford Park, Lawrence Manor East",Coffee Shop,Café,Restaurant,Gastropub,Theater


# Part3: Clustering

In [79]:
# set number of clusters
kclusters = 8

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood',axis=1) #drop the neighborhood column

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[:] 

array([2, 1, 1, 1, 2, 3, 2, 2, 6, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 6, 2, 2,
       1, 2, 2, 1, 1, 1, 1, 0, 2, 0, 1, 2, 2, 1, 2, 2, 1, 2, 2, 1, 2, 2,
       2, 6, 2, 2, 2, 6, 2, 5, 2, 1, 2, 1, 0, 2, 1, 2, 1, 2, 6, 1, 2, 7,
       2, 6, 1, 2, 4, 1, 6, 1, 2, 2, 2, 2, 1, 0, 2, 1, 1, 2, 1, 1, 1, 1,
       2, 2, 2, 2, 2, 2, 6, 2, 2], dtype=int32)

In [80]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = df

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged#.head() # check the last columns!

Unnamed: 0,PostalCode,Borough,Neighborhood,Postal Code,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,M3A,North York,Parkwoods,M1B,43.806686,-79.194353,2.0,Coffee Shop,Trail,Restaurant,Fast Food Restaurant,Spa
1,M4A,North York,Victoria Village,M1C,43.784535,-79.160497,1.0,Breakfast Spot,Burger Joint,Playground,Park,Italian Restaurant
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",M1E,43.763573,-79.188711,1.0,Pizza Place,Bank,Fast Food Restaurant,Restaurant,Coffee Shop
3,M6A,North York,"Lawrence Manor, Lawrence Heights",M1G,43.770992,-79.216917,6.0,Park,Coffee Shop,Fast Food Restaurant,Indian Restaurant,Chinese Restaurant
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",M1H,43.773136,-79.239476,6.0,Bakery,Coffee Shop,Indian Restaurant,Gas Station,Bank
...,...,...,...,...,...,...,...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",M9N,43.706876,-79.518188,1.0,Coffee Shop,Train Station,Middle Eastern Restaurant,Soccer Field,Breakfast Spot
99,M4Y,Downtown Toronto,Church and Wellesley,M9P,43.696319,-79.532242,1.0,Pizza Place,Gas Station,Golf Course,Intersection,Flea Market
100,M7Y,East Toronto,Business reply mail Processing Centre,M9R,43.688905,-79.554724,6.0,Pharmacy,Pizza Place,Intersection,Business Service,Supermarket
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",M9V,43.739416,-79.588437,1.0,Pizza Place,Grocery Store,Construction & Landscaping,Gym Pool,Fast Food Restaurant


In [81]:
# check do we have a certain row with null values
print(toronto_merged.isnull().any())
print(toronto_merged['Cluster Labels'].dtypes)
# drop rows
toronto_merged.dropna(subset=["Cluster Labels"], axis=0, inplace = True)

# convert the Cluster Lables to an integer
toronto_merged['Cluster Labels'] = toronto_merged['Cluster Labels'].astype("int")

# check the rows have been droppped
toronto_merged

PostalCode               False
Borough                  False
Neighborhood             False
Postal Code              False
Latitude                 False
Longitude                False
Cluster Labels            True
1st Most Common Venue     True
2nd Most Common Venue     True
3rd Most Common Venue     True
4th Most Common Venue     True
5th Most Common Venue     True
dtype: bool
float64


Unnamed: 0,PostalCode,Borough,Neighborhood,Postal Code,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,M3A,North York,Parkwoods,M1B,43.806686,-79.194353,2,Coffee Shop,Trail,Restaurant,Fast Food Restaurant,Spa
1,M4A,North York,Victoria Village,M1C,43.784535,-79.160497,1,Breakfast Spot,Burger Joint,Playground,Park,Italian Restaurant
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",M1E,43.763573,-79.188711,1,Pizza Place,Bank,Fast Food Restaurant,Restaurant,Coffee Shop
3,M6A,North York,"Lawrence Manor, Lawrence Heights",M1G,43.770992,-79.216917,6,Park,Coffee Shop,Fast Food Restaurant,Indian Restaurant,Chinese Restaurant
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",M1H,43.773136,-79.239476,6,Bakery,Coffee Shop,Indian Restaurant,Gas Station,Bank
...,...,...,...,...,...,...,...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",M9N,43.706876,-79.518188,1,Coffee Shop,Train Station,Middle Eastern Restaurant,Soccer Field,Breakfast Spot
99,M4Y,Downtown Toronto,Church and Wellesley,M9P,43.696319,-79.532242,1,Pizza Place,Gas Station,Golf Course,Intersection,Flea Market
100,M7Y,East Toronto,Business reply mail Processing Centre,M9R,43.688905,-79.554724,6,Pharmacy,Pizza Place,Intersection,Business Service,Supermarket
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",M9V,43.739416,-79.588437,1,Pizza Place,Grocery Store,Construction & Landscaping,Gym Pool,Fast Food Restaurant


# Part4: Visualization

In [98]:
# get the geographical coordinates of Toronto
address = 'Toronto, Canada'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

  after removing the cwd from sys.path.


The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [99]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters