## Part 1

In [1]:
import os
from io import StringIO

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

In [None]:
# Download the Wikipedia page "List of postal codes of Canada: M" and save it as
# postal_codes.html, the file that the following cell opens and parses.
!wget https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M -O postal_codes.html

In [2]:
# Build the BeautifulSoup tree from the saved page using the 'html.parser'
# backend; the file handle is closed as soon as parsing is done.
with open("postal_codes.html", encoding="utf8") as page_file:
    soup = BeautifulSoup(page_file, 'html.parser')

First, the postal-code table is read from the HTML with pandas:

In [3]:
# `soup.table` is the first <table> element of the parsed page. Its markup is
# wrapped in a StringIO because pandas has deprecated passing a literal HTML
# string to read_html, whereas a file-like object is accepted by old and new
# releases alike.
table_html = StringIO(str(soup.table))

# header=0 takes the column names from the table's first row. read_html
# returns a list of frames; the first one is the table we want.
raw_table = pd.read_html(table_html, header=0)[0]
raw_table.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


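Next, the "Not assigned" values are handled: rows without a borough are dropped, and a neighbourhood without a name takes the name of its borough: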

In [4]:
# Drop postcodes that have no borough. The explicit .copy() makes the result an
# independent frame, so the assignment below modifies it directly instead of
# writing through a filtered view of the original.
raw_table = raw_table[raw_table["Borough"] != "Not assigned"].copy()

# Where only the neighbourhood is missing, fall back to the borough name.
# A single .loc assignment replaces the chained `frame[col][mask] = ...` form,
# which triggers SettingWithCopyWarning and can silently update a temporary.
unnamed = raw_table["Neighbourhood"] == "Not assigned"
raw_table.loc[unnamed, "Neighbourhood"] = raw_table.loc[unnamed, "Borough"]
raw_table.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Queen's Park
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


Rows that share a postcode are combined into a single row, with their neighbourhood names joined by commas:

In [5]:
def f(x):
    """Collapse the rows of one postcode group into a single row.

    Parameters
    ----------
    x : pandas.DataFrame
        Rows belonging to one postcode; must contain the columns
        "Postcode", "Borough" and "Neighbourhood".

    Returns
    -------
    pandas.DataFrame
        A one-row frame holding the first row's "Postcode" and "Borough"
        (and its index label) plus a "Neighbourhood" column with all of the
        group's neighbourhood names joined by ", ".
    """
    # .assign builds a new frame, so nothing is written onto the slice
    # returned by head(1) and the input group is left untouched.
    return (
        x[["Postcode", "Borough"]]
        .head(1)
        .assign(Neighbourhood=", ".join(x["Neighbourhood"]))
    )
# Combine the rows of each postcode with `f`. The columns `f` reads are
# selected explicitly, including the grouping column "Postcode": newer pandas
# releases warn about (and later stop) handing the grouping column to apply()
# implicitly, while an explicit selection works on old and new versions.
# reset_index(drop=True) discards the (Postcode, original row) index produced
# by apply and numbers the rows 0..n-1.
table = (
    raw_table
    .groupby("Postcode")[["Postcode", "Borough", "Neighbourhood"]]
    .apply(f)
    .reset_index(drop=True)
)
table.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [6]:
# Sanity check: print the (rows, columns) shape of the combined table, i.e. one
# row per distinct postcode and the three columns Postcode/Borough/Neighbourhood.
print(table.shape)

(103, 3)


## Part 2

The geospatial coordinates are read from a CSV file and joined to the data frame on the postcode.

In [7]:
geodata = pd.read_csv("Geospatial_Coordinates.csv")

# Index the coordinates by postal code so they can be looked up from
# table["Postcode"].
coords_by_postcode = geodata.set_index("Postal Code")

# Remove coordinate columns left over from a previous run of this cell before
# joining; without this, re-running the cell fails because the joined columns
# already exist in `table`. On a first run the drop is a no-op.
table = (
    table
    .drop(columns=coords_by_postcode.columns, errors="ignore")
    .join(coords_by_postcode, on="Postcode")
)
table.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


## Part 3

In [8]:
import folium
from sklearn.cluster import KMeans
import requests
import matplotlib.cm as cm
import matplotlib.colors as colors

In [9]:
# Start from a fresh Toronto map and add one circle marker per row of `table`.
map_toronto = folium.Map(location=[43.6532, -79.3832], zoom_start=10)

marker_rows = zip(
    table["Neighbourhood"],
    table["Borough"],
    table["Latitude"],
    table["Longitude"],
)
for neighborhood, borough, lat, lng in marker_rows:
    # Popup text shown for the marker: "<neighbourhood(s)>; <borough>".
    popup_text = '{}; {}'.format(neighborhood, borough)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    ).add_to(map_toronto)

map_toronto

In [18]:
# Foursquare API credentials. They are read from the FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET environment variables so that real keys never have
# to be saved inside the notebook; the placeholder strings are used only when
# the variables are not set.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', 'client-id')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', 'client-secret')
VERSION = '20180605' # Foursquare API version
LIMIT = 100  # sent as the `limit` query parameter of every venue request

In [12]:
def getNearbyVenues(names, latitudes, longitudes, radius=1000):
    """Query the Foursquare `venues/explore` endpoint once per location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Label, latitude and longitude of each location to query; they are
        consumed in parallel with zip().
    radius : int, default 1000
        Value sent as the `radius` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame
        is empty, but still has these columns, when no venue was returned.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.

    Notes
    -----
    Uses the module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # Let requests build and escape the query string instead of formatting
        # the URL by hand; the timeout keeps one stalled request from hanging
        # the whole notebook.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30,
        )
        # Fail with the HTTP status rather than with a KeyError further down.
        response.raise_for_status()

        # The venues are read from the first entry of response -> groups.
        groups = response.json()['response'].get('groups') or []
        items = groups[0]['items'] if groups else []

        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            if not categories:
                # A venue without a category cannot contribute to the
                # per-category analysis; skip it instead of raising IndexError.
                continue
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'],
            ))

    # Passing the column names to the constructor keeps the header intact even
    # when `rows` is empty; assigning `.columns` afterwards would raise then.
    nearby_venues = pd.DataFrame(rows, columns=columns)

    return nearby_venues

In [13]:
# Fetch the nearby venues for every row of `table`, using the (possibly
# combined) neighbourhood string as the label and the default search radius.
toronto_venues = getNearbyVenues(
    names=table['Neighbourhood'],
    latitudes=table['Latitude'],
    longitudes=table['Longitude'],
)
toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Rouge, Malvern",43.806686,-79.194353,Images Salon & Spa,43.802283,-79.198565,Spa
1,"Rouge, Malvern",43.806686,-79.194353,Caribbean Wave,43.798558,-79.195777,Caribbean Restaurant
2,"Rouge, Malvern",43.806686,-79.194353,Harvey's,43.800106,-79.198258,Fast Food Restaurant
3,"Rouge, Malvern",43.806686,-79.194353,Wendy's,43.802008,-79.19808,Fast Food Restaurant
4,"Rouge, Malvern",43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant


In [14]:
# One indicator column per venue category; the empty prefix and separator keep
# the bare category names as column labels.
toronto_onehot = pd.get_dummies(
    toronto_venues[['Venue Category']],
    prefix="",
    prefix_sep="",
)

# Attach the neighbourhood label, then rotate that last column to the front.
toronto_onehot['Neighbourhood'] = toronto_venues['Neighborhood']
fixed_columns = list(toronto_onehot.columns)
fixed_columns.insert(0, fixed_columns.pop())
toronto_onehot = toronto_onehot[fixed_columns]

# The mean of the indicators per neighbourhood is the share of that
# neighbourhood's venues falling into each category.
toronto_grouped = toronto_onehot.groupby('Neighbourhood', as_index=False).mean()
toronto_grouped.head()

Unnamed: 0,Neighbourhood,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Lounge,American Restaurant,Amphitheater,Animal Shelter,Antique Shop,...,Vietnamese Restaurant,Warehouse Store,Waste Facility,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio,Zoo
0,"Adelaide, King, Richmond",0.0,0.0,0.0,0.0,0.0,0.03,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0
1,Agincourt,0.0,0.0,0.0,0.0,0.0,0.020408,0.0,0.0,0.0,...,0.020408,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Agincourt North, L'Amoreaux East, Milliken, St...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
kclusters = 5

# Cluster on the category frequencies only; the neighbourhood name is a label,
# not a feature. `columns=` is used because the positional axis argument of
# drop() is no longer accepted by current pandas releases.
toronto_grouped_clustering = toronto_grouped.drop(columns='Neighbourhood')

# run k-means clustering
# n_init is pinned to 10 restarts so the result does not depend on which
# scikit-learn default is installed; random_state keeps the run reproducible.
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([1, 3, 3, 3, 4, 3, 4, 1, 1, 1, 4, 1, 1, 1, 1, 1, 4, 1, 3, 1, 1, 1,
       1, 3, 3, 3, 4, 1, 1, 1, 1, 3, 1, 1, 3, 1, 1, 3, 4, 3, 3, 1, 4, 3,
       1, 1, 1, 3, 3, 1, 1, 1, 1, 4, 4, 4, 4, 4, 3, 4, 3, 1, 3, 1, 1, 1,
       1, 1, 1, 1, 1, 2, 3, 1, 4, 1, 4, 1, 3, 1, 1, 3, 0, 1, 1, 1, 1, 1,
       1, 1, 3, 1, 3, 4, 3, 3, 1, 3, 3, 3, 4, 4])

In [16]:
# kmeans.labels_ is ordered like the rows of toronto_grouped, so pairing the
# two gives a neighbourhood -> cluster lookup.
label_by_neighbourhood = dict(zip(toronto_grouped["Neighbourhood"], kmeans.labels_))

# A single vectorised lookup replaces one boolean-mask assignment per
# neighbourhood. Neighbourhoods that are missing from toronto_grouped keep the
# fallback label 0, exactly as before.
# NOTE(review): that fallback is indistinguishable from real cluster 0 --
# confirm whether such rows should be flagged or excluded instead.
table["Label"] = (
    table["Neighbourhood"]
    .map(label_by_neighbourhood)
    .fillna(0)
    .astype("int64")
)
table.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Label
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353,3
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,4
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,3
3,M1G,Scarborough,Woburn,43.770992,-79.216917,3
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,3


In [17]:
# Redraw the Toronto map with each marker filled in its cluster's colour.
map_toronto = folium.Map(location=[43.6532, -79.3832], zoom_start=10)

# One evenly spaced colour of the rainbow colormap per cluster, as hex strings.
# (The earlier x/ys computation only contributed its length, kclusters.)
rainbow = [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, kclusters))]

marker_rows = zip(
    table["Neighbourhood"],
    table["Borough"],
    table["Latitude"],
    table["Longitude"],
    table["Label"],
)
for neighborhood, borough, lat, lng, cluster in marker_rows:
    # Popup text shown for the marker: "<neighbourhood(s)>; <borough>".
    popup_text = '{}; {}'.format(neighborhood, borough)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color='blue',
        fill=True,
        # int() guarantees a valid list index whatever integer type the
        # Label column holds.
        fill_color=rainbow[int(cluster)],
        fill_opacity=0.7,
        parse_html=False,
    ).add_to(map_toronto)

map_toronto