# Task 1
## Install beautifulsoup4 and import libraries

In [2]:
#pip install beautifulsoup4    #Comment out after installation successful
import urllib.request, urllib.parse, urllib.error
import ssl
from bs4 import BeautifulSoup

In [3]:
import numpy as np
import pandas as pd

## Read data from Web into a dataframe

In [4]:
# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
html = urllib.request.urlopen(url, context=ctx).read()
soup = BeautifulSoup(html, 'html.parser')

table = soup.find_all('table')[0] # Grab the first table

df = pd.DataFrame(columns=range(0,3)) 

row_marker = 0
for row in table.find_all('tr'):
    column_marker = 0
    columns = row.find_all('td')
    for column in columns:
        df.loc[row_marker,column_marker] = column.get_text()
        column_marker += 1
    row_marker+=1
print("size of dataframe:", df.shape)
df.head(3)

size of dataframe: (287, 3)


Unnamed: 0,0,1,2
1,M1A,Not assigned,Not assigned\n
2,M2A,Not assigned,Not assigned\n
3,M3A,North York,Parkwoods\n


## Clean the data
1. Rename column head
1. Replace "\n" 
1. Filter out Borough = "Not assigned"
1. Replace "Not assigned" neighborhood with name of borough

In [5]:
df.columns = ['Postal Code', 'Borough', 'Nb']                       # Rename column head
df.replace(to_replace='\\n', value="", regex=True, inplace=True)   # Replace \n columns

df=df[df['Borough']!="Not assigned"].dropna()                      # Filter out Borough = "Not assigned"

for r,j in df[df['Nb']=="Not assigned"].iterrows():                # Loop through rows with "Not assigned" neighborhood 
    df.loc[r]['Nb'] = df.loc[r]['Borough']                         # and replace with name of borough

print("size of dataframe:", df.shape)
df.head(3)

size of dataframe: (210, 3)


Unnamed: 0,Postal Code,Borough,Nb
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront


## Group rows with same PostalCode and Borough

In [6]:
df['Neighborhood'] = df.groupby(['Postal Code', 'Borough']).transform(lambda x: ', '.join(sorted(x)))
df_gp = df.groupby(['Postal Code', 'Borough','Neighborhood']).count()
df_gp=df_gp.drop('Nb', axis=1).reset_index()
print("size of dataframe:", df_gp.shape)
df_gp.head()

size of dataframe: (103, 3)


Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Highland Creek, Port Union, Rouge Hill"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [7]:
df_gp.shape

(103, 3)

# Task 2
## Get geocode information from csv file

In [8]:
df_geocode = pd.read_csv('http://cocl.us/Geospatial_data')
print("size of dataframe:", df_geocode.shape)
df_geocode.head(3)

size of dataframe: (103, 3)


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711


## Combine the Geocode with neighborhood data

In [9]:
df_withGeo = pd.merge(df_gp,df_geocode, on=['Postal Code'], how='left')

In [10]:
print("Missing Lat/Long data: ", df_withGeo[df_withGeo['Latitude']==np.nan].shape[0])
print("size of dataframe:", df_withGeo.shape)
df_withGeo.head()

Missing Lat/Long data:  0
size of dataframe: (103, 5)


Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Port Union, Rouge Hill",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


# Task 3
## Limit data to boroughs that contain the word Toronto

In [11]:
df_Toronto = df_withGeo[df_withGeo['Borough'].str.contains('Toronto')]
print("size of dataframe:", df_Toronto.shape)
df_Toronto.head()

size of dataframe: (38, 5)


Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,M4K,East Toronto,"Riverdale, The Danforth West",43.679557,-79.352188
42,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


## Get coordinates of Toronto

In [12]:
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

In [13]:
address = 'Toronto'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Toronto are 43.653963, -79.387207.


## Create a map of Toronto

In [14]:
import folium # map rendering library

In [15]:
# create map of Manhattan using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(df_Toronto['Latitude'], df_Toronto['Longitude'], df_Toronto['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

## Define Foursquare Credentials and Version

In [16]:
CLIENT_ID = 'I2GDHXDCPDKU5WXYUPJWM551N3JVWEFK0WOZFHYQAVRFEHW3' # your Foursquare ID
CLIENT_SECRET = 'WJV00LZPAEDAJ4OWDUHSEA5FEXBYN4YCHKGUFCCOKRKJC3Z4' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: I2GDHXDCPDKU5WXYUPJWM551N3JVWEFK0WOZFHYQAVRFEHW3
CLIENT_SECRET:WJV00LZPAEDAJ4OWDUHSEA5FEXBYN4YCHKGUFCCOKRKJC3Z4


## Explore Neighborhood in Toronto
Get 10 venues from Foursquare for each neighborhood

In [17]:
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

In [18]:
def getNearbyVenues(names, latitudes, longitudes, LIMIT=10, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
#        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Nb', 
                  'Nb Latitude', 
                  'Nb Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [19]:
toronto_venues = getNearbyVenues(names=df_Toronto['Neighborhood'],
                                   latitudes=df_Toronto['Latitude'],
                                   longitudes=df_Toronto['Longitude']
                                  )
print("{} venues Imported.".format(toronto_venues.shape[0]))

340 venues Imported.


In [20]:
print("size of dataframe:", toronto_venues.shape)
toronto_venues.head()

size of dataframe: (340, 7)


Unnamed: 0,Nb,Nb Latitude,Nb Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,The Beaches,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
3,The Beaches,43.676357,-79.293031,Glen Stewart Ravine,43.6763,-79.294784,Other Great Outdoors
4,The Beaches,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood


In [21]:
print('There are {} uniques categories and {} unique venues.'.format(
    len(toronto_venues['Venue Category'].unique()),
    len(toronto_venues['Venue'].unique())))

There are 116 uniques categories and 303 unique venues.


## Analyze Each Neighborhood

In [22]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")
# add neighborhood column back to dataframe
toronto_onehot['Nb'] = toronto_venues['Nb'] 
# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

print("size of dataframe:", toronto_onehot.shape)
toronto_onehot.head()

size of dataframe: (340, 117)


Unnamed: 0,Nb,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Terminal,American Restaurant,Arts & Crafts Store,Asian Restaurant,Auto Workshop,...,Swim School,Tea Room,Tennis Court,Thai Restaurant,Theater,Theme Restaurant,Trail,Vegetarian / Vegan Restaurant,Wine Bar,Yoga Studio
0,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
toronto_grouped = toronto_onehot.groupby('Nb').mean().reset_index()
print("size of dataframe:", toronto_grouped.shape)
toronto_grouped.head()

size of dataframe: (38, 117)


Unnamed: 0,Nb,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Terminal,American Restaurant,Arts & Crafts Store,Asian Restaurant,Auto Workshop,...,Swim School,Tea Room,Tennis Court,Thai Restaurant,Theater,Theme Restaurant,Trail,Vegetarian / Vegan Restaurant,Wine Bar,Yoga Studio
0,"Adelaide, King, Richmond",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0
1,"Bathurst Quay, CN Tower, Harbourfront West, Is...",0.1,0.1,0.1,0.2,0.1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.1,0.0,0.0
3,"Brockton, Exhibition Place, Parkdale Village",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Business Reply Mail Processing Centre 969 Eastern,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Find out the Top 5 most popular venue in each Neighborhood

In [24]:
num_top_venues=5

def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [25]:
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Nb']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Popular Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Popular Venue'.format(ind+1))

# create a new dataframe
venues_sorted = pd.DataFrame(columns=columns)
venues_sorted['Nb'] = toronto_grouped['Nb']

for ind in np.arange(toronto_grouped.shape[0]):
    venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

print("size of dataframe:", venues_sorted.shape)
venues_sorted.head()

size of dataframe: (38, 6)


Unnamed: 0,Nb,1st Popular Venue,2nd Popular Venue,3rd Popular Venue,4th Popular Venue,5th Popular Venue
0,"Adelaide, King, Richmond",Steakhouse,Coffee Shop,Café,Opera House,Speakeasy
1,"Bathurst Quay, CN Tower, Harbourfront West, Is...",Airport Lounge,Airport,Harbor / Marina,Plane,Bar
2,Berczy Park,Liquor Store,Vegetarian / Vegan Restaurant,Park,Concert Hall,Beer Bar
3,"Brockton, Exhibition Place, Parkdale Village",Coffee Shop,Pet Store,Bakery,Gym,Breakfast Spot
4,Business Reply Mail Processing Centre 969 Eastern,Fast Food Restaurant,Farmers Market,Comic Shop,Brewery,Restaurant


## Cluster Neighborhoods

In [26]:
# import k-means from clustering stage
from sklearn.cluster import KMeans

In [27]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Nb', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([1, 3, 1, 1, 3, 1, 1, 1, 1, 1], dtype=int32)

In [28]:
toronto_merged = df_Toronto.rename(columns={'Neighborhood': 'Nb'})

# add clustering labels
venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(venues_sorted.set_index('Nb'), on='Nb')

toronto_merged.head()

Unnamed: 0,Postal Code,Borough,Nb,Latitude,Longitude,Cluster Labels,1st Popular Venue,2nd Popular Venue,3rd Popular Venue,4th Popular Venue,5th Popular Venue
37,M4E,East Toronto,The Beaches,43.676357,-79.293031,3,Other Great Outdoors,Trail,Neighborhood,Health Food Store,Pub
41,M4K,East Toronto,"Riverdale, The Danforth West",43.679557,-79.352188,3,Greek Restaurant,Ice Cream Shop,Brewery,Italian Restaurant,Cosmetics Shop
42,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572,3,Fish & Chips Shop,Park,Burger Joint,Gym,Pet Store
43,M4M,East Toronto,Studio District,43.659526,-79.340923,1,Bookstore,Bakery,Coffee Shop,Café,Sandwich Place
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,2,Bus Line,Swim School,Park,Yoga Studio,Fish & Chips Shop


In [29]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

In [30]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Nb'],
                                  toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## Examine Clusters

In [31]:
# Cluster 1
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Popular Venue,2nd Popular Venue,3rd Popular Venue,4th Popular Venue,5th Popular Venue
63,Central Toronto,0,Health & Beauty Service,Garden,Home Service,Yoga Studio,Fish Market


In [32]:
# Cluster 2
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Popular Venue,2nd Popular Venue,3rd Popular Venue,4th Popular Venue,5th Popular Venue
43,East Toronto,1,Bookstore,Bakery,Coffee Shop,Café,Sandwich Place
45,Central Toronto,1,Park,Clothing Store,Sandwich Place,Food & Drink Shop,Hotel
46,Central Toronto,1,Yoga Studio,Sporting Goods Shop,Chinese Restaurant,Dessert Shop,Mexican Restaurant
47,Central Toronto,1,Dessert Shop,Coffee Shop,Park,Italian Restaurant,Pizza Place
49,Central Toronto,1,Coffee Shop,Supermarket,Restaurant,Pub,Fried Chicken Joint
51,Downtown Toronto,1,Café,Jewelry Store,Bakery,General Entertainment,Japanese Restaurant
52,Downtown Toronto,1,Gastropub,Dance Studio,Salon / Barbershop,Restaurant,Ramen Restaurant
53,Downtown Toronto,1,Breakfast Spot,Bakery,Spa,Restaurant,Pub
54,Downtown Toronto,1,Music Venue,Pizza Place,Ramen Restaurant,Burrito Place,Plaza
55,Downtown Toronto,1,Gastropub,Restaurant,Coffee Shop,Gym,Cosmetics Shop


In [33]:
# Cluster 3
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Popular Venue,2nd Popular Venue,3rd Popular Venue,4th Popular Venue,5th Popular Venue
44,Central Toronto,2,Bus Line,Swim School,Park,Yoga Studio,Fish & Chips Shop


In [34]:
# Cluster 4
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Popular Venue,2nd Popular Venue,3rd Popular Venue,4th Popular Venue,5th Popular Venue
37,East Toronto,3,Other Great Outdoors,Trail,Neighborhood,Health Food Store,Pub
41,East Toronto,3,Greek Restaurant,Ice Cream Shop,Brewery,Italian Restaurant,Cosmetics Shop
42,East Toronto,3,Fish & Chips Shop,Park,Burger Joint,Gym,Pet Store
59,Downtown Toronto,3,Park,Supermarket,Performing Arts Venue,Plaza,Lake
64,Central Toronto,3,Bus Line,Trail,Sushi Restaurant,Jewelry Store,Cosmetics Shop
68,Downtown Toronto,3,Airport Lounge,Airport,Harbor / Marina,Plane,Bar
77,West Toronto,3,Greek Restaurant,Cocktail Bar,Bar,Ice Cream Shop,Brewery
87,East Toronto,3,Fast Food Restaurant,Farmers Market,Comic Shop,Brewery,Restaurant


In [35]:
# Cluster 5
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Popular Venue,2nd Popular Venue,3rd Popular Venue,4th Popular Venue,5th Popular Venue
48,Central Toronto,4,Playground,Tennis Court,Park,Yoga Studio,Fast Food Restaurant
50,Downtown Toronto,4,Park,Trail,Playground,Yoga Studio,Fast Food Restaurant
