# Capstone Project Notebook

This notebook will be used for the Capstone Project course.

---

In [229]:
!pip install folium

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes


In [230]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
from sklearn.cluster import KMeans
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors

### Scraping the Neighbourhood data from wikipedia

In [231]:
wiki = requests.get("https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&oldid=1012118802") 

soup = BeautifulSoup(wiki.content,'lxml') 

### Reading the data from the table into a dataframe

In [232]:
table = soup.find_all('table')[0]  

data = pd.read_html(str(table)) 

neighbourhood=pd.DataFrame(data[0])

### Dropping all rows where the Borough is 'Not assigned'

In [233]:
neighbourhood = neighbourhood[neighbourhood['Borough'] != 'Not assigned']

### Replacing rows where Neighbourhood is 'Not assigned' with the name of the Borough

In [234]:
neighbourhood.loc[neighbourhood["Neighbourhood"]=='Not assigned', "Neighbourhood"] = neighbourhood["Borough"]

In [235]:
neighbourhood.shape

(103, 3)

### The first DataFrame

In [236]:
neighbourhood

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"
11,M3B,North York,Don Mills
12,M4B,East York,"Parkview Hill, Woodbine Gardens"
13,M5B,Downtown Toronto,"Garden District, Ryerson"


### Importing the Geospatial_Coordinates csv file

In [237]:
import os, types
import pandas as pd
from botocore.client import Config
import ibm_boto3

def __iter__(self): return 0

# @hidden_cell
# The following code accesses a file in your IBM Cloud Object Storage. It includes your credentials.
# You might want to remove those credentials before you share the notebook.

if os.environ.get('RUNTIME_ENV_LOCATION_TYPE') == 'external':
    endpoint_c8133f94b0404133a96a1b965250e43a = 'https://s3-api.us-geo.objectstorage.softlayer.net'
else:
    endpoint_c8133f94b0404133a96a1b965250e43a = 'https://s3-api.us-geo.objectstorage.service.networklayer.com'

client_c8133f94b0404133a96a1b965250e43a = ibm_boto3.client(service_name='s3',
    ibm_api_key_id='AF9i48uzMISpDC7UAA5SzU39NOL-BuSCy-7l2v2L9iRl',
    ibm_auth_endpoint="https://iam.cloud.ibm.com/oidc/token",
    config=Config(signature_version='oauth'),
    endpoint_url=endpoint_c8133f94b0404133a96a1b965250e43a)

body = client_c8133f94b0404133a96a1b965250e43a.get_object(Bucket='dataanalysiswithpython-donotdelete-pr-fng9ingtkouq7b',Key='Geospatial_Coordinates.csv')['Body']
# add missing __iter__ method, so pandas accepts body as file-like object
if not hasattr(body, "__iter__"): body.__iter__ = types.MethodType( __iter__, body )

coordinates = pd.read_csv(body)

### Merging the Neighbourhood dataframe with the coordinates dataframe

In [238]:
geo_neigh = pd.merge(neighbourhood, coordinates, how="left", on=["Postal Code"])

### The second DataFrame

In [239]:
geo_neigh

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


### Creating a map of North York:

(I chose to focus on the borough of North York)

In [240]:
northyork_data = geo_neigh[geo_neigh['Borough'] == 'North York'].reset_index(drop=True)

address = 'North York, Ontario'

geolocator = Nominatim(user_agent="c_p_@live.com")
location = geolocator.geocode(address)
northyork_latitude = location.latitude
northyork_longitude = location.longitude

map_northyork = folium.Map(location=[northyork_latitude, northyork_longitude], zoom_start=11)

for lat, lng, label in zip(northyork_data['Latitude'], northyork_data['Longitude'], northyork_data['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_northyork)  
    
map_northyork

### Create a function to get all the venues of each neighbourhood

In [241]:
CLIENT_ID = 'X4XPLSQEZRMXHBPPV35XZNTYQZHSWFIKC4TTANTYS4WZIWXL'
CLIENT_SECRET = 'AKFBTVRQTOWJ20LRD1IRTAADWGCLGFBL31X2T5JY0WR3GPNP'
VERSION = '20180605'
LIMIT = 100 

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
            
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighbourhood Latitude', 
                  'Neighbourhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [242]:
northyork_venues = getNearbyVenues(northyork_data["Neighbourhood"], northyork_data["Latitude"], northyork_data["Longitude"])    

### One hot encoding so we can analyze the neighbourhoods

In [243]:
northyork_onehot = pd.get_dummies(northyork_venues[['Venue Category']], prefix="", prefix_sep="")

northyork_onehot['Neighbourhood'] = northyork_venues['Neighbourhood'] 

fixed_columns = [northyork_onehot.columns[-1]] + list(northyork_onehot.columns[:-1])
northyork_onehot = northyork_onehot[fixed_columns]

Group rows by neighbourhood and take the mean of the frequency of occurrence of each category

In [244]:
northyork_grouped = northyork_onehot.groupby('Neighbourhood').mean().reset_index()

Function to sort the venues in descending order

In [245]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Create a dataframe that shows the top 10 venues per neighbourhood

In [246]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

neighbourhood_venues_sorted = pd.DataFrame(columns=columns)
neighbourhood_venues_sorted['Neighbourhood'] = northyork_grouped['Neighbourhood']

for ind in np.arange(northyork_grouped.shape[0]):
    neighbourhood_venues_sorted.iloc[ind, 1:] = return_most_common_venues(northyork_grouped.iloc[ind, :], num_top_venues)

### Clustering

Run k means with k=6

In [247]:
kclusters = 6

northyork_grouped_clustering = northyork_grouped.drop('Neighbourhood', 1)

kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(northyork_grouped_clustering)

kmeans.labels_[0:10] 

array([3, 0, 3, 3, 3, 3, 2, 3, 1, 5], dtype=int32)

Create a dataframe that shows the top 10 venues and the cluster for each neighbourhood

In [248]:
neighbourhood_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

northyork_merged = northyork_data

northyork_merged = northyork_merged.join(neighbourhood_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

In [249]:
northyork_merged = northyork_merged.dropna()

### Mapping out the clusters

In [250]:
map_clusters = folium.Map(location=[northyork_latitude, northyork_longitude], zoom_start=11)

x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

markers_colors = []
for lat, lon, poi, cluster in zip(northyork_merged['Latitude'], northyork_merged['Longitude'], northyork_merged['Neighbourhood'], northyork_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(cluster-1)],
        fill=True,
        fill_color=rainbow[int(cluster-1)],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### Cluster 1

In [251]:
northyork_merged.loc[northyork_merged['Cluster Labels'] == 0, northyork_merged.columns[[1] + list(range(5, northyork_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
10,North York,0.0,Japanese Restaurant,Café,Bank,Chinese Restaurant,Women's Store,Clothing Store,Coffee Shop,Comfort Food Restaurant,Construction & Landscaping,Convenience Store


### Cluster 2

In [252]:
northyork_merged.loc[northyork_merged['Cluster Labels'] == 1, northyork_merged.columns[[1] + list(range(5, northyork_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
15,North York,1.0,Gym,Pizza Place,Home Service,Women's Store,Dessert Shop,Chocolate Shop,Clothing Store,Coffee Shop,Comfort Food Restaurant,Construction & Landscaping


### Cluster 3

In [253]:
northyork_merged.loc[northyork_merged['Cluster Labels'] == 2, northyork_merged.columns[[1] + list(range(5, northyork_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,North York,2.0,Park,Food & Drink Shop,Fast Food Restaurant,Women's Store,Department Store,Chocolate Shop,Clothing Store,Coffee Shop,Comfort Food Restaurant,Construction & Landscaping
4,North York,2.0,Italian Restaurant,Park,Bakery,Japanese Restaurant,Women's Store,Clothing Store,Coffee Shop,Comfort Food Restaurant,Construction & Landscaping,Convenience Store
14,North York,2.0,Park,Construction & Landscaping,Bakery,Basketball Court,Women's Store,Dim Sum Restaurant,Clothing Store,Coffee Shop,Comfort Food Restaurant,Convenience Store
22,North York,2.0,Park,Convenience Store,Women's Store,Dessert Shop,Chocolate Shop,Clothing Store,Coffee Shop,Comfort Food Restaurant,Construction & Landscaping,Cosmetics Shop


### Cluster 4

In [254]:
northyork_merged.loc[northyork_merged['Cluster Labels'] == 3, northyork_merged.columns[[1] + list(range(5, northyork_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,North York,3.0,Coffee Shop,Pizza Place,Hockey Arena,Portuguese Restaurant,Women's Store,Department Store,Chocolate Shop,Clothing Store,Comfort Food Restaurant,Construction & Landscaping
2,North York,3.0,Clothing Store,Gift Shop,Miscellaneous Shop,Boutique,Coffee Shop,Furniture / Home Store,Vietnamese Restaurant,Accessories Store,Supermarket,Deli / Bodega
3,North York,3.0,Gym,Restaurant,Clothing Store,Coffee Shop,Art Gallery,Grocery Store,Chinese Restaurant,Caribbean Restaurant,Café,Japanese Restaurant
5,North York,3.0,Gym,Restaurant,Clothing Store,Coffee Shop,Art Gallery,Grocery Store,Chinese Restaurant,Caribbean Restaurant,Café,Japanese Restaurant
6,North York,3.0,Golf Course,Mediterranean Restaurant,Fast Food Restaurant,Dog Run,Pool,Women's Store,Deli / Bodega,Chocolate Shop,Clothing Store,Coffee Shop
7,North York,3.0,Coffee Shop,Bank,Gift Shop,Mobile Phone Shop,Bridal Shop,Pharmacy,Pizza Place,Ice Cream Shop,Deli / Bodega,Middle Eastern Restaurant
8,North York,3.0,Clothing Store,Coffee Shop,Fast Food Restaurant,Restaurant,Cosmetics Shop,Women's Store,Mobile Phone Shop,Japanese Restaurant,Juice Bar,Bank
9,North York,3.0,Caribbean Restaurant,Bar,Furniture / Home Store,Metro Station,Massage Studio,Coffee Shop,Miscellaneous Shop,Electronics Store,Dog Run,Fried Chicken Joint
11,North York,3.0,Grocery Store,Park,Discount Store,Gym / Fitness Center,Bus Stop,Business Service,Baseball Field,Shopping Mall,Bank,Liquor Store
13,North York,3.0,Grocery Store,Park,Discount Store,Gym / Fitness Center,Bus Stop,Business Service,Baseball Field,Shopping Mall,Bank,Liquor Store


### Cluster 5

In [255]:
northyork_merged.loc[northyork_merged['Cluster Labels'] == 4, northyork_merged.columns[[1] + list(range(5, northyork_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
16,North York,4.0,Gym,Women's Store,Dim Sum Restaurant,Chocolate Shop,Clothing Store,Coffee Shop,Comfort Food Restaurant,Construction & Landscaping,Convenience Store,Cosmetics Shop


### Cluster 6

In [256]:
northyork_merged.loc[northyork_merged['Cluster Labels'] == 5, northyork_merged.columns[[1] + list(range(5, northyork_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
19,North York,5.0,Furniture / Home Store,Baseball Field,Women's Store,Dim Sum Restaurant,Clothing Store,Coffee Shop,Comfort Food Restaurant,Construction & Landscaping,Convenience Store,Cosmetics Shop
