# Scraping and Clustering

In [192]:
# Importing libraries

import json
import os

import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from pandas.io.json import json_normalize
from sklearn.cluster import KMeans

I used BeautifulSoup to scrape Wikipedia for the information needed to fill my data frame.

In [193]:
# Scraping Wikipedia for Toronto neighborhood information.
# A timeout and an explicit status check make a hung connection or an HTTP error
# page fail here instead of being parsed as if it were the postal-code table.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
wikipedia = response.text

soup = BeautifulSoup(wikipedia, 'lxml')

# The first <tbody> on the page is used; its <td> cells are read as a flat sequence of
# (postal code, borough, neighborhood) triples.
# NOTE(review): assumes the first table is the postal-code table with exactly three
# cells per row - confirm against the current page layout.
table_body = soup.find('tbody')
if table_body is None:
    raise ValueError('No <tbody> element found in the downloaded page')
vals = table_body.find_all('td')

# Surrounding whitespace is removed from every cell so that later comparisons
# (e.g. against 'Not assigned') and the merge on 'Postal Code' see clean values.
cells = [cell.text.strip() for cell in vals]

# Creating the three Series in one step: cells at offsets 0, 1 and 2 of each triple
# belong to the postal code, borough and neighborhood columns respectively.
PostalCode = pd.Series(cells[0::3], name='Postal Code', dtype=object)
Borough = pd.Series(cells[1::3], name='Borough', dtype=object)
Neighborhood = pd.Series(cells[2::3], name='Neighborhood', dtype=object)
        


Here I created the data frame and started to clean it

In [194]:
# Assemble the three scraped columns side by side into one table.
scraped = pd.concat([PostalCode, Borough, Neighborhood], axis=1)

# Keep only the rows whose Borough is assigned, then renumber the rows from 0.
has_borough = scraped['Borough'].ne('Not assigned')
df = scraped.loc[has_borough].reset_index(drop=True)

I continued cleaning the data

In [195]:
# Finding the repeated postal codes: labels of the rows that get folded into a
# later row carrying the same postal code.
repeated_postal_codes = df.index[df['Postal Code'].duplicated(keep='last')].tolist()

# Collapsing every postal code into a single row.
# The neighborhoods of all rows sharing a code are joined with ', ' in their original
# order and the first borough listed for the code is kept; sort=False keeps the codes
# in order of first appearance. A groupby is used instead of assigning through
# df.iloc[i][column], because that chained form writes to a temporary row object and
# is not guaranteed to change df itself.
df = (
    df.groupby('Postal Code', sort=False, as_index=False)
      .agg({'Borough': 'first', 'Neighborhood': ', '.join})
)

In [196]:
# Resetting the index so the row labels run from 0 without gaps
df = df.reset_index(drop=True)

# Replacing the not assigned values: where no neighborhood is assigned, fall back to
# the borough name. The boolean mask covers every row (including the last one) and
# .loc writes into df itself rather than into a temporary row copy.
not_assigned = df['Neighborhood'] == 'Not assigned'
df.loc[not_assigned, 'Neighborhood'] = df.loc[not_assigned, 'Borough']
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


In [197]:
# Row and column count of the cleaned table
df.shape

(103, 3)

# Getting the geospatial data and adding it to our data frame

In [198]:
# Reading the geospatial data: a CSV that pd.read_csv loads straight from the URL;
# the preview below shows its columns (Postal Code, Latitude, Longitude)
source = 'https://cocl.us/Geospatial_data'
gdf = pd.read_csv(source)
gdf.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [199]:
# Attach the coordinates to each row by matching on the shared 'Postal Code' column
# (inner join: postal codes present in only one of the two tables are left out).
df = df.merge(gdf, how='inner', on='Postal Code')

In [200]:
# Preview the merged table, which now carries Latitude and Longitude
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494


In [201]:
# Number of rows per borough, most frequent first
df['Borough'].value_counts()

North York          24
Downtown Toronto    18
Scarborough         17
Etobicoke           12
Central Toronto      9
West Toronto         6
York                 5
East York            5
East Toronto         5
Queen's Park         1
Mississauga          1
Name: Borough, dtype: int64

There are a lot of neighborhoods. Let's keep only the ones whose borough name includes "Toronto".

In [202]:
# Keep only the rows whose borough name contains the literal text 'Toronto'.
# regex=False makes this a plain substring test; na=False treats a missing borough
# as "does not contain".
in_toronto = df['Borough'].str.contains('Toronto', regex=False, na=False)

# Labels of the rows that are removed
boroughs_to_delete = df.index[~in_toronto].tolist()
df = df[in_toronto]

# Resetting index. drop=True is deliberately NOT used: the old row labels are kept
# as an 'index' column, which a later cell removes by name.
df = df.reset_index()


# Mapping Toronto's Neighborhoods

In [203]:
# Latitude and longitude of Toronto (used as the map centre)
lat = 43.6532
long = -79.3832

# Plotting a map of Toronto
toronto_map = folium.Map(location=[lat, long], zoom_start=10)

# Adding one circle marker per row of df; the popup reads "<neighborhood>, <borough>".
# The loop uses its own coordinate names so the map-centre lat/long above are not
# overwritten by the last row's coordinates.
for point_lat, point_long, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood']):
    label = f'{neighborhood}, {borough}'
    # parse_html=True belongs to the Popup: the label is treated as text, not markup
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [point_lat, point_long],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(toronto_map)

toronto_map



# Using Foursquare to Explore Christie, Downtown Toronto

In [204]:
# Setting up my Foursquare credentials.
# They are read from environment variables so the secrets are not stored in the
# notebook; a missing variable raises KeyError here, before any request is made.
client_id = os.environ['FOURSQUARE_CLIENT_ID']
client_secret = os.environ['FOURSQUARE_CLIENT_SECRET']

# Coordinates of Christie.
# NOTE(review): relies on row label 6 of df being Christie - confirm if the
# upstream tables change.
latitude = df.loc[6, 'Latitude']
longitude = df.loc[6, 'Longitude']
radius = 1000  # value sent as the API's `radius` parameter
limit = 10     # value sent as the API's `limit` parameter

# Getting the top 10 venues in a 1000m radius from Christie.
# The query string is built by requests from `params`, which URL-encodes each value.
url = 'https://api.foursquare.com/v2/venues/explore'
response = requests.get(
    url,
    params={
        'client_id': client_id,
        'client_secret': client_secret,
        'v': '20180605',
        'll': f'{latitude},{longitude}',
        'radius': radius,
        'limit': limit,
    },
    timeout=30,
)
# Surface HTTP errors (bad credentials, quota exceeded, ...) instead of parsing them
response.raise_for_status()
results = response.json()


Cleaning the json file

In [205]:
def get_category_type(row):
    """Return the name of the first category of a venue record, or None.

    Parameters
    ----------
    row : mapping-like (e.g. dict or pandas Series)
        The category list is looked up under 'categories' first and, if that
        key is absent, under 'venue.categories'. The value is expected to be a
        list of dicts that each carry a 'name' entry.

    Returns
    -------
    The 'name' of the first category, or None when the category list is empty
    or None.

    Raises
    ------
    KeyError
        If neither 'categories' nor 'venue.categories' is present.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; any other error is a real
        # problem with `row` and is allowed to propagate.
        categories_list = row['venue.categories']

    if not categories_list:
        return None
    return categories_list[0]['name']


# The venue records are the items of the first group in the response
venues = results['response']['groups'][0]['items']

# Flatten the nested JSON into one column per dotted path (e.g. 'venue.location.lat').
# The top-level pd.json_normalize is used because the pandas.io.json alias is deprecated.
nearby_venues = pd.json_normalize(venues)

# filter columns; .copy() makes the selection an independent frame, so the column
# assignment below does not write into a slice of the flattened table
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = nearby_venues.loc[:, filtered_columns].copy()

# filter the category for each row: replace the category list by its first category's name
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns: keep only the last dotted component ('venue.location.lat' -> 'lat')
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]


These are the top 10 venues near Christie

In [206]:
# Display the cleaned venue table
nearby_venues

Unnamed: 0,name,categories,lat,lng
0,Fiesta Farms,Grocery Store,43.668471,-79.420485
1,Contra Cafe,Café,43.669107,-79.426105
2,Christie Pits Park,Park,43.664177,-79.420466
3,Vinny’s Panini,Italian Restaurant,43.670679,-79.426148
4,Starbucks,Coffee Shop,43.671585,-79.421366
5,Banjara Indian Cuisine,Indian Restaurant,43.662916,-79.421911
6,Actinolite,Restaurant,43.667858,-79.428054
7,Northwood,Cocktail Bar,43.662715,-79.422164
8,Hodo Kwaja 호도과자,Dessert Shop,43.66424,-79.415579
9,Buk Chang Dong Soon Tofu,Korean Restaurant,43.663842,-79.417093


# Clustering Toronto
Let's take a look at the city

In [207]:
# Display the neighborhood marker map again
toronto_map

Defining a function to get nearby venues for all the neighborhoods

In [208]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, limit=100):
    """Query the Foursquare 'explore' endpoint around each location and tabulate the venues.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Location name and coordinates; iterated in lockstep with zip(), so
        iteration stops at the shortest of the three.
    radius : number, default 500
        Sent as the API's `radius` parameter.
    limit : int, default 100
        Sent as the API's `limit` parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame
        has these columns and zero rows when no venue is returned.
        'Venue Category' is None for a venue without categories.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.

    Notes
    -----
    Reads the notebook-level `client_id` and `client_secret` variables.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': client_id,
                'client_secret': client_secret,
                'v': '20180605',
                'll': f'{lat},{lng}',
                'radius': radius,
                'limit': limit,
            },
            timeout=30,
        )
        # Fail on HTTP errors instead of indexing into an error payload
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        for item in results:
            venue = item['venue']
            categories = venue['categories']
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue may carry an empty category list
                categories[0]['name'] if categories else None))

    # Passing the column names to the constructor keeps the header intact
    # even when `rows` is empty.
    nearby_venues = pd.DataFrame(rows, columns=columns)

    return nearby_venues

Running the function and placing its result into a dataframe

In [209]:
# Collect the venues around every neighborhood listed in df
toronto_venues = pd.DataFrame(
    getNearbyVenues(
        names=df['Neighborhood'],
        latitudes=df['Latitude'],
        longitudes=df['Longitude'],
        radius=500,
    )
)

In [210]:
# (number of venue rows, number of columns)
toronto_venues.shape

(1697, 7)

There are 1697 venues

In [211]:
# Preview the first venue rows
toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Harbourfront, Regent Park",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,"Harbourfront, Regent Park",43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,"Harbourfront, Regent Park",43.65426,-79.360636,Toronto Cooper Koo Family Cherry St YMCA Centre,43.653191,-79.357947,Gym / Fitness Center
3,"Harbourfront, Regent Park",43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
4,"Harbourfront, Regent Park",43.65426,-79.360636,Morning Glory Cafe,43.653947,-79.361149,Breakfast Spot


Let us prepare the data frame for clustering by encoding it

In [212]:
# One-hot encode the venue categories: one row per venue, one indicator column
# per distinct 'Venue Category' value
toronto_encoded = pd.get_dummies(toronto_venues['Venue Category'])

In [213]:
# At this point every column is a venue category. A category that is itself called
# 'Neighborhood' would clash with the neighborhood-name column inserted next, so it
# is dropped; errors='ignore' keeps the cell from failing when no venue has it.
toronto_encoded = toronto_encoded.drop(columns=['Neighborhood'], errors='ignore')

In [214]:
# Adding the neighborhood column to the start of the dataframe
# Adding the neighborhood column to the start of the dataframe so every one-hot row
# is labelled with the neighborhood of its venue (insert() modifies toronto_encoded in place)
toronto_encoded.insert(loc = 0, column = 'Neighborhood', value = toronto_venues['Neighborhood'])

In [215]:
# Preview the one-hot encoded table
toronto_encoded.head()

Unnamed: 0,Neighborhood,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,"Harbourfront, Regent Park",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Harbourfront, Regent Park",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Harbourfront, Regent Park",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Harbourfront, Regent Park",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Harbourfront, Regent Park",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Grouping the dataframe by Neighborhood and taking the mean of the frequency of each category

In [216]:
# Average every one-hot column within each neighborhood; as_index=False keeps
# 'Neighborhood' as an ordinary column instead of turning it into the index.
toronto_grouped = toronto_encoded.groupby('Neighborhood', as_index=False).mean()
toronto_grouped


Unnamed: 0,Neighborhood,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,"Adelaide, King, Richmond",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,...,0.0,0.0,0.01,0.0,0.0,0.0,0.01,0.0,0.0,0.0
1,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.018182,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Brockton, Exhibition Place, Parkdale Village",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047619
3,Business Reply Mail Processing Centre 969 Eastern,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052632
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",0.0,0.0,0.066667,0.066667,0.066667,0.133333,0.133333,0.133333,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,"Cabbagetown, St. James Town",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011364,...,0.0,0.0,0.011364,0.0,0.011364,0.0,0.011364,0.0,0.0,0.011364
7,"Chinatown, Grange Park, Kensington Market",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.06,0.0,0.0,0.03,0.01,0.0,0.0,0.0
8,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Church and Wellesley,0.011494,0.011494,0.0,0.0,0.0,0.0,0.0,0.0,0.011494,...,0.0,0.0,0.0,0.011494,0.0,0.011494,0.0,0.011494,0.0,0.011494


We choose an arbitrary number of 3 clusters (`k = 3`) and we perform the clustering

In [217]:
k = 3
# We remove the Neighborhood column to perform clustering on the data set.
# 'Cluster Labels' is removed as well when present: a later cell inserts it into
# toronto_grouped, and on a re-run of this cell it must not be used as a feature.
toronto_clustered = toronto_grouped.drop(columns=['Neighborhood', 'Cluster Labels'], errors='ignore')
# We perform the clustering. n_init is set explicitly so the number of restarts does
# not depend on the installed scikit-learn version; random_state fixes the seeding.
kmeans = KMeans(n_clusters=k, random_state=0, n_init=10).fit(toronto_clustered)
len(kmeans.labels_)

38

We insert the cluster labels into the data frame and we merge the two dataframes together

In [218]:
# Put the cluster assignment in front of the per-neighborhood frequencies
toronto_grouped.insert(loc=0, column='Cluster Labels', value=kmeans.labels_)

# Look every row of df up in that table by its neighborhood name
grouped_by_name = toronto_grouped.set_index('Neighborhood')
toronto_merged = df.join(grouped_by_name, on='Neighborhood')

In [219]:
# Drop the leftover 'index' column (old row labels kept by an earlier reset_index()).
# errors='ignore' lets this cell be re-run after the column is already gone.
toronto_merged = toronto_merged.drop(columns=['index'], errors='ignore')
toronto_merged

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.020833
1,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.01,0.01,0.0,0.01,0.01,0.0,0.0,0.0
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.01,0.0
3,M4E,East Toronto,The Beaches,43.676357,-79.293031,0,0.0,0.0,0.0,0.0,...,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.018182,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.011364,0.0,0.011364,0.0,0.011364,0.0,0.0,0.011364
6,M6G,Downtown Toronto,Christie,43.669542,-79.422564,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,M5H,Downtown Toronto,"Adelaide, King, Richmond",43.650571,-79.384568,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.01,0.0,0.0,0.0,0.01,0.0,0.0,0.0
8,M6H,West Toronto,"Dovercourt Village, Dufferin",43.669005,-79.442259,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,M5J,Downtown Toronto,"Harbourfront East, Toronto Islands, Union Station",43.640816,-79.381752,0,0.0,0.0,0.0,0.0,...,0.0,0.01,0.01,0.0,0.0,0.0,0.01,0.0,0.0,0.0


Now we finally map the clusters

In [220]:
# Latitude and longitude of Toronto (used as the map centre)
lat = 43.6532
long = -79.3832
map_ = folium.Map(location=[lat, long], zoom_start=11)

# One evenly spaced rainbow colour per cluster, converted to hex strings
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# Rows without a cluster label (no matching row in the clustered table) cannot be
# coloured and are left off the map. The labels are cast to int because a column
# holding any missing value is stored as float, which cannot index a list.
clustered = toronto_merged.dropna(subset=['Cluster Labels'])

# adding markers (own coordinate names, so the map-centre lat/long are not overwritten)
for marker_lat, marker_lon, poi, cluster in zip(clustered['Latitude'], clustered['Longitude'], clustered['Neighborhood'], clustered['Cluster Labels'].astype(int)):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [marker_lat, marker_lon],
        radius=5,
        popup=label,
        # labels run from 0 to k-1, so they index the palette directly
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_)

map_