# 3.3. Webscrape Toronto borough information from Wikipedia page

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the prepared postal-code table from the working directory.
# NOTE(review): the info() output further down lists an 'Unnamed: 0' column,
# which looks like a previously saved index read back as data. It is left in
# place because a later cell selects columns by position.
df = pd.read_csv('df_final.csv')

In [6]:
# Keep only the rows whose borough name mentions "Toronto".
# - regex=False: plain substring match, no regular-expression interpretation.
# - na=False: a missing borough counts as "no match"; without it the mask would
#   contain NaN and the boolean indexing would raise.
# - .copy(): df_toronto owns its data instead of being a slice of df.
# The original row labels are kept on purpose (no reset_index); a bare
# df.reset_index() call returns a new frame and changes nothing when its
# result is not assigned.
df_toronto = df[df['Borough'].str.contains('Toronto', regex=False, na=False)].copy()
print("Required Number:", df_toronto.shape[0])

Required Number: 39


In [7]:
# Preview the first five Toronto rows (head() defaults to n=5).
df_toronto.head()

Unnamed: 0.1,Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
2,2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
9,9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,19,M4E,East Toronto,The Beaches,43.676357,-79.293031
20,20,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306


In [8]:
# Print column names, non-null counts and dtypes of the Toronto subset.
df_toronto.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39 entries, 2 to 100
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    39 non-null     int64  
 1   PostalCode    39 non-null     object 
 2   Borough       39 non-null     object 
 3   Neighborhood  39 non-null     object 
 4   Latitude      39 non-null     float64
 5   Longitude     39 non-null     float64
dtypes: float64(2), int64(1), object(3)
memory usage: 2.1+ KB


#### Import libraries for map

In [9]:
import folium

### Creating map of Toronto using latitude and longitude values

In [12]:
# Centre the map on the mean coordinate of the Toronto postal codes.
# lat_mean / long_mean are reused by the cluster map further down.
lat_mean = df_toronto['Latitude'].mean()
long_mean = df_toronto['Longitude'].mean()
map_ = folium.Map(location=[lat_mean, long_mean], zoom_start=12)

# One circle marker per postal code, labelled "<neighborhood>, <borough>".
for lat, lng, borough, neighborhood in zip(
    df_toronto['Latitude'],
    df_toronto['Longitude'],
    df_toronto['Borough'],
    df_toronto['Neighborhood'],
):
    # parse_html is an option of Popup only; CircleMarker has no such
    # parameter, so it is not passed to the marker.
    popup = folium.Popup(f'{neighborhood}, {borough}', parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_)

# Last expression of the cell: renders the map inline.
map_

### Exploring venues in neighborhoods

In [14]:
# Read the venue listing from the hosted JSON file.
venues = pd.read_json(
    "https://raw.githubusercontent.com/ibm-developer-skills-network/yczvh-DataFilesForIBMProjects/master/segmenting_neighborhoods.json"
)

# Replace the headers of the file with readable names, in file column order.
venues.columns = [
    'Neighborhood',
    'Neighborhood Latitude',
    'Neighborhood Longitude',
    'Venue',
    'Venue Latitude',
    'Venue Longitude',
    'Venue Category',
]

# toronto_venues is a second name for the same object, not a copy.
toronto_venues = venues
toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Malvern, Rouge",43.806686,-79.194353,Wendy’s,43.807448,-79.199056,Fast Food Restaurant
1,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
2,"Guildwood, Morningside, West Hill",43.763573,-79.188711,RBC Royal Bank,43.76679,-79.191151,Bank
3,"Guildwood, Morningside, West Hill",43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store
4,"Guildwood, Morningside, West Hill",43.763573,-79.188711,Sail Sushi,43.765951,-79.191275,Restaurant


In [15]:
# Summary statistics of the numeric (coordinate) columns.
toronto_venues.describe()

Unnamed: 0,Neighborhood Latitude,Neighborhood Longitude,Venue Latitude,Venue Longitude
count,1337.0,1337.0,1337.0,1337.0
mean,43.683913,-79.394406,43.683911,-79.394235
std,0.045772,0.067328,0.045597,0.067369
min,43.602414,-79.615819,43.599152,-79.621765
25%,43.650571,-79.41975,43.649552,-79.419702
50%,43.66586,-79.38316,43.665303,-79.383893
75%,43.70906,-79.363452,43.710148,-79.361584
max,43.815252,-79.160497,43.815477,-79.163085


In [17]:
# Number of venues per neighborhood: count() tallies the non-null values of
# every column per group, and the first column's tally is printed.
print(
    toronto_venues
    .groupby('Neighborhood')
    .count()
    .iloc[:, 0]
)

Neighborhood
Agincourt                                           4
Alderwood, Long Branch                              8
Bathurst Manor, Wilson Heights, Downsview North    23
Bayview Village                                     4
Bedford Park, Lawrence Manor East                  22
                                                   ..
Willowdale West                                     5
Willowdale, Newtonbrook                             1
Woburn                                              4
Woodbine Heights                                    8
York Mills West                                     2
Name: Neighborhood Latitude, Length: 100, dtype: int64


In [19]:
# Per-neighborhood non-null counts of every column, first 50 neighborhoods.
(
    toronto_venues
    .groupby('Neighborhood')
    .count()
    .head(50)
)

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Agincourt,4,4,4,4,4,4
"Alderwood, Long Branch",8,8,8,8,8,8
"Bathurst Manor, Wilson Heights, Downsview North",23,23,23,23,23,23
Bayview Village,4,4,4,4,4,4
"Bedford Park, Lawrence Manor East",22,22,22,22,22,22
Berczy Park,30,30,30,30,30,30
"Birch Cliff, Cliffside West",5,5,5,5,5,5
"Brockton, Parkdale Village, Exhibition Place",24,24,24,24,24,24
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",17,17,17,17,17,17
Caledonia-Fairbanks,4,4,4,4,4,4


In [20]:
# Count the distinct values of 'Venue Category' (unique() keeps a missing
# value as one entry, if there is any).
print(f"The number of unique categories: {len(toronto_venues['Venue Category'].unique())}")

The number of unique categories: 241


### Top 10 most common venues

In [29]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of one row.

    Parameters
    ----------
    row : pandas.Series
        One row of a grouped frame. The first entry is skipped (the callers
        in this notebook keep the neighborhood name there); the remaining
        entries are the values to rank.
    num_top_venues : int
        Number of labels to return.

    Returns
    -------
    numpy.ndarray
        Index labels of the ranked entries, largest value first, cut to
        ``num_top_venues`` items (fewer when the row has fewer entries).
    """
    return (
        row.iloc[1:]
        .sort_values(ascending=False)
        .index.values[:num_top_venues]
    )

In [40]:
# How many of the most frequent venue categories to report per neighborhood.
num_top_venues = 10

# Ordinal suffixes for positions ending in 1, 2 and 3; everything else is 'th'.
indicators = ['st', 'nd', 'rd']

# Build the header row: 'Neighborhood' followed by one column per rank
# ('1st Most Common Venue', '2nd ...', ..., '10th ...').
columns = ['Neighborhood']
for position in range(1, num_top_venues + 1):
    last_digit = position % 10
    # 11, 12 and 13 (also 111, 112, ...) take 'th' despite their last digit.
    if position % 100 in (11, 12, 13) or not 1 <= last_digit <= len(indicators):
        suffix = 'th'
    else:
        suffix = indicators[last_digit - 1]
    columns.append(f'{position}{suffix} Most Common Venue')

In [41]:
# NOTE(review): no visible cell defines `toronto_venues_grouped`, so this cell
# raises NameError on a fresh kernel. It is rebuilt here as the mean one-hot
# frequency of every venue category per neighborhood, which fits how the frame
# is used below (first column 'Neighborhood', remaining columns numeric).
# Confirm this against the original preprocessing step.
toronto_onehot = pd.get_dummies(toronto_venues['Venue Category'], dtype=float)
# A venue category literally named 'Neighborhood' would collide with the key
# column that reset_index() restores; drop it if present.
toronto_onehot = toronto_onehot.drop(columns='Neighborhood', errors='ignore')
toronto_venues_grouped = (
    toronto_onehot
    .groupby(toronto_venues['Neighborhood'])
    .mean()
    .reset_index()
)

# Rank the categories of every neighborhood first, then build the frame in one
# go instead of filling an empty frame row by row through .iloc.
top_venues = [
    return_most_common_venues(toronto_venues_grouped.iloc[ind, :], num_top_venues)
    for ind in range(toronto_venues_grouped.shape[0])
]
neighborhoods_venues_sorted = pd.DataFrame(top_venues, columns=columns[1:])
neighborhoods_venues_sorted.insert(0, 'Neighborhood', toronto_venues_grouped['Neighborhood'])

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,Latin American Restaurant,Lounge,Skating Rink,Breakfast Spot,Women's Store,Deli / Bodega,Drugstore,Donut Shop,Dog Run,Distribution Center
1,"Alderwood, Long Branch",Pizza Place,Skating Rink,Pharmacy,Pub,Sandwich Place,Coffee Shop,Gym,Gas Station,Coworking Space,Diner
2,"Bathurst Manor, Wilson Heights, Downsview North",Coffee Shop,Bank,Diner,Bridal Shop,Supermarket,Restaurant,Sushi Restaurant,Ice Cream Shop,Middle Eastern Restaurant,Mobile Phone Shop
3,Bayview Village,Bank,Chinese Restaurant,Japanese Restaurant,Café,Women's Store,Deli / Bodega,Eastern European Restaurant,Drugstore,Donut Shop,Dog Run
4,"Bedford Park, Lawrence Manor East",Coffee Shop,Sandwich Place,Italian Restaurant,Greek Restaurant,Thai Restaurant,Liquor Store,Juice Bar,Indian Restaurant,Restaurant,Sushi Restaurant


## Clustering K-Means

In [42]:
# import k-means from clustering stage
from sklearn.cluster import KMeans

# number of clusters to fit
kclusters = 5

# Clustering input: every column except the neighborhood name.
# The axis is given through `columns=` because pandas 2.0 no longer accepts
# it as a second positional argument of DataFrame.drop.
toronto_venues_grouped_clustering = toronto_venues_grouped.drop(columns='Neighborhood')

In [43]:
# Fit k-means on the feature frame prepared above; random_state fixes the
# centroid initialisation.
# n_init is pinned to 10, the scikit-learn default before version 1.4 changed
# it to 'auto', so the labels do not depend on the installed version (and the
# FutureWarning of 1.2/1.3 is not triggered).
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(toronto_venues_grouped_clustering)

In [45]:
# Show the fitted estimator and its non-default parameters.
kmeans

KMeans(n_clusters=5, random_state=0)

### Create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood

In [32]:
# Attach the k-means label of every neighborhood as the first column.
# A 'Cluster Labels' column left over from an earlier run is dropped first:
# insert() raises ValueError when the column already exists, which made this
# cell fail when executed twice.
neighborhoods_venues_sorted = neighborhoods_venues_sorted.drop(columns='Cluster Labels', errors='ignore')
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# Add the cluster label and the top venues to every Toronto postal code by
# matching on 'Neighborhood'. join() returns a new frame, so df_toronto itself
# is not modified; rows without a match get NaN in the added columns.
toronto_merged = df_toronto.join(
    neighborhoods_venues_sorted.set_index('Neighborhood'),
    on='Neighborhood',
)
toronto_merged.head()

Unnamed: 0.1,Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,1,Coffee Shop,Park,Bakery,Breakfast Spot,Café,Greek Restaurant,Gym / Fitness Center,Pub,Performing Arts Venue,Mexican Restaurant
9,9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,1,Café,Theater,Clothing Store,Sporting Goods Shop,Hotel,Fast Food Restaurant,Steakhouse,Bakery,Ramen Restaurant,Music Venue
15,15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,1,Gastropub,Café,Farmers Market,Coffee Shop,Thai Restaurant,Diner,Jazz Club,Japanese Restaurant,Italian Restaurant,Restaurant
19,19,M4E,East Toronto,The Beaches,43.676357,-79.293031,1,Pub,Trail,Health Food Store,Dance Studio,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Women's Store,Cupcake Shop
20,20,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,1,Cocktail Bar,Coffee Shop,Beer Bar,Farmers Market,Seafood Restaurant,Café,Breakfast Spot,Liquor Store,Bistro,Comfort Food Restaurant


### Visualize the clusters

In [36]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map
map_ = folium.Map(location=[lat_mean, long_mean], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per
# cluster, converted to hex strings for folium
rainbow = [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, kclusters))]

# Rows that found no match in the join carry NaN as cluster label and cannot
# be coloured, so they are left off the map instead of breaking the loop.
clustered = toronto_merged.dropna(subset=['Cluster Labels'])

# add markers to the map
for lat, lon, poi, cluster in zip(
    clustered['Latitude'],
    clustered['Longitude'],
    clustered['Neighborhood'],
    clustered['Cluster Labels'],
):
    # The label column becomes float when the join introduced NaNs; a list
    # index has to be an int.
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # KMeans labels run from 0 to kclusters - 1, so they index the palette
    # directly (no `- 1`, which only worked through negative-index wraparound).
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7,
    ).add_to(map_)

map_

#### The map above visualizes the clusters; the table below lists the neighborhoods assigned to cluster 1

In [38]:
# Rows of cluster 1, restricted to the column at position 1 plus every column
# from position 5 onwards.
# NOTE(review): the selection is positional; in the rendered output it yields
# PostalCode followed by Longitude and the cluster/venue columns. Confirm that
# this is the intended set.
toronto_merged.loc[
    toronto_merged['Cluster Labels'] == 1,
    toronto_merged.columns[[1, *range(5, toronto_merged.shape[1])]],
]

Unnamed: 0,PostalCode,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,M5A,-79.360636,1,Coffee Shop,Park,Bakery,Breakfast Spot,Café,Greek Restaurant,Gym / Fitness Center,Pub,Performing Arts Venue,Mexican Restaurant
9,M5B,-79.378937,1,Café,Theater,Clothing Store,Sporting Goods Shop,Hotel,Fast Food Restaurant,Steakhouse,Bakery,Ramen Restaurant,Music Venue
15,M5C,-79.375418,1,Gastropub,Café,Farmers Market,Coffee Shop,Thai Restaurant,Diner,Jazz Club,Japanese Restaurant,Italian Restaurant,Restaurant
19,M4E,-79.293031,1,Pub,Trail,Health Food Store,Dance Studio,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Women's Store,Cupcake Shop
20,M5E,-79.373306,1,Cocktail Bar,Coffee Shop,Beer Bar,Farmers Market,Seafood Restaurant,Café,Breakfast Spot,Liquor Store,Bistro,Comfort Food Restaurant
24,M5G,-79.387383,1,Coffee Shop,Italian Restaurant,Café,Yoga Studio,Thai Restaurant,Department Store,Sandwich Place,Spa,Japanese Restaurant,Bubble Tea Shop
25,M6G,-79.422564,1,Grocery Store,Café,Park,Baby Store,Candy Store,Coffee Shop,Italian Restaurant,Nightclub,Restaurant,Deli / Bodega
30,M5H,-79.384568,1,Coffee Shop,Café,Seafood Restaurant,Thai Restaurant,Smoke Shop,Lounge,Bakery,Steakhouse,Hotel,Fast Food Restaurant
31,M6H,-79.442259,1,Pharmacy,Bakery,Park,Pool,Brewery,Bar,Bank,Supermarket,Café,Middle Eastern Restaurant
35,M4J,-79.338106,1,Pizza Place,Park,Convenience Store,Intersection,Dance Studio,Eastern European Restaurant,Drugstore,Donut Shop,Dog Run,Distribution Center
