# Analysing Location Data in Toronto to Identify a Suitable Place to Open a New Bakery
## Preparing the Dataframe

In [10]:
# !pip install pandas
# !pip install lxml
# !pip install folium
import pandas as pd
import numpy as np
import folium
import requests
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

In [11]:
dt = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')

In [12]:
df = dt[0]
df = df.set_index('Postal Code')

In [13]:
# removing rows with Borough = 'Not assigned'
df.drop(df[df.Borough == 'Not assigned'].index, inplace = True)

In [14]:
# dropping the created index
df.reset_index(inplace=True)
print(df.shape)
df.head()

(103, 3)


Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


## Addind Latitude, Longitude to the Dataframe

In [15]:
# expanding comma seperated neighbourhoods into new rows
df2 = df.copy()
df2 = df2.set_index('Postal Code').Neighbourhood.str.split(',', expand=True).stack().reset_index('Postal Code')
df2.rename(columns = {0: 'Neighbourhood'}, inplace=True)
df2 = df2.set_index('Postal Code')
df2.head()

df3 = df.copy()
df3 = df3.set_index('Postal Code')
df3 = df3.drop('Neighbourhood', axis=1)

df = df2.merge(df3, left_index=True, right_index=True, how='inner')

print(df.shape)
df.head()

(217, 2)


Unnamed: 0_level_0,Neighbourhood,Borough
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,Malvern,Scarborough
M1B,Rouge,Scarborough
M1C,Rouge Hill,Scarborough
M1C,Port Union,Scarborough
M1C,Highland Creek,Scarborough


In [16]:
# reading the csv with geo coords
coords = pd.read_csv('Geospatial_Coordinates.csv', index_col='Postal Code')

In [17]:
# merging two data frames based on the index
df = df.merge(coords, left_index=True, right_index=True, how='inner')
df.reset_index(inplace=True)
df.head()

Unnamed: 0,Postal Code,Neighbourhood,Borough,Latitude,Longitude
0,M1B,Malvern,Scarborough,43.806686,-79.194353
1,M1B,Rouge,Scarborough,43.806686,-79.194353
2,M1C,Rouge Hill,Scarborough,43.784535,-79.160497
3,M1C,Port Union,Scarborough,43.784535,-79.160497
4,M1C,Highland Creek,Scarborough,43.784535,-79.160497


### Exploring and Clustering the neighborhoods in Canada

In [18]:
latitude = 43.651070
longitude = -79.347015
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [19]:
# foursquare credentials
CLIENT_ID = '5TMWLT2ZWHACMKTUAJKG1OZEMMZD5NIL2TXTOCPX0F0CVG4X' # your Foursquare ID
CLIENT_SECRET = '3RPFHW5K3MGBMJXZV5EFXLZ5NMQPPFY5NQPENBTKHFMIK55H' # your Foursquare Secret
VERSION = '20210205' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 5TMWLT2ZWHACMKTUAJKG1OZEMMZD5NIL2TXTOCPX0F0CVG4X
CLIENT_SECRET:3RPFHW5K3MGBMJXZV5EFXLZ5NMQPPFY5NQPENBTKHFMIK55H


In [20]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
#         print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [21]:
toronto_venues = getNearbyVenues(names=df['Neighbourhood'], latitudes=df['Latitude'], longitudes=df['Longitude'])
print(toronto_venues.shape)
toronto_venues.head()

(4259, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Malvern,43.806686,-79.194353,Wendy’s,43.807448,-79.199056,Fast Food Restaurant
1,Rouge,43.806686,-79.194353,Wendy’s,43.807448,-79.199056,Fast Food Restaurant
2,Rouge Hill,43.784535,-79.160497,Great Shine Window Cleaning,43.783145,-79.157431,Home Service
3,Rouge Hill,43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
4,Port Union,43.784535,-79.160497,Great Shine Window Cleaning,43.783145,-79.157431,Home Service


In [22]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 270 uniques categories.


In [23]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,Accessories Store,Adult Boutique,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,American Restaurant,Antique Shop,...,Truck Stop,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()

#### Let's print each neighborhood along with the top 10 most common venues

In [25]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [26]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Adelaide,Coffee Shop,Café,Restaurant,Bakery,Gym,Clothing Store,Thai Restaurant,Deli / Bodega,Hotel,Burrito Place
1,Agincourt North,Playground,Park,Intersection,Women's Store,Distribution Center,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store
2,Albion Gardens,Grocery Store,Pharmacy,Fried Chicken Joint,Pizza Place,Sandwich Place,Beer Store,Fast Food Restaurant,Liquor Store,General Entertainment,Cuban Restaurant
3,Bathurst Quay,Airport Service,Airport Lounge,Coffee Shop,Rental Car Location,Sculpture Garden,Boat or Ferry,Bar,Airport Gate,Boutique,Airport
4,Beaumond Heights,Grocery Store,Pharmacy,Fried Chicken Joint,Pizza Place,Sandwich Place,Beer Store,Fast Food Restaurant,Liquor Store,General Entertainment,Cuban Restaurant


### Cluster Neighborhoods


In [27]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([1, 2, 1, 1, 1, 1, 2, 1, 1, 1], dtype=int32)

In [28]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)
toronto_merged = df.copy()

# merge manhattan_grouped with manhattan_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighbourhood')
toronto_merged.dropna(inplace=True)
toronto_merged['Cluster Labels'] = toronto_merged['Cluster Labels'].astype(int)

toronto_merged.head() # check the last columns!

Unnamed: 0,Postal Code,Neighbourhood,Borough,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Malvern,Scarborough,43.806686,-79.194353,4,Fast Food Restaurant,Women's Store,Dog Run,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Doner Restaurant
1,M1B,Rouge,Scarborough,43.806686,-79.194353,4,Fast Food Restaurant,Women's Store,Dog Run,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Doner Restaurant
2,M1C,Rouge Hill,Scarborough,43.784535,-79.160497,1,Bar,Home Service,Women's Store,Deli / Bodega,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Dog Run
3,M1C,Port Union,Scarborough,43.784535,-79.160497,1,Bar,Home Service,Women's Store,Deli / Bodega,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Dog Run
4,M1C,Highland Creek,Scarborough,43.784535,-79.160497,1,Bar,Home Service,Women's Store,Deli / Bodega,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Distribution Center,Dog Run


In [29]:
# showing clusters on the map

# create map
latitude = 43.651070
longitude = -79.347015
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### Examining Clusters for identifying what are the popular places in each cluster

#### Cluster 1

In [124]:
cs1 = toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(6, toronto_merged.shape[1]))]]
mask = np.column_stack([cs1[col].str.contains('Bakery', na=False) for col in cs1])
bkNeighbourhoods = list(cs1.loc[mask.any(axis=1)].Neighbourhood)

#### Cluster 2

In [125]:
cs2 = toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(6, toronto_merged.shape[1]))]]
mask = np.column_stack([cs2[col].str.contains('Bakery', na=False) for col in cs2])
bkNeighbourhoods.extend(list(cs2.loc[mask.any(axis=1)].Neighbourhood))

#### Cluster 3

In [126]:
cs3 = toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(6, toronto_merged.shape[1]))]]
mask = np.column_stack([cs3[col].str.contains('Bakery', na=False) for col in cs3])
bkNeighbourhoods.extend(list(cs3.loc[mask.any(axis=1)].Neighbourhood))

#### Cluster 4

In [127]:
cs4 = toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(6, toronto_merged.shape[1]))]]
mask = np.column_stack([cs4[col].str.contains('Bakery', na=False) for col in cs4])
bkNeighbourhoods.extend(list(cs4.loc[mask.any(axis=1)].Neighbourhood))

#### Cluster 5

In [128]:
cs5 = toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(6, toronto_merged.shape[1]))]]
mask = np.column_stack([cs5[col].str.contains('Bakery', na=False) for col in cs5])
bkNeighbourhoods.extend(list(cs5.loc[mask.any(axis=1)].Neighbourhood))

### Following are the neighbourhoods where there are bakeries within top 10 most common venues

#### Therefore, when opening a new Bakery, its advised to locate a place excluding the following neighbourhoods

In [132]:
bkNeighbourhoods = pd.DataFrame(bkNeighbourhoods)
bkNeighbourhoods.rename(columns = {0: 'Neighbourhood'}, inplace=True)
bkNeighbourhoods

Unnamed: 0,Neighbourhood
0,Cedarbrae
1,Golden Mile
2,Clairlea
3,Oakridge
4,Wexford
5,Maryvale
6,Fairview
7,Henry Farm
8,Oriole
9,Studio District
