 ## Capstone Project Course - Week 3

_This notebook will be mainly used for the capstone project._

In [1]:
#dependencies
import os

import numpy as np
import pandas as pd

In [2]:
# Load the postal-code table: read every HTML table on the Wikipedia page
# and keep the first one.
file_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
wiki_tables = pd.read_html(file_url)
df = wiki_tables[0]
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Task 1

In [3]:
# Keep every row that has at least one of Borough / Neighbourhood assigned,
# i.e. drop only the rows where BOTH columns read 'Not assigned'.
has_borough = df['Borough'] != 'Not assigned'
has_neighbourhood = df['Neighbourhood'] != 'Not assigned'
df = df[has_borough | has_neighbourhood]
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


In [4]:
# Count the neighbourhood rows behind each postcode, most repeated first,
# to see which postcodes still span several rows.
rows_per_postcode = df.groupby('Postcode')['Neighbourhood'].count()
rows_per_postcode.sort_values(ascending=False)

Postcode
M9V    8
M8Y    8
M5V    7
M9B    5
M4V    5
      ..
M6C    1
M4A    1
M3N    1
M3M    1
M9W    1
Name: Neighbourhood, Length: 103, dtype: int64

In [5]:
#intermediate dataframe
# One row per postcode, with all of its neighbourhood names joined by ', '.
# groupby(..., sort=False) keeps the postcodes in order of first appearance
# (the same order df['Postcode'].unique() gives) and avoids re-scanning the
# whole frame once per postcode.
df_collapsed = (
    df.groupby('Postcode', sort=False)['Neighbourhood']
      .agg(', '.join)
      .reset_index()
)
df_collapsed

Unnamed: 0,Postcode,Neighbourhood
0,M3A,Parkwoods
1,M4A,Victoria Village
2,M5A,Harbourfront
3,M6A,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park
...,...,...
98,M8X,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Church and Wellesley
100,M7Y,Business Reply Mail Processing Centre 969 Eastern
101,M8Y,"Humber Bay, King's Mill Park, Kingsway Park So..."


In [6]:
# Transform the dataframe to one row per postcode: take the distinct
# (Postcode, Borough) pairs and join them with the collapsed neighbourhood
# names on 'Postcode' (default inner join).
postcode_boroughs = df[['Postcode','Borough']].drop_duplicates()
df = postcode_boroughs.merge(df_collapsed, on='Postcode')
df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,Business Reply Mail Processing Centre 969 Eastern
101,M8Y,Etobicoke,"Humber Bay, King's Mill Park, Kingsway Park So..."


In [7]:
#'Not assigned' replaced
# Where the neighbourhood is 'Not assigned', fall back to the borough name.
# A single .loc assignment on a boolean mask replaces the former chained
# `df[col].iloc[i] = ...` writes, which trigger SettingWithCopyWarning and do
# not modify `df` at all when pandas Copy-on-Write is enabled.
not_assigned = df['Neighbourhood'] == 'Not assigned'
df.loc[not_assigned, 'Neighbourhood'] = df.loc[not_assigned, 'Borough']

#### Explanation

In this assignment I perform different tasks in a sequential way according to the guide:

1. First of all, the data set was loaded from the given url.
2. After that, rows where columns 'neighbourhoods' and 'borough' have the value 'Not assigned' simultaneously were dropped.
3. Then I checked for repeated postal codes in order to transform the data set so that it has no repeated Postcode rows, joining the neighbourhoods that share a postal code into a single row.
4. At the end, I replaced 'Not assigned' in the Neighbourhood column with the value of the associated Borough column.


In the next cell, I use the **.shape** attribute to show the number of rows (and columns) of my final dataframe.

In [8]:
# (number of rows, number of columns) of the cleaned postcode dataframe
df.shape

(103, 3)

### Task 2

In [9]:
#!pip install geocoder

In [10]:
#import geocoder # import geocoder

#postal_code= 'M5G'

# initialize your variable to None
#lat_lng_coords = None

# loop until you get the coordinates
#while(lat_lng_coords is None):
#    g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
#    lat_lng_coords = g.latlng

#latitude = lat_lng_coords[0]
#longitude = lat_lng_coords[1]

In [11]:
# Load the postcode -> latitude/longitude lookup table.
# The path is relative, so the CSV is read from the current working directory.
file_name = 'Geospatial_Coordinates.csv'
df_geo = pd.read_csv(filepath_or_buffer=file_name)
df_geo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [12]:
# Left-join the coordinates onto the postcode table so every postcode row is
# kept even when it has no match in the lookup. The lookup's key column is
# renamed first so both frames share 'Postcode' and no extra key column is
# left behind.
geo_by_postcode = df_geo.rename(columns={'Postal Code': 'Postcode'})
df = df.merge(geo_by_postcode, on='Postcode', how='left')
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494


### Task 3

In [13]:
import folium
#!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import requests # library to handle requests

#### Exploration of Neighbourhoods in Toronto

First of all, I decided to work only with boroughs that contain the word "Toronto".

In [14]:
# Toronto dataframe: keep only the four boroughs whose name contains
# "Toronto", then renumber the rows from 0.
toronto_boroughs = ['Downtown Toronto', 'East Toronto', 'West Toronto', 'Central Toronto']
in_toronto = df['Borough'].isin(toronto_boroughs)
df_toronto = df[in_toronto].reset_index(drop=True)

df_toronto.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
1,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494
2,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031


In [15]:
address = 'Toronto'

# Look up the coordinates used to centre the maps.
geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match; fail with a clear
# message instead of an AttributeError on `None.latitude`.
if location is None:
    raise ValueError('Could not geocode address {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

# map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add one circle marker per row of df_toronto, with the neighbourhood name as popup
for lat, lng, label in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

The geograpical coordinate of Toronto are 43.653963, -79.387207.


- **Up to 10 venues (the `LIMIT` set below) in each Toronto neighbourhood, within a radius of 500 meters**

In [16]:
# Foursquare credentials are read from the environment so that they are never
# stored in (or shared with) the notebook. Set FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # my Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # my Foursquare Secret
if not (CLIENT_ID and CLIENT_SECRET):
    # Warn right away instead of failing later inside the API calls.
    print('WARNING: FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
          'Foursquare requests will be rejected.')
VERSION = '20180605' # Foursquare API version
LIMIT = 10 # limit of number of venues returned by Foursquare API
radius = 500 # define radius

In [17]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Collect the venues Foursquare recommends around each given location.

    For every (name, latitude, longitude) triple, one request is sent to the
    Foursquare ``venues/explore`` endpoint, using the module-level
    ``CLIENT_ID``, ``CLIENT_SECRET``, ``VERSION`` and ``LIMIT`` settings.
    Each name is printed as it is processed.

    Parameters
    ----------
    names : iterable
        Label of each location; copied into the 'Neighbourhood' column.
    latitudes, longitudes : iterable
        Coordinates of each location, aligned with ``names``.
    radius : int, default 500
        Value sent as the ``radius`` query parameter of the request.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighbourhood',
        'Neighbourhood Latitude', 'Neighbourhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        When no venue is returned at all, the frame is empty but still has
        these seven columns.
    """
    columns = ['Neighbourhood',
               'Neighbourhood Latitude',
               'Neighbourhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            LIMIT)

        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            categories = venue['categories']
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue with an empty category list gets None instead of
                # aborting the whole collection with an IndexError
                categories[0]['name'] if categories else None))

    # Passing the column names to the constructor (rather than assigning
    # .columns afterwards) also works when no rows were collected.
    return pd.DataFrame(rows, columns=columns)

In [18]:
# Fetch the nearby venues for every Toronto neighbourhood
# (the default radius of getNearbyVenues is used).
Toronto_venues = getNearbyVenues(
    df_toronto['Neighbourhood'],
    df_toronto['Latitude'],
    df_toronto['Longitude'],
)
Toronto_venues.head()

Harbourfront
Queen's Park
Ryerson, Garden District
St. James Town
The Beaches
Berczy Park
Central Bay Street
Christie
Adelaide, King, Richmond
Dovercourt Village, Dufferin
Harbourfront East, Toronto Islands, Union Station
Little Portugal, Trinity
The Danforth West, Riverdale
Design Exchange, Toronto Dominion Centre
Brockton, Exhibition Place, Parkdale Village
The Beaches West, India Bazaar
Commerce Court, Victoria Hotel
Studio District
Lawrence Park
Roselawn
Davisville North
Forest Hill North, Forest Hill West
High Park, The Junction South
North Toronto West
The Annex, North Midtown, Yorkville
Parkdale, Roncesvalles
Davisville
Harbord, University of Toronto
Runnymede, Swansea
Moore Park, Summerhill East
Chinatown, Grange Park, Kensington Market
Deer Park, Forest Hill SE, Rathnelly, South Hill, Summerhill West
CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara
Rosedale
Stn A PO Boxes 25 The Esplanade
Cabbagetown, St. James Town
Fir

Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Harbourfront,43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,Harbourfront,43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,Harbourfront,43.65426,-79.360636,Cooper Koo Family YMCA,43.653191,-79.357947,Gym / Fitness Center
3,Harbourfront,43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
4,Harbourfront,43.65426,-79.360636,Impact Kitchen,43.656369,-79.35698,Restaurant


In [19]:
# How many distinct venue categories were returned
# (dropna=False so a missing category would be counted as well)
n_categories = Toronto_venues['Venue Category'].nunique(dropna=False)
print('There are {} uniques categories.'.format(n_categories))

There are 119 uniques categories.


In [20]:
# Number of venue rows returned for each neighbourhood
venue_counts = Toronto_venues.groupby('Neighbourhood').size()
venue_counts.rename('Venue_count').reset_index()

Unnamed: 0,Neighbourhood,Venue_count
0,"Adelaide, King, Richmond",10
1,Berczy Park,10
2,"Brockton, Exhibition Place, Parkdale Village",10
3,Business Reply Mail Processing Centre 969 Eastern,10
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",10
5,"Cabbagetown, St. James Town",10
6,Central Bay Street,10
7,"Chinatown, Grange Park, Kensington Market",10
8,Christie,10
9,Church and Wellesley,10


#### Exploration of each Neighbourhood in Toronto

In [21]:
# One-hot encode the venue categories (one indicator column per category),
# then add the neighbourhood name and move it to the front.
toronto_onehot = pd.get_dummies(Toronto_venues[['Venue Category']], prefix="", prefix_sep="")
toronto_onehot['Neighbourhood'] = Toronto_venues['Neighbourhood']
fixed_columns = toronto_onehot.columns.tolist()
fixed_columns = fixed_columns[-1:] + fixed_columns[:-1]
toronto_onehot = toronto_onehot[fixed_columns]

# One row per neighbourhood: the mean of each indicator column, i.e. the
# share of that neighbourhood's venues falling in each category.
toronto_grouped = toronto_onehot.groupby('Neighbourhood').mean().reset_index()

# Print the 5 most frequent venue categories of every neighbourhood.
num_top_venues = 5
for _, hood_row in toronto_grouped.iterrows():
    print("----"+hood_row['Neighbourhood']+"----")
    # everything after the first entry (the neighbourhood name) is a frequency
    freqs = hood_row.iloc[1:].astype(float).round(2)
    ranking = pd.DataFrame({'venue': freqs.index, 'freq': freqs.values})
    ranking = ranking.sort_values('freq', ascending=False).reset_index(drop=True)
    print(ranking.head(num_top_venues))
    print('\n')

----Adelaide, King, Richmond----
                           venue  freq
0                     Steakhouse   0.2
1                      Speakeasy   0.1
2                          Plaza   0.1
3                   Concert Hall   0.1
4  Vegetarian / Vegan Restaurant   0.1


----Berczy Park----
                           venue  freq
0              French Restaurant   0.1
1                   Concert Hall   0.1
2                   Liquor Store   0.1
3  Vegetarian / Vegan Restaurant   0.1
4                         Museum   0.1


----Brockton, Exhibition Place, Parkdale Village----
                venue  freq
0         Coffee Shop   0.2
1                 Gym   0.1
2      Breakfast Spot   0.1
3           Pet Store   0.1
4  Italian Restaurant   0.1


----Business Reply Mail Processing Centre 969 Eastern----
            venue  freq
0         Brewery   0.1
1   Garden Center   0.1
2      Comic Shop   0.1
3      Restaurant   0.1
4  Farmers Market   0.1


----CN Tower, Bathurst Quay, Island airport, Har

In [22]:
#Common Venue Dataframe
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the ``num_top_venues`` largest values in ``row``.

    The first entry of ``row`` is skipped (callers in this notebook pass rows
    whose first entry is the neighbourhood name); the remaining entries are
    ranked in descending order and the index labels of the leading ones are
    returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

num_top_venues = 5
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues:
# '1st', '2nd', '3rd', then '<n>th' for every later rank.
# The suffix is chosen with an explicit length check rather than a bare
# `except:` around an out-of-range index, which would hide unrelated errors.
columns = ['Neighbourhood']
for rank in range(1, num_top_venues + 1):
    suffix = indicators[rank - 1] if rank <= len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(rank, suffix))

# create a new dataframe: one row per neighbourhood of toronto_grouped,
# built in a single constructor call instead of cell-by-cell .iloc writes
top_venues = [
    return_most_common_venues(row, num_top_venues)
    for _, row in toronto_grouped.iterrows()
]
neighborhoods_venues_sorted = pd.DataFrame(
    top_venues, columns=columns[1:], index=toronto_grouped.index
)
neighborhoods_venues_sorted.insert(0, 'Neighbourhood', toronto_grouped['Neighbourhood'])

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,"Adelaide, King, Richmond",Steakhouse,Coffee Shop,Vegetarian / Vegan Restaurant,Opera House,Concert Hall
1,Berczy Park,Farmers Market,Tea Room,Concert Hall,Museum,Liquor Store
2,"Brockton, Exhibition Place, Parkdale Village",Coffee Shop,Pet Store,Café,Breakfast Spot,Gym
3,Business Reply Mail Processing Centre 969 Eastern,Farmers Market,Skate Park,Comic Shop,Fast Food Restaurant,Burrito Place
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",Airport Lounge,Airport,Bar,Harbor / Marina,Boutique


#### Cluster Neighborhoods

In [23]:
#clustering
from sklearn.cluster import KMeans
k = 5 # set number of clusters

# feature matrix: the per-category frequencies only (no neighbourhood label)
X = toronto_grouped.drop('Neighbourhood', axis=1)

# n_init is pinned explicitly: the library default changed from 10 to 'auto'
# in newer scikit-learn releases, so leaving it implicit makes the cluster
# labels depend on the installed version (and emits a FutureWarning on some).
cluster_model = KMeans(n_clusters=k, random_state=0, n_init=10)
cluster_model.fit(X)

# checking cluster labels generated for each row in the dataframe
cluster_model.labels_

array([1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 0, 1, 1, 2, 1,
       4, 2, 2, 1, 1, 1, 0, 3, 1, 1, 1, 2, 1, 1, 2, 2, 2])

In [24]:
# add clustering labels
# NOTE: toronto_clustering is the same object as neighborhoods_venues_sorted
# (no copy is made), so the 'cluster' column is visible under both names.
toronto_clustering = neighborhoods_venues_sorted
toronto_clustering['cluster'] = cluster_model.labels_

# Merge the top-venue columns and the cluster label into df_toronto, which
# carries the latitude/longitude of each neighbourhood. The default inner
# join keeps only neighbourhoods present in both frames.
# Columns added by a previous run of this cell are dropped first, so running
# the cell again does not produce '_x' / '_y' duplicates and lose the plain
# 'cluster' column.
merged_columns = [col for col in toronto_clustering.columns if col != 'Neighbourhood']
df_toronto = (
    df_toronto
    .drop(columns=merged_columns, errors='ignore')
    .merge(toronto_clustering, on='Neighbourhood')
)

In [25]:
# checking the last columns!
# (displays df_toronto so the top-venue and 'cluster' columns on the right can be inspected)
df_toronto 

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,cluster
0,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636,Spa,Historic Site,Breakfast Spot,Park,Farmers Market,1
1,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494,Coffee Shop,Yoga Studio,Italian Restaurant,Creperie,Portuguese Restaurant,1
2,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937,Café,Pizza Place,Tea Room,Clothing Store,Music Venue,1
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,Coffee Shop,Japanese Restaurant,Gastropub,Gym,Cosmetics Shop,1
4,M4E,East Toronto,The Beaches,43.676357,-79.293031,Trail,Pub,Health Food Store,Neighborhood,Yoga Studio,2
5,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,Farmers Market,Tea Room,Concert Hall,Museum,Liquor Store,2
6,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383,Coffee Shop,Park,Modern European Restaurant,Gastropub,Bubble Tea Shop,1
7,M6G,Downtown Toronto,Christie,43.669542,-79.422564,Café,Grocery Store,Restaurant,Italian Restaurant,Diner,1
8,M5H,Downtown Toronto,"Adelaide, King, Richmond",43.650571,-79.384568,Steakhouse,Coffee Shop,Vegetarian / Vegan Restaurant,Opera House,Concert Hall,1
9,M6H,West Toronto,"Dovercourt Village, Dufferin",43.669005,-79.442259,Bakery,Grocery Store,Café,Middle Eastern Restaurant,Bar,1


In [26]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: k evenly spaced colours of the rainbow
# colormap, as hex strings, so that rainbow[label] is the colour of a cluster
import matplotlib.cm as cm
import matplotlib.colors as colors
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Neighbourhood'], df_toronto['cluster']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Cluster labels start at 0, so they index the palette directly
    # (the former `cluster-1` sent label 0 to index -1, the last colour).
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters