## Importing all required packages.

In [None]:
# Importing required packages.
import os

import pandas as pd
import numpy as np
#import pgeocode
from geopy.geocoders import Nominatim
#!conda install -c conda-forge folium=0.5.0 --yes
import folium
import requests # library to handle requests
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

## Making the DataFrame.


In [29]:
# Read the HTML tables from the Wikipedia page and build a DataFrame out of the first one.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
dfs = pd.read_html(url)  # read_html returns a list of DataFrames, one per table found
# NOTE(review): assumes the postal-code table is the first table on the page — confirm if the page layout changes.
Neighbourhood = pd.DataFrame(dfs[0])

# Display the first ten rows.
Neighbourhood.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
7,M8A,Not assigned,Not assigned
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"


Let's replace the string 'Not assigned' with __NaN__ so that the affected rows can be dropped from the DataFrame.

In [None]:
# Treat the 'Not assigned' placeholder as missing data and drop every row that
# contains it. `np.nan` is used because the `np.NaN` alias no longer exists in
# NumPy 2.0. Re-assigning the result (instead of mutating with inplace=True)
# keeps the cell safe to re-run.
Neighbourhood = Neighbourhood.replace('Not assigned', np.nan).dropna()

# Display the cleaned result.
Neighbourhood.head(10)

Let's see how many rows and columns we have in the end.

In [None]:
# (number of rows, number of columns) of the Neighbourhood DataFrame.
Neighbourhood.shape

Let's add Latitude and Longitude of neighbourhoods to our DataFrame.

In [None]:
# Latitude/longitude table, keyed by 'Postal Code' (the column used for the merge below).
Neighbourhood_coordinates = pd.read_csv("http://cocl.us/Geospatial_data")

# Join the coordinates onto the postal-code table (pd.merge defaults to an inner
# join) and keep the columns in a fixed, readable order. Later cells rely on
# this column order.
merged_neighbour = pd.merge(Neighbourhood_coordinates, Neighbourhood, on='Postal Code')
merged_neighbour = merged_neighbour[['Postal Code','Borough', 'Neighborhood', 'Latitude','Longitude']]

# Display the final result.
merged_neighbour.head(12)

## Creating a map of Toronto

Get Toronto coordinates

In [None]:
address = 'Toronto'

# Look the address up with the Nominatim geocoding service.
geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
# geocode() returns None when nothing matches; fail with a clear message here
# rather than with an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))


In [None]:
# Base map centred on the current `latitude` / `longitude` values.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# One blue circle marker per row of merged_neighbour; the popup reads
# "<neighbourhood>, <borough>".
marker_rows = zip(
    merged_neighbour['Latitude'],
    merged_neighbour['Longitude'],
    merged_neighbour['Borough'],
    merged_neighbour['Neighborhood'],
)
for lat, lng, borough, neighborhood in marker_rows:
    popup = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto

## Downtown Toronto - preprocessing data

I decided to explore Downtown Toronto. First, I create a separate dataframe for it.

In [None]:
# Keep only the rows whose borough is Downtown Toronto and renumber them from zero.
is_downtown = merged_neighbour['Borough'] == 'Downtown Toronto'
downtown_df = merged_neighbour.loc[is_downtown].reset_index(drop=True)
downtown_df.head()

Get Downtown coordinates

In [None]:
address = 'Toronto, Downtown Toronto'

# Look the address up with the Nominatim geocoding service.
# Note: this overwrites `latitude` / `longitude`, which the later map cells read.
geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
# geocode() returns None when nothing matches; fail with a clear message here
# rather than with an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Downtown Toronto are {}, {}.'.format(latitude, longitude))

Creating the map of Toronto Downtown

In [None]:
# Map centred on the current `latitude` / `longitude` values.
map_downtown = folium.Map(location=[latitude, longitude], zoom_start=12)

# One blue circle marker per row of downtown_df, with the neighbourhood name as popup text.
marker_rows = zip(
    downtown_df['Latitude'],
    downtown_df['Longitude'],
    downtown_df['Neighborhood'],
)
for lat, lng, hood_name in marker_rows:
    popup = folium.Popup(hood_name, parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_downtown)

map_downtown

## Explore Neighborhoods in Toronto Downtown

I use the Foursquare API to explore the neighborhoods and segment them.

In [None]:
# Foursquare credentials are read from the environment so that secrets are never
# stored in the notebook. Export FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET
# before starting the kernel. Any key pair that was previously committed in this
# file should be treated as leaked and revoked.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
if not (CLIENT_ID and CLIENT_SECRET):
    print('WARNING: FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
          'Foursquare requests will not be authenticated.')
VERSION = '20180605' # sent as the `v` query parameter of every request

LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius

Function for getting venues

In [None]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, timeout=10):
    """Query the Foursquare "venues/explore" endpoint around each location.

    The module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT values are
    sent with every request.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Parallel sequences; they are zipped together, so iteration stops at
        the shortest one. Each name is printed as it is processed.
    radius : number, default 500
        Sent as the ``radius`` query parameter.
    timeout : number, default 10
        Seconds to wait for each HTTP response before ``requests`` gives up.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame
        has these columns even when no venue is returned. 'Venue Category'
        is None for a venue that carries no category.

    Raises
    ------
    requests.HTTPError
        If the service answers with an error status (for example when the
        credentials are rejected).
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and escape the query string.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }
        response = requests.get('https://api.foursquare.com/v2/venues/explore',
                                params=params, timeout=timeout)
        # Surface HTTP errors directly instead of failing later with a KeyError.
        response.raise_for_status()

        # A location with nothing nearby may come back without groups or items.
        groups = response.json().get('response', {}).get('groups') or []
        items = groups[0].get('items', []) if groups else []

        # Keep only the relevant information for each nearby venue.
        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None,
            ))

    # Passing the column names to the constructor also works for an empty result.
    return pd.DataFrame(rows, columns=columns)

Create the dataframe for Toronto Downtown using my function

In [None]:
# Fetch the venues around every row of downtown_df, using the function's default radius.
downtown_venues = getNearbyVenues(
    names=downtown_df['Neighborhood'],
    latitudes=downtown_df['Latitude'],
    longitudes=downtown_df['Longitude'],
)

In [None]:
# Per neighbourhood, the count of non-null values in every other column
# (i.e. how many venues were returned for it).
downtown_venues.groupby('Neighborhood').count()

Find out how many unique categories can be curated from all the returned venues

In [None]:
# Number of distinct values in the 'Venue Category' column.
len(downtown_venues['Venue Category'].unique())

## Analyze Each Neighborhood

In [None]:
# One indicator column per venue category (empty prefix, so the column names are
# the bare category names), plus the neighbourhood each venue row belongs to.
# NOTE(review): if a venue category is itself named 'Neighborhood', its indicator
# column is overwritten by the assignment below — verify against the data.
category_dummies = pd.get_dummies(downtown_venues[['Venue Category']], prefix="", prefix_sep="")
downtn_onehot = category_dummies.assign(Neighborhood=downtown_venues['Neighborhood'])

downtn_onehot.head()

Examine the dataframe size.

In [None]:
# (number of venue rows, number of columns) of the one-hot encoded frame.
downtn_onehot.shape

Group the rows by neighborhood and take the mean of each category column, i.e. the frequency of occurrence of each category within the neighborhood.

In [None]:
# Mean of every indicator column per neighbourhood, then move the
# neighbourhood name from the index back into the first column.
category_means = downtn_onehot.groupby('Neighborhood').mean()
downtn_grouped = category_means.reset_index()
downtn_grouped.head()

Print each neighborhood along with the top 5 most common venues

In [None]:
num_top_venues = 5

for hood in downtn_grouped['Neighborhood']:
    print("----"+hood+"----")
    # Transpose this neighbourhood's row so that every column becomes a (venue, freq) row.
    freq_table = downtn_grouped[downtn_grouped['Neighborhood'] == hood].T.reset_index()
    freq_table.columns = ['venue','freq']
    ranked = (
        freq_table.iloc[1:]                      # skip the 'Neighborhood' row itself
        .astype({'freq': float})
        .round({'freq': 2})
        .sort_values('freq', ascending=False)
        .reset_index(drop=True)
    )
    print(ranked.head(num_top_venues))
    print('\n')

A function to sort the venues in descending order.

In [None]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest entries of `row`, largest first.

    The first element of `row` is skipped before ranking (in this notebook it
    holds the neighbourhood name rather than a frequency). The remaining
    values are sorted in descending order and the index labels of the first
    `num_top_venues` of them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

Create the new dataframe and display the top 10 venues for each neighborhood.

In [None]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# Create one column per rank: '1st Most Common Venue', '2nd ...', and so on.
columns = ['Neighborhood']
for rank in range(1, num_top_venues + 1):
    # Ranks ending in 1, 2 or 3 take 'st'/'nd'/'rd', except 11, 12 and 13
    # (and 111, ...), which take 'th' like everything else.
    if rank % 10 in (1, 2, 3) and rank % 100 not in (11, 12, 13):
        suffix = indicators[rank % 10 - 1]
    else:
        suffix = 'th'
    columns.append('{}{} Most Common Venue'.format(rank, suffix))

# Create a new dataframe with one row per neighbourhood of downtn_grouped.
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = downtn_grouped['Neighborhood']

# Fill the rank columns of every row with that neighbourhood's top categories.
for ind in range(downtn_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(downtn_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

## Cluster Neighborhoods

Run k-means to group the neighborhoods into 3 clusters.

In [None]:
# set number of clusters
kclusters = 3

downtn_grouped_clustering = downtn_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(downtn_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

Create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [None]:
# Remove any 'Cluster Labels' column left over from an earlier run of this cell,
# so that insert() does not fail with "already exists" when the cell is re-executed.
neighborhoods_venues_sorted = neighborhoods_venues_sorted.drop(columns='Cluster Labels', errors='ignore')
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# Attach the cluster label and ranked venues to each downtown neighbourhood.
# join() is a left join: a neighbourhood with no match in
# neighborhoods_venues_sorted keeps its row and gets NaN in the added columns.
downtn_merged = downtown_df.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

downtn_merged.head()

Finally, let's visualize the resulting clusters

In [None]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(downtn_merged['Latitude'], downtn_merged['Longitude'], downtn_merged['Neighborhood'], downtn_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## Examine Clusters

__Cluster 1 - residential areas__

In [None]:
# Rows with cluster label 0. 'Neighborhood' identifies each row (the borough is
# the same for every downtown row), followed by every column from position 3 onwards.
cluster_columns = ['Neighborhood'] + list(downtn_merged.columns[3:])
downtn_merged.loc[downtn_merged['Cluster Labels'] == 0, cluster_columns]

__Cluster 2 - harbor__

In [None]:
# Rows with cluster label 1. 'Neighborhood' identifies each row (the borough is
# the same for every downtown row), followed by every column from position 3 onwards.
cluster_columns = ['Neighborhood'] + list(downtn_merged.columns[3:])
downtn_merged.loc[downtn_merged['Cluster Labels'] == 1, cluster_columns]

__Cluster 3 - business center__

In [None]:
# Rows with cluster label 2. 'Neighborhood' identifies each row (the borough is
# the same for every downtown row), followed by every column from position 3 onwards.
cluster_columns = ['Neighborhood'] + list(downtn_merged.columns[3:])
downtn_merged.loc[downtn_merged['Cluster Labels'] == 2, cluster_columns]