# Coursera IBM 

## Segmenting and Clustering Neighborhoods in Toronto


In [1]:
# python version is 3.8.6, 64-bit
import numpy as np
import pandas as pd

## Task 1
> Copy the data table from Wikipedia to the clipboard, then use this pandas method to get a DataFrame.

> The Wikipedia table is small enough to fit on the clipboard, and it takes about 10 seconds to make a DataFrame this way.

In [26]:
# Copy the postal-code table from the Wikipedia page to the clipboard, then run this
# cell: pandas parses the clipboard text into a DataFrame, so no scraping is needed.
# NOTE(review): the result depends on whatever is on the clipboard at run time, so this
# cell is not reproducible under "Restart & Run All" — consider loading a saved copy.
df_raw = pd.read_clipboard()

In [27]:
# Preview the first rows of the raw table as it was read from the clipboard.
df_raw.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M1B,Scarborough,"Malvern, Rouge"
2,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
3,M1E,Scarborough,"Guildwood, Morningside, West Hill"
4,M1G,Scarborough,Woburn


Exclude the rows whose "Borough" column is "Not assigned".

In [42]:
# Keep only the postal codes whose borough is something other than 'Not assigned'.
is_assigned = df_raw['Borough'].ne('Not assigned')
df_clean = df_raw[is_assigned]

In [41]:
# (rows, columns) of the filtered table.
df_clean.shape

(103, 3)

## Task 2

> I picked "ArcGIS" provider for GeoCoder module.
https://geocoder.readthedocs.io/providers/ArcGIS.html

> Here we iterate over all the postal codes and store the returned coordinates in a dictionary.




In [93]:
# `geocoder` is used below but was never imported anywhere earlier in the notebook,
# so on a fresh kernel this cell failed with NameError.
import geocoder

# Maps each postal code to its (latitude, longitude) tuple.
coordinates_dict = {}
# get list of Postal Codes
postal_codes = df_clean['Postal Code'].to_numpy()
for code in postal_codes:
    # One ArcGIS lookup (network call) per postal code.
    g = geocoder.arcgis(f'{code} Canada')
    # A failed lookup may yield no JSON payload or one without coordinates; fail with a
    # message that names the postal code instead of an opaque TypeError/KeyError.
    result = g.json or {}
    if 'lat' not in result or 'lng' not in result:
        raise RuntimeError(f'ArcGIS returned no coordinates for postal code {code}')
    coordinates_dict[code] = (result['lat'], result['lng'])

In [142]:
# Turn the dictionary into a DataFrame: each key becomes an index label and the
# two values of its tuple become float columns 0 and 1.
fd_coordinates = pd.DataFrame.from_dict(
    coordinates_dict,
    orient='index',
    dtype='float',
)

In [143]:
# Give the two positional columns readable names.
fd_coordinates = fd_coordinates.rename(columns={0: 'Latitude', 1: 'Longitude'})

In [155]:
# Attach the coordinates to the neighbourhood table by postal code.
# The previous version discarded the postal-code index of `fd_coordinates` and glued
# the two frames together by row position, which silently misaligns coordinates as
# soon as the frames differ in length or order (e.g. a repeated postal code, which the
# coordinates dictionary stores only once). Joining on the key avoids that.
d1 = df_clean.reset_index(drop=True)
# Expose the postal-code index as a regular 'Postal Code' column to join on.
d2 = fd_coordinates.rename_axis('Postal Code').reset_index()
# how='left' keeps every row of d1 in its original order; validate= raises if a postal
# code appears more than once on the coordinates side.
df_full = d1.merge(d2, on='Postal Code', how='left', validate='many_to_one')

In [156]:
# Preview the combined table (postal code, borough, neighbourhood, coordinates).
df_full.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.81139,-79.19662
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.78574,-79.15875
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.76575,-79.1747
3,M1G,Scarborough,Woburn,43.76812,-79.21761
4,M1H,Scarborough,Cedarbrae,43.76944,-79.23892


## Task 3

### 3.1 Map all the Toronto neighbourhoods on Folium map

In [284]:
import folium
from pandas.io.json import json_normalize

Using the previously utilized "ArcGis" provider for Geocoder to obtain Toronto coordinates

In [218]:
# Geocode the city itself; these coordinates are used to position the Folium maps.
g = geocoder.arcgis('Toronto Canada')
toronto_lat, toronto_lng = g.json['lat'], g.json['lng']
f'Toronto is located at {toronto_lat}, {toronto_lng}'

'Toronto is located at 43.648690000000045, -79.38543999999996'

Folium takes the coordinates of all the neighbourhoods and maps them onto the Toronto map.

In [202]:
# Base map positioned at the Toronto coordinates.
toronto_map = folium.Map(
    location=[toronto_lat, toronto_lng],
    zoom_start=10,
)

In [217]:
# Add one blue circle marker per row of df_full; the popup shows the neighbourhood name.
neighbourhood_points = zip(df_full['Latitude'], df_full['Longitude'], df_full['Neighbourhood'])
for lat, lng, label in neighbourhood_points:
    marker = folium.CircleMarker(
        location=[lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='blue',
        fill_opacity=0.6,
    )
    marker.add_to(toronto_map)
toronto_map  # GitHub doesn't show the map

> Apparently GitHub doesn't render Folium maps, so here is a picture of the map.

![Image of Yaktocat](https://jo5u7g.by.files.1drv.com/y4mWVym4oBRWElWVWGHk3IWYJmAYl7BxdGA61FlydQ9UJkhedMrvjKiEIUxk-BqMwbmdCFstcZkh08PCLOeB-Md5wXcYI9HXLRNW4HXl37ETAFMNpOxmQuCBp68Tc3zRavLrhpdGwHzxQlM9sR8eER1qLyuqGOjkIDcUQvUViMvkgqzG5O48c6jSigXGh2DI7ZtkjJbmjXMEQYL2Elxu4T-mw/map.PNG?psid=1)

### 3.2 Trending Venues for each neighbourhood in Toronto

> Foursquare has a Python module, which is far more compact than making "request" calls by hand.

In [230]:
import foursquare

# Foursquare credentials are read from the environment so that secrets never live in
# the notebook. Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before starting
# the kernel; a missing variable raises KeyError here rather than failing later.
# NOTE: the key pair that used to be hard-coded in this cell was committed in plain
# text and should be treated as compromised and regenerated.
import os

CLIENT_ID = os.environ['FOURSQUARE_CLIENT_ID']  # your Foursquare ID
CLIENT_SECRET = os.environ['FOURSQUARE_CLIENT_SECRET']  # your Foursquare Secret

In [None]:
# Build the Foursquare API handler from the CLIENT_ID / CLIENT_SECRET credentials.
client = foursquare.Foursquare(
    client_id=CLIENT_ID,
    client_secret=CLIENT_SECRET,
)

### Explore venues at the specific location
> client.venues.explore(params={'ll': f'{lat},{lng}', 'section': 'trending', 'limit': '5', 'sortByPopularity': '1'})

> "ll" takes Latitude and Longitude from Neighbourhood in the Toronto
> "section" : "trending" | give us trending venues at the given location
> 'limit': '5' | we get only 5 venues
> 'sortByPopularity': '1' | we get only 5 top trending venues

In [None]:
def get_category_type(row):
    """Return the name of the first category of a venue record.

    The venue JSON carries far more detail than needed; only the name of the
    first listed category is kept.

    Parameters
    ----------
    row : mapping-like (e.g. dict or pandas Series)
        Looked up under 'categories' first; if that key is absent, under
        'venue.categories'. The value is expected to be a list of dicts that
        each have a 'name' entry.

    Returns
    -------
    The 'name' of the first category, or None when the category list is
    empty or None.

    Raises
    ------
    KeyError
        If neither 'categories' nor 'venue.categories' is present.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; any other error propagates
        # instead of being silently swallowed.
        categories_list = row['venue.categories']

    # Covers both an empty list and a missing (None) value.
    if not categories_list:
        return None
    return categories_list[0]['name']

In [340]:
# Collect one DataFrame of trending venues per postal code.
df_list = []
# Columns kept from the flattened venue JSON.
venue_columns = ['venue.id', 'venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
for lat, lng, nhood, pcode in zip(df_full['Latitude'], df_full['Longitude'], df_full['Neighbourhood'], df_full['Postal Code']):
    pcode_json = client.venues.explore(params={'ll': f'{lat},{lng}', 'section': 'trending', 'limit': '5', 'sortByPopularity': '1'})
    items = pcode_json['groups'][0]['items']
    if not items:
        # No venues for this location: the flattened frame would have none of the
        # expected columns and the column selection below would raise KeyError.
        continue
    # pd.json_normalize is the supported public entry point (the pandas.io.json import
    # path is deprecated). .copy() makes the column subset an independent frame so the
    # assignments below do not write into a slice.
    dataframe = pd.json_normalize(items)[venue_columns].copy()   # convert json into Data Frame
    dataframe['venue.categories'] = dataframe.apply(get_category_type, axis=1)  # keep only the first category name
    # add columns connecting each venue with the location it was found for
    dataframe['Postal Code'] = pcode
    dataframe['Neighbourhood'] = nhood
    df_list.append(dataframe)   # each frame goes to the list for the later merge

For each postal code in the Toronto DataFrame we got the 5 most trending venues.

In [341]:
# Number of per-postal-code venue tables collected (one table per postal code).
len(df_list)    # for 103 Postal codes we got 103 tables, one for each postal code

103

In [344]:
# Stack the per-postal-code tables into a single venues table.
# NOTE(review): each table keeps its own row labels, so index values repeat in the
# result — consider ignore_index=True if label-based lookups are ever needed.
df_venuse = pd.concat(df_list)  # merge 103 tables into one
df_venuse.shape

(515, 7)

In [345]:
# Preview the combined venues table.
df_venuse.head()

Unnamed: 0,venue.id,venue.name,venue.categories,venue.location.lat,venue.location.lng,Postal Code,Neighbourhood
0,4aef94c7f964a52060d921e3,Walmart Supercentre,Big Box Store,43.833671,-79.256036,M1B,"Malvern, Rouge"
1,4b59bb58f964a5201a9528e3,Mike & Lori's No Frills,Grocery Store,43.798476,-79.141303,M1B,"Malvern, Rouge"
2,4b847328f964a520293631e3,Food Basics,Supermarket,43.770184,-79.184852,M1B,"Malvern, Rouge"
3,4da09a5758c2224ba2a95679,Rouge National Urban Park,National Park,43.818747,-79.170414,M1B,"Malvern, Rouge"
4,4c880ad1da5da1cd70f030e9,Dollarama,Discount Store,43.783187,-79.202538,M1B,"Malvern, Rouge"


The 15 most common categories among the trending venues

In [372]:
# Count venues per category and show the 15 most frequent ones.
df_venuse['venue.categories'].value_counts().head(15)

Grocery Store             108
Supermarket                74
Park                       52
Shopping Mall              49
Pharmacy                   29
Coffee Shop                23
Warehouse Store            23
Big Box Store              18
Furniture / Home Store     12
Plaza                      11
Department Store           11
Movie Theater               8
Historic Site               7
Discount Store              6
Fast Food Restaurant        6
Name: venue.categories, dtype: int64

Let's map all the trending venues on the Toronto map.

In [373]:
# Fresh map positioned at Toronto, with one green circle marker per venue;
# the popup shows the venue name.
toronto_map_ven = folium.Map(
    location=[toronto_lat, toronto_lng],
    zoom_start=10,
)
venue_points = zip(df_venuse['venue.location.lat'], df_venuse['venue.location.lng'], df_venuse['venue.name'])
for lat, lng, label in venue_points:
    marker = folium.CircleMarker(
        location=[lat, lng],
        radius=3,
        popup=label,
        color='green',
        fill=True,
        fill_color='green',
        fill_opacity=0.5,
    )
    marker.add_to(toronto_map_ven)
toronto_map_ven

> Apparently GitHub doesn't render Folium maps, so here is a picture of the map.

![Image of Yaktocat](https://j45p7g.by.files.1drv.com/y4mBIvhsZ5WoZ9o0QbH5VIfLmOyKLr_uBJlbpza2CR8AUyDkS9y5XFQBpC_ez6XvT6Z-p8jl8VS9g8iNcqH4Xs84ohyXPrxWD48PpzcLpJr1MgmPFEOaLo74HBERN5ZW3WuGdYFwCyvOSSIuFiqXzh36y7KJjJ-IZjDHcG3ioqZ0g7fSpcm7lvYmIEy0NUYx4HFKEyq-sbGxfjExLirOf3jqA)