## Applied Data Science Capstone Project

In [5]:
import requests # library to handle requests
import pandas as pd # library for data analsysis
import numpy as np
from bs4 import BeautifulSoup
import re

import json # library to handle JSON files

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library

from sklearn.cluster import KMeans 

In [6]:
CLIENT_ID = '' # your Foursquare ID
CLIENT_SECRET = '' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 10 # A default Foursquare API limit value

VENUES_URL = "https://api.foursquare.com/v2/venues/"


In [7]:
def getAuthParams():
    return f'client_id={CLIENT_ID}&client_secret={CLIENT_SECRET}&v={VERSION}'

def fetchUrlresponse(url):
    target_url = url+getAuthParams()
    print(target_url)
    return requests.get(target_url).json()['response']

def getLocationForAddress(address):
    geolocator = Nominatim(user_agent="ny_explorer")
    location = geolocator.geocode(address)
    print(f'The geograpical coordinate of {address} are {location.latitude}, {location.longitude}.')
    return location.latitude,location.longitude

In [8]:
cities_df = pd.read_csv('worldcities.csv', 
                            encoding = "ISO-8859-1",
                            dtype={'Div1Airport': str, 'Div1TailNum': str, 
                                   'Div2Airport': str, 'Div2TailNum': str})

In [9]:
cities_df.columns

Index(['city', 'city_ascii', 'lat', 'lng', 'country', 'iso2', 'iso3',
       'admin_name', 'capital', 'population', 'id'],
      dtype='object')

In [10]:
cities_df.head()

Unnamed: 0,city,city_ascii,lat,lng,country,iso2,iso3,admin_name,capital,population,id
0,Tokyo,Tokyo,35.6897,139.6922,Japan,JP,JPN,TÅkyÅ,primary,37977000.0,1392685764
1,Jakarta,Jakarta,-6.2146,106.8451,Indonesia,ID,IDN,Jakarta,primary,34540000.0,1360771077
2,Delhi,Delhi,28.66,77.23,India,IN,IND,Delhi,admin,29617000.0,1356872604
3,Mumbai,Mumbai,18.9667,72.8333,India,IN,IND,MahÄrÄshtra,admin,23355000.0,1356226629
4,Manila,Manila,14.5958,120.9772,Philippines,PH,PHL,Manila,primary,23088000.0,1608618140


In [11]:
cities_df.drop(columns=['city','iso2','iso3','admin_name','capital','id','population'],inplace=True)

In [12]:
cities_df.rename(columns={'city_ascii':'city'},inplace=True)

In [13]:
cities_df.head()

Unnamed: 0,city,lat,lng,country
0,Tokyo,35.6897,139.6922,Japan
1,Jakarta,-6.2146,106.8451,Indonesia
2,Delhi,28.66,77.23,India
3,Mumbai,18.9667,72.8333,India
4,Manila,14.5958,120.9772,Philippines


In [14]:
venue_cat_url = f'{VENUES_URL}categories?'

In [15]:
venues_cat_response = fetchUrlresponse(venue_cat_url)

https://api.foursquare.com/v2/venues/categories?client_id=YSGVRW31EIOZBPME51ZY20BAVZWXUIUHTUA3Z2O405R0OBDO&client_secret=P5WKPSON2BR51PKVITUNW5HG0SHD4CZTGN02SMSQNKGXULJR&v=20180605


In [16]:
venues_cat_response

{'categories': [{'id': '4d4b7104d754a06370d81259',
   'name': 'Arts & Entertainment',
   'pluralName': 'Arts & Entertainment',
   'shortName': 'Arts & Entertainment',
   'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/arts_entertainment/default_',
    'suffix': '.png'},
   'categories': [{'id': '56aa371be4b08b9a8d5734db',
     'name': 'Amphitheater',
     'pluralName': 'Amphitheaters',
     'shortName': 'Amphitheater',
     'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/arts_entertainment/default_',
      'suffix': '.png'},
     'categories': []},
    {'id': '4fceea171983d5d06c3e9823',
     'name': 'Aquarium',
     'pluralName': 'Aquariums',
     'shortName': 'Aquarium',
     'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/arts_entertainment/aquarium_',
      'suffix': '.png'},
     'categories': []},
    {'id': '4bf58dd8d48988d1e1931735',
     'name': 'Arcade',
     'pluralName': 'Arcades',
     'shortName': 'Arcade',
     'icon': {'prefix': 'https://

In [17]:
main_cat_columns = ['name','id','icon_url']
main_categories = pd.DataFrame(columns=main_cat_columns)

In [18]:
for category in venues_cat_response['categories']:
    main_categories = main_categories.append(
        {'name':category['name'],
         'id':category['id'],
         'icon_url':f"{category['icon']['prefix']}.{category['icon']['suffix']}"
        },ignore_index=True
    )

In [19]:
print(main_categories.shape)
main_categories.head()

(10, 3)


Unnamed: 0,name,id,icon_url
0,Arts & Entertainment,4d4b7104d754a06370d81259,https://ss3.4sqi.net/img/categories_v2/arts_en...
1,College & University,4d4b7105d754a06372d81259,https://ss3.4sqi.net/img/categories_v2/educati...
2,Event,4d4b7105d754a06373d81259,https://ss3.4sqi.net/img/categories_v2/event/d...
3,Food,4d4b7105d754a06374d81259,https://ss3.4sqi.net/img/categories_v2/food/de...
4,Nightlife Spot,4d4b7105d754a06376d81259,https://ss3.4sqi.net/img/categories_v2/nightli...


In [20]:
MAX_SUB_CATEGORIES = 100

In [21]:
sub_cat_columns = ['parent_id','name','id','icon_url']
sub_categories = pd.DataFrame(columns=sub_cat_columns)

In [22]:
for main_category in venues_cat_response['categories']:
    for i,category in enumerate(main_category['categories']):
        if i==MAX_SUB_CATEGORIES:
            break
        sub_categories = sub_categories.append(
            {
                'parent_id':main_category['id'],
                'name':category['name'],
                'id':category['id'],
                'icon_url':f"{category['icon']['prefix']}.{category['icon']['suffix']}"
            },ignore_index=True
        )

In [23]:
print(sub_categories.shape)
sub_categories.head()

(423, 4)


Unnamed: 0,parent_id,name,id,icon_url
0,4d4b7104d754a06370d81259,Amphitheater,56aa371be4b08b9a8d5734db,https://ss3.4sqi.net/img/categories_v2/arts_en...
1,4d4b7104d754a06370d81259,Aquarium,4fceea171983d5d06c3e9823,https://ss3.4sqi.net/img/categories_v2/arts_en...
2,4d4b7104d754a06370d81259,Arcade,4bf58dd8d48988d1e1931735,https://ss3.4sqi.net/img/categories_v2/arts_en...
3,4d4b7104d754a06370d81259,Art Gallery,4bf58dd8d48988d1e2931735,https://ss3.4sqi.net/img/categories_v2/arts_en...
4,4d4b7104d754a06370d81259,Bowling Alley,4bf58dd8d48988d1e4931735,https://ss3.4sqi.net/img/categories_v2/arts_en...


In [24]:
def getMainCategories():
    cat_name = list(main_categories['name'].values)
    cat_id = list(main_categories['id'].values)
    return {'cat_id':cat_id,'cat_name' : cat_name}

#### For search api need:
* ll -> lat,lan
* radius -> in m, max 100000
* categoryId -> coma seperated

api base path: `GET https://api.foursquare.com/v2/venues/search`

In [25]:
def getVenueColumns():
    return ['name','lat','lng','distance_m','postalCode','address','cat_name','cat_id']

In [26]:
def getEmptyVenuesDataFrame():
    return pd.DataFrame(columns=getVenueColumns())

In [27]:
def getVenuesForCity(city_name,categories,radius=20000):
    city = cities_df[cities_df['city']==CITY_NAME].reset_index(drop=True).iloc[0]
    lat = city['lat']
    lng = city['lng']
    venus_df = getEmptyVenuesDataFrame()
    
    for i,category_id in enumerate(categories['cat_id']):
#         if i==3:
#             break
        venus_df = venus_df.append(getVenuesForIdAndLocation(category_id,categories['cat_name'][i],radius,lat,lng),ignore_index=True)
    
    return city,venus_df.reset_index(drop=True)

In [28]:
def getVenuesForIdAndLocation(category_id,cat_name,radius,lat,lng):
    ll = f'{lat},{lng}'
    venue_search_url = f"{VENUES_URL}search?ll={ll}&radious={radius}&categoryId={category_id}&"
    venues_df = getEmptyVenuesDataFrame()
    venues_response = fetchUrlresponse(venue_search_url)
    for venue in venues_response['venues']:
        venues_df = venues_df.append(parseVenueResponse(category_id,cat_name,venue),ignore_index=True)
    return venues_df

In [29]:
def parseVenueResponse(cat_id,cat_name,venue):
    try:
        name = venue['name']
    except:
        name = ''
    try:
        lat = venue['location']['lat']
    except:
        lat = np.nan
    try:
        lng = venue['location']['lng']
    except:
        lng = np.nan
    try:
        distance_m = venue['location']['distance']
    except:
        distance_m = ''
    try:
        postalCode = venue['location']['postalCode']
    except:
        postalCode = ''
    try:
        address = ','.join(venue['location']['formattedAddress'])
    except:
        address = ''
    return {
        'name' : name,
        'lat' : lat,
        'lng' : lng,
        'distance_m' : distance_m,
        'postalCode': postalCode,
        'address' : address,
        'cat_id': cat_id,
        'cat_name': cat_name,
    }

In [30]:
LOC_SERVICE_API_KEY = '' # Api key for geocoder service
def getAddressForLocation(lat,lng):
    loc_url = f'https://revgeocode.search.hereapi.com/v1/revgeocode?at={lat},{lng}&apikey={LOC_SERVICE_API_KEY}'
    loc_res = requests.get(loc_url).json()
    return loc_res['items'][0]['address']['label']

In [84]:
CITY_NAME = 'Toronto'

In [85]:
city, venue_df = getVenuesForCity(CITY_NAME,getMainCategories())

https://api.foursquare.com/v2/venues/search?ll=43.7417,-79.3733&radious=20000&categoryId=4d4b7104d754a06370d81259&client_id=YSGVRW31EIOZBPME51ZY20BAVZWXUIUHTUA3Z2O405R0OBDO&client_secret=P5WKPSON2BR51PKVITUNW5HG0SHD4CZTGN02SMSQNKGXULJR&v=20180605
https://api.foursquare.com/v2/venues/search?ll=43.7417,-79.3733&radious=20000&categoryId=4d4b7105d754a06372d81259&client_id=YSGVRW31EIOZBPME51ZY20BAVZWXUIUHTUA3Z2O405R0OBDO&client_secret=P5WKPSON2BR51PKVITUNW5HG0SHD4CZTGN02SMSQNKGXULJR&v=20180605
https://api.foursquare.com/v2/venues/search?ll=43.7417,-79.3733&radious=20000&categoryId=4d4b7105d754a06373d81259&client_id=YSGVRW31EIOZBPME51ZY20BAVZWXUIUHTUA3Z2O405R0OBDO&client_secret=P5WKPSON2BR51PKVITUNW5HG0SHD4CZTGN02SMSQNKGXULJR&v=20180605
https://api.foursquare.com/v2/venues/search?ll=43.7417,-79.3733&radious=20000&categoryId=4d4b7105d754a06374d81259&client_id=YSGVRW31EIOZBPME51ZY20BAVZWXUIUHTUA3Z2O405R0OBDO&client_secret=P5WKPSON2BR51PKVITUNW5HG0SHD4CZTGN02SMSQNKGXULJR&v=20180605
https://api.

In [86]:
venue_df.head()

Unnamed: 0,name,lat,lng,distance_m,postalCode,address,cat_name,cat_id
0,Toronto Botanical Garden,43.734104,-79.358321,1471,M3C 1P2,"777 Lawrence Avenue East (Leslie St.),Toronto ...",Arts & Entertainment,4d4b7104d754a06370d81259
1,Union Station,43.645167,-79.380641,10762,M5J 1E6,"65 Front St W (btwn Bay & York St),Toronto ON ...",Arts & Entertainment,4d4b7104d754a06370d81259
2,The Distillery Historic District,43.650244,-79.359323,10242,M5A 3C4,"btwn Front, Cherry, Gardiner & Parliament,Toro...",Arts & Entertainment,4d4b7104d754a06370d81259
3,Toronto Zoo,43.820582,-79.181551,17737,M1B 5K7,361 Old Finch Av (at Meadowvale & Toronto Zoo ...,Arts & Entertainment,4d4b7104d754a06370d81259
4,Downsview Park,43.745867,-79.480454,8629,M3K 2B6,"35 Carl Hall Rd.,Toronto ON M3K 2B6,Canada",Arts & Entertainment,4d4b7104d754a06370d81259


In [87]:
venue_df.shape

(248, 8)

In [88]:
# create map of New York using latitude and longitude values
city_map1 = folium.Map(location=[city['lat'], city['lng']], zoom_start=10)

# add markers to map

for lat, lng, name, address,cat_name in zip(venue_df['lat'], venue_df['lng'], venue_df['name'], venue_df['address'], venue_df['cat_name']):
    label = f'{name},\n{address},\n({cat_name}),({round(lat,4)},{round(lng,4)})'
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color= 'blue',
        fill=True,
        fill_color= 'blue',
        fill_opacity=0.7,
        parse_html=False).add_to(city_map1)  

city_map1

In [89]:
cluster_colors = ['#f59e42','#4296f5','#f54260','#f55a42' , 'purple']
clusterNum = 5

In [90]:
venue_lat_lng = venue_df[['lat','lng']]

k_means = KMeans(init = "k-means++", n_clusters = clusterNum, n_init = 20)
k_means.fit(venue_lat_lng)

labels = k_means.labels_
centers = pd.DataFrame(k_means.cluster_centers_,columns=['lat','lng'])

In [91]:
centers

Unnamed: 0,lat,lng
0,43.788791,-79.334557
1,43.627809,-79.653148
2,43.660943,-79.401128
3,43.84685,-79.476401
4,43.839406,-79.084074


In [92]:
center_address = pd.Series(dtype='str')

for lat, lng in zip(centers['lat'], centers['lng']):
    center_address = center_address.append(pd.Series([getAddressForLocation(lat,lng)]))
    
center_address.reset_index(drop=True,inplace=True)
centers['address'] = center_address

In [93]:
centers

Unnamed: 0,lat,lng,address
0,43.788791,-79.334557,"8 Bickerton Cres, Toronto, ON M2J 3S8, Canada"
1,43.627809,-79.653148,"LG Electronics Canada (Goldstar), 550 Matheson..."
2,43.660943,-79.401128,"582 Spadina Ave, Toronto, ON M5S 2H2, Canada"
3,43.84685,-79.476401,"32 Apple Grove Ct, Vaughan, ON L6A 4C2, Canada"
4,43.839406,-79.084074,"1400 The Esplanade N, Pickering, ON L1V 6V2, C..."


In [94]:
# create map of New York using latitude and longitude values
city_map = folium.Map(location=[city['lat'], city['lng']], zoom_start=10)

# add markers to map

for lat, lng, name, address,cat_name,cluster_label in zip(venue_df['lat'], venue_df['lng'], venue_df['name'], venue_df['address'], venue_df['cat_name'],labels):
    label = f'{name},\n{address},\n({cat_name}),({round(lat,4)},{round(lng,4)})'
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color=cluster_colors[cluster_label],
        fill=True,
        fill_color=cluster_colors[cluster_label],
        fill_opacity=0.7,
        parse_html=False).add_to(city_map)  
    

for cluster_num,lat, lng,center_add in zip(range(clusterNum),centers['lat'], centers['lng'],centers['address']):
    label = f'Center: {cluster_num+1}, {center_add} ({round(lat,4)},{round(lng,4)})'
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=10,
        popup=label,
        color='blue',
        fill=True,
        fill_color=cluster_colors[cluster_num],
        fill_opacity=0.7,
        parse_html=False).add_to(city_map)  
    
    
city_map