# Capstone project

In this project I will analyse the top 40 most visited cities.

In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests


## Get cities data

Read the list of 40 most visited cities and get their coordinates

In [2]:
from geopy.geocoders import Nominatim

CITIES_URL = "https://www.listchallenges.com/top-100-most-visited-cities-in-the-world"

geolocator = Nominatim(user_agent="ny_explorer")

# Download the ranking page. Fail loudly on an HTTP error instead of parsing
# an error page that contains no city names.
r = requests.get(CITIES_URL, timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.text, "html.parser")

# Each <div class="item-name"> element is treated as one city name.
divs = soup.find_all("div", {"class": "item-name"})

# cities holds [name, geopy Location] pairs in page order.
cities = []
for div in divs:
    name = div.text.strip()
    location = geolocator.geocode(name)
    if location is None:
        # geocode() returns None when nothing matches; later cells read
        # location.latitude, so such entries are dropped here.
        print("Could not geocode '{}', skipping".format(name))
        continue
    cities.append([name, location])

cities[0]

['Hong Kong, China',
 Location(香港 Hong Kong, 中西區 Central and Western District, HK, PACIFIC PLACE, 中国, (22.2793278, 114.1628131, 0.0))]

## Collect Foursquare data

Get a list of venues around city centers

1. Prepare credentials
2. Read category tree
3. Read venues
4. Transform venue data from JSON into a dataframe
5. Save it for future use

In [3]:
# Authentication preparation
#
# Credentials are taken from the first two lines of FS_cred.txt
# (line 1: client id, line 2: client secret). If the file cannot be used,
# the placeholder values below stay in effect.

CLIENT_ID = 'REPLACE WITH YOUR ID OR CREATE A FILE FS_cred.txt' # your Foursquare ID
CLIENT_SECRET = 'REPLACE WITH YOUR SECRET OR CREATE A FILE FS_cred.txt' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
filename = "FS_cred.txt"
try:
    with open(filename, "r") as f:
        lines = f.readlines()
    # Both values are assigned in one statement, so a file with a single line
    # cannot replace CLIENT_ID while CLIENT_SECRET is still a placeholder.
    CLIENT_ID, CLIENT_SECRET = lines[0].strip(), lines[1].strip()
    print ("Foursquare credentials read from '{}'".format(filename))
except (OSError, IndexError, UnicodeDecodeError):
    # Missing/unreadable file or fewer than two lines: keep the placeholders.
    print ("Could not read '{}'".format(filename))

# Query-string fragment shared by every Foursquare request in this notebook.
AUTH="client_id={}&client_secret={}&v={}&".format(CLIENT_ID, CLIENT_SECRET, VERSION)

# Do not print AUTH: the cell output is saved with the notebook and would
# expose the credentials.

Foursquare credentials read from 'FS_cred.txt'


In [4]:
# Reading category trees
# Registry of category nodes keyed by category id; starts out empty.
categories = {}

def topParent(me):
    """Return the top-level ancestor of a category node.

    Follows the "parent" links starting at `me` until it reaches a node whose
    "parent" is None, and returns that node. A root node is its own top parent.

    Implemented as a loop so that long parent chains cannot hit Python's
    recursion limit.
    """
    node = me
    while node["parent"] is not None:
        node = node["parent"]
    return node
    
def traverseCategories(lst, parent=None, registry=None):
    """Flatten a nested category tree into a dict keyed by category id.

    Parameters:
        lst: list of category dicts, each with "id", "name" and optionally a
            nested "categories" list of the same shape.
        parent: the already-registered node that owns `lst`, or None for the
            top level.
        registry: dict that receives the nodes; defaults to the module-level
            `categories` dict.

    Each registered node is a dict {"id", "name", "parent"} where "parent" is
    the parent's node dict (or None), which is the structure topParent() walks.
    """
    if registry is None:
        registry = categories
    for i in lst:
        c = {"id": i["id"], "name": i["name"], "parent": parent}
        registry[c["id"]] = c
        # A node without (or with an empty/None) "categories" entry is a leaf.
        traverseCategories(i.get("categories") or [], c, registry)
        
def primaryCategory(venue):
    """Return the primary category dict of a venue.

    Returns "" when the venue has no categories (unchanged legacy behaviour
    that callers test for via the category count). When no category carries a
    truthy "primary" flag, the first category is returned instead of letting
    a StopIteration escape.
    """
    venue_categories = venue["categories"]
    if len(venue_categories) == 0:
        return ""
    for candidate in venue_categories:
        if candidate.get("primary"):
            return candidate
    return venue_categories[0]

# Download the full category tree and index it by id.
url = 'https://api.foursquare.com/v2/venues/categories?{}'.format(AUTH)
response = requests.get(url, timeout=30)
# Surface HTTP errors here rather than as an opaque KeyError on the lookups below.
response.raise_for_status()
json_categories = response.json()
traverseCategories(json_categories["response"]["categories"])

# Smoke test: the printed values should match the names quoted in the messages.
print ("Testing category tree: ")
print ("Must be 'Laser Tag': ", categories["52e81612bcbc57f1066b79e6"]["name"])
print ("Must be 'Arts & Entertainment': ", topParent(categories["4bf58dd8d48988d188941735"])["name"])

Testing category tree: 
Must be 'Laser Tag':  Laser Tag
Must be 'Arts & Entertainment':  Arts & Entertainment


In [12]:
# Single Foursquare API call to get list of venues in a rectangular area
# Returns list of json venues
def fs_api_request_search_rect(lat, lng, dlat, dlng, retries=1):
    """Search venues inside the rectangle (lat, lng) .. (lat+dlat, lng+dlng).

    Parameters:
        lat, lng: south-west corner of the rectangle.
        dlat, dlng: extent of the rectangle in latitude / longitude.
        retries: additional attempts made when the reply carries no "venues"
            list (default 1, i.e. the request is sent at most twice).

    Returns the list of venue dicts from the reply.
    Raises RuntimeError, including the last reply, when every attempt fails.
    """
    # NOTE(review): the API may cap the page size below this value - confirm.
    LIMIT = 1000
    url = 'https://api.foursquare.com/v2/venues/search?{}&intent=browse&sw={},{}&ne={},{}&limit={}'.format(
                AUTH,
                lat,
                lng,
                lat+dlat,
                lng+dlng,
                LIMIT)
    payload = None
    for attempt in range(retries + 1):
        payload = requests.get(url, timeout=30).json()
        venues = payload.get("response", {}).get("venues")
        if venues is not None:
            return venues
        if attempt < retries:
            print("Request failed, will try once again. See response below")
            print(payload)
    raise RuntimeError(
        "Foursquare venue search failed after {} attempts: {}".format(retries + 1, payload))

# Scan city for venues by making Foursquare calls about several tiles of the city.
# Making several calls about small tiles of the area brings more results than one call about whole area
def scan_city(center_lat, center_lng, dlat=0.005, dlng=0.005, n=4):
    """Collect venues from an n x n grid of tiles centred on a city.

    Parameters:
        center_lat, center_lng: centre of the scanned area.
        dlat, dlng: size of one tile in degrees of latitude / longitude.
        n: number of tiles per side; the grid spans n*dlat by n*dlng.

    Returns one flat list with the venues of all tiles, in scan order.
    """
    jsonAllCityVenues = []
    for i in range(n):
        for j in range(n):
            # South-west corner of tile (i, j); the grid is centred on the city.
            l1 = center_lat+(i-n/2)*dlat
            l2 = center_lng+(j-n/2)*dlng
            jsonVenues = fs_api_request_search_rect(l1, l2, dlat, dlng)
            progress = "Tile ({},{}): {} venues".format(i, j, len(jsonVenues))
            # Pad so a shorter message fully overwrites a longer previous one
            # on the same "\r" line.
            print(progress.ljust(40), end="\r")
            jsonAllCityVenues.extend(jsonVenues)
    print()
    return jsonAllCityVenues

In [11]:
# Parses json tree with venues and adds them to a list (venues_list variable)
def append_venues_list(city, json_venues):
    """Append one row per categorised venue to the global `venues_list`.

    Parameters:
        city: city label stored in the first column of every row.
        json_venues: list of venue dicts as returned by the venue search.

    Row layout: [city, id, name, lat, lng, category id, category name,
    top-level category name]. Venues without categories are ignored, and so
    are venues whose primary category id is not in the `categories` tree.
    """
    for json_venue in json_venues:
        if len(json_venue["categories"]) == 0:
            # Nothing to classify the venue by.
            continue
        category = primaryCategory(json_venue)
        tree_node = categories.get(category["id"])
        if tree_node is None:
            # The category tree does not know this id; skip the venue instead
            # of aborting the whole scan with a KeyError.
            print("Venue '{}' has unknown category '{}'. Ignoring".format(
                json_venue["name"], category["id"]))
            continue
        venues_list.append([
            city,
            json_venue["id"],
            json_venue["name"],
            json_venue['location']['lat'],
            json_venue['location']['lng'],
            category["id"],
            category["name"],
            topParent(tree_node)["name"]
        ])

In [13]:
# Scan every city and accumulate the venues in venues_list.
venues_list = []
for name, location in cities:
    if location is None:
        # A city without coordinates cannot be scanned.
        print ("City: {}; no coordinates available, skipping".format(name))
        continue
    json_venues = scan_city(location.latitude, location.longitude)
    append_venues_list(name, json_venues)
    print ("City: {}; lat,lon: {}, {}; venues: {}".format(name, location.latitude, location.longitude, len(json_venues)))


Tile (3,3): 173 venues
City: Hong Kong, China; lat,lon: 22.2793278, 114.1628131; venues: 2827
Tile (3,3): 84 venuess
City: Singapore; lat,lon: 1.2904753, 103.8520359; venues: 2961
Tile (3,3): 184 venues
City: Bangkok, Thailand; lat,lon: 13.7538929, 100.8160803; venues: 2809
Tile (3,3): 98 venuess
City: London, United Kingdom; lat,lon: 51.5073219, -0.1276474; venues: 2499
Tile (3,3): 115 venues
City: Paris, France; lat,lon: 48.8566101, 2.3514992; venues: 2431
Tile (3,3): 81 venuess
City: Macau; lat,lon: 22.1899448, 113.5380454; venues: 2773
Tile (3,3): 122 venues
City: New York City, USA; lat,lon: 40.7308619, -73.9871558; venues: 2647
Tile (3,3): 139 venues
City: Shenzhen, China; lat,lon: 22.5445697, 114.0545346; venues: 2608
Tile (3,3): 199 venues
City: Kuala Lumpur, Malaysia; lat,lon: 3.1516636, 101.6943028; venues: 2514
Tile (3,3): 73 venuess
City: Antalya, Turkey; lat,lon: 36.9009641, 30.6954846; venues: 2346
Tile (3,3): 197 venues
City: Istanbul, Turkey; lat,lon: 41.0096334, 28.965

In [527]:
# Drops the last 2461 rows of venues_list.
# NOTE(review): manual one-off correction with a hard-coded row count - the
# reason is not recorded in the notebook. Every re-run removes another 2461
# rows, so this cell is not safe under "Restart & Run All"; confirm whether
# it is still needed.
venues_list = venues_list[:-2461]

In [15]:
# Column names for the rows built by append_venues_list, in row order.
VENUE_COLUMNS = ["city", "id", "name", "lat", "lng", "category_id", "category_name", "top_category"]

# Passing the names to the constructor also works for an empty venues_list,
# where assigning df.columns afterwards would raise a length mismatch.
df_venues = pd.DataFrame(venues_list, columns=VENUE_COLUMNS)
df_venues.head()

Unnamed: 0,city,id,name,lat,lng,category_id,category_name,top_category
0,"Hong Kong, China",4ddbc83eb0fb2604df70f99c,22 Barker Road 白加道22號,22.271539,114.156219,4d954b06a243a5684965b473,Residential Building (Apartment / Condo),Residence
1,"Hong Kong, China",4db74d2c43a10648ae161f3a,Mountain View 山景,22.26871,114.152955,4d954b06a243a5684965b473,Residential Building (Apartment / Condo),Residence
2,"Hong Kong, China",517a2b76e4b06253b7e3fe5d,Peak Tram May Road Station (山頂纜車梅道站),22.273614,114.155979,4bf58dd8d48988d1fc931735,Light Rail Station,Travel & Transport
3,"Hong Kong, China",517a3f55498ea18ae390a21a,Peak Tram Kennedy Road Station (山頂纜車堅尼地道站),22.276572,114.158045,52f2ab2ebcbc57f1066b8b51,Tram Station,Travel & Transport
4,"Hong Kong, China",4d3145f85017a09334e23d9b,Dynasty Court 帝景園,22.275611,114.152694,4f2a210c4b9023bd5841ed28,Housing Development,Residence


In [16]:
# Store the venue table on disk and load it straight back, so that later
# cells work on exactly what the CSV file contains.
VENUES_CSV = "venues_all_40.csv"
df_venues.to_csv(VENUES_CSV, sep="|", index=False)
df_venues = pd.read_csv(VENUES_CSV, sep="|")
df_venues.head()

Unnamed: 0,city,id,name,lat,lng,category_id,category_name,top_category
0,"Hong Kong, China",4ddbc83eb0fb2604df70f99c,22 Barker Road 白加道22號,22.271539,114.156219,4d954b06a243a5684965b473,Residential Building (Apartment / Condo),Residence
1,"Hong Kong, China",4db74d2c43a10648ae161f3a,Mountain View 山景,22.26871,114.152955,4d954b06a243a5684965b473,Residential Building (Apartment / Condo),Residence
2,"Hong Kong, China",517a2b76e4b06253b7e3fe5d,Peak Tram May Road Station (山頂纜車梅道站),22.273614,114.155979,4bf58dd8d48988d1fc931735,Light Rail Station,Travel & Transport
3,"Hong Kong, China",517a3f55498ea18ae390a21a,Peak Tram Kennedy Road Station (山頂纜車堅尼地道站),22.276572,114.158045,52f2ab2ebcbc57f1066b8b51,Tram Station,Travel & Transport
4,"Hong Kong, China",4d3145f85017a09334e23d9b,Dynasty Court 帝景園,22.275611,114.152694,4f2a210c4b9023bd5841ed28,Housing Development,Residence


# Cluster cities by venue categories (top-level)

**Prepare features of the cities. Calculate the ratio of the venue categories that are important for tourists**

In [60]:
# Top-level categories that are not used as city features.
EXCLUDED_TOP_CATEGORIES = ["Professional & Other Places", "Residence", "Event", "Travel & Transport"]

# One indicator column per top-level category, one row per venue.
onehot = pd.get_dummies(df_venues[['top_category']], prefix="", prefix_sep="")
# Keyword form: the positional axis argument is no longer accepted by newer
# pandas. errors="ignore" tolerates a category that is absent from the data.
onehot = onehot.drop(columns=EXCLUDED_TOP_CATEGORIES, errors="ignore")
onehot.insert(0, 'city', df_venues['city'])

# Mean of the indicators = share of a city's venues in each category. Venues
# of the excluded categories stay in the rows, so they still count in the
# denominator.
df_cities = onehot.groupby('city').mean().reset_index()
df_cities.head()

Unnamed: 0,city,Arts & Entertainment,College & University,Food,Nightlife Spot,Outdoors & Recreation,Shop & Service
0,"Amsterdam, Netherlands",0.068851,0.014102,0.23849,0.088345,0.063874,0.194525
1,"Antalya, Turkey",0.051909,0.012441,0.126984,0.028314,0.066495,0.345774
2,"Bangkok, Thailand",0.010443,0.020886,0.340852,0.032999,0.061404,0.273601
3,"Barcelona, Spain",0.052873,0.016214,0.297497,0.089884,0.076489,0.200564
4,"Beijing, China",0.163474,0.009354,0.381292,0.02049,0.068151,0.099332


In [61]:
# Optional per-column normalisation (each ratio divided by its column maximum).
# It is disabled, so the cells below work on the raw ratios.
#df_cities["Arts & Entertainment"] /= df_cities["Arts & Entertainment"].max()
#df_cities["College & University"] /= df_cities["College & University"].max()
#df_cities["Food"] /= df_cities["Food"].max()
#df_cities["Nightlife Spot"] /= df_cities["Nightlife Spot"].max()
#df_cities["Outdoors & Recreation"] /= df_cities["Outdoors & Recreation"].max()
#df_cities["Shop & Service"] /= df_cities["Shop & Service"].max()
df_cities

Unnamed: 0,city,Arts & Entertainment,College & University,Food,Nightlife Spot,Outdoors & Recreation,Shop & Service
0,"Amsterdam, Netherlands",0.068851,0.014102,0.23849,0.088345,0.063874,0.194525
1,"Antalya, Turkey",0.051909,0.012441,0.126984,0.028314,0.066495,0.345774
2,"Bangkok, Thailand",0.010443,0.020886,0.340852,0.032999,0.061404,0.273601
3,"Barcelona, Spain",0.052873,0.016214,0.297497,0.089884,0.076489,0.200564
4,"Beijing, China",0.163474,0.009354,0.381292,0.02049,0.068151,0.099332
5,"Berlin, Germany",0.059406,0.048605,0.172367,0.039604,0.032403,0.225473
6,"Budapest, Hungary",0.070539,0.021501,0.158431,0.061486,0.107507,0.174651
7,"Dubai, United Arab Emirates",0.015744,0.022901,0.214218,0.031966,0.186069,0.240935
8,"Florence, Italy",0.079353,0.029661,0.340524,0.072419,0.070108,0.179507
9,"Guangzhou, China",0.045695,0.010101,0.521886,0.013949,0.029822,0.149591


**Do cluster and assign labels to cities**

In [62]:
from sklearn.cluster import KMeans
kclusters = 5
# Features are the category ratios only. "Class" is dropped as well so that
# re-running this cell does not feed the previous labels back into the model.
city_clustering = df_cities.drop(columns=["city", "Class"], errors="ignore")
# n_init is pinned because its default differs between scikit-learn versions.
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(city_clustering)
df_cities["Class"] = kmeans.labels_
df_cities.head()

Unnamed: 0,city,Arts & Entertainment,College & University,Food,Nightlife Spot,Outdoors & Recreation,Shop & Service,Class
0,"Amsterdam, Netherlands",0.068851,0.014102,0.23849,0.088345,0.063874,0.194525,3
1,"Antalya, Turkey",0.051909,0.012441,0.126984,0.028314,0.066495,0.345774,0
2,"Bangkok, Thailand",0.010443,0.020886,0.340852,0.032999,0.061404,0.273601,4
3,"Barcelona, Spain",0.052873,0.016214,0.297497,0.089884,0.076489,0.200564,4
4,"Beijing, China",0.163474,0.009354,0.381292,0.02049,0.068151,0.099332,2


**Now let's see what the clusters are and how we can interpret them**

In [63]:
# One row per cluster: the centre's category ratios plus the cluster size.
centers = pd.DataFrame(kmeans.cluster_centers_, columns=city_clustering.columns)
# Count members from the model's own labels rather than df_cities["Class"],
# which stops holding cluster ids once it is replaced by label names.
centers["# of cities"] = np.bincount(kmeans.labels_, minlength=len(centers))
centers

Unnamed: 0,Arts & Entertainment,College & University,Food,Nightlife Spot,Outdoors & Recreation,Shop & Service,# of cities
0,0.071743,0.037389,0.170973,0.047909,0.054162,0.308551,9
1,0.034909,0.008797,0.490592,0.036885,0.041085,0.159237,7
2,0.106579,0.017596,0.312427,0.048574,0.08675,0.14105,5
3,0.058282,0.036757,0.199926,0.050758,0.076793,0.18954,14
4,0.036701,0.014771,0.319613,0.063608,0.055729,0.23925,5


In [64]:
# Display the full city table, including the "Class" column.
df_cities

Unnamed: 0,city,Arts & Entertainment,College & University,Food,Nightlife Spot,Outdoors & Recreation,Shop & Service,Class
0,"Amsterdam, Netherlands",0.068851,0.014102,0.23849,0.088345,0.063874,0.194525,3
1,"Antalya, Turkey",0.051909,0.012441,0.126984,0.028314,0.066495,0.345774,0
2,"Bangkok, Thailand",0.010443,0.020886,0.340852,0.032999,0.061404,0.273601,4
3,"Barcelona, Spain",0.052873,0.016214,0.297497,0.089884,0.076489,0.200564,4
4,"Beijing, China",0.163474,0.009354,0.381292,0.02049,0.068151,0.099332,2
5,"Berlin, Germany",0.059406,0.048605,0.172367,0.039604,0.032403,0.225473,3
6,"Budapest, Hungary",0.070539,0.021501,0.158431,0.061486,0.107507,0.174651,3
7,"Dubai, United Arab Emirates",0.015744,0.022901,0.214218,0.031966,0.186069,0.240935,3
8,"Florence, Italy",0.079353,0.029661,0.340524,0.072419,0.070108,0.179507,2
9,"Guangzhou, China",0.045695,0.010101,0.521886,0.013949,0.029822,0.149591,1


**Assign label to cluster _based on particular numbers we see_**

In [65]:
# (column, label) rules in priority order: each label goes to the cluster with
# the highest value in that column among clusters that have no label yet.
# Building the dict from idxmax() keys directly loses a label whenever two
# rules pick the same cluster, and the lookup below then fails.
LABEL_RULES = [
    ("Food",                 "Eat"),
    ("Nightlife Spot",       "Dance"),
    ("# of cities",          "Balanced"),
    ("Shop & Service",       "Shop"),
    ("Arts & Entertainment", "Look"),
]

class_names = {}
for column, label in LABEL_RULES:
    # Stable sort keeps the lower cluster id first on ties, like idxmax().
    ranked_clusters = centers[column].sort_values(ascending=False, kind="mergesort").index
    cluster = next((c for c in ranked_clusters if c not in class_names), None)
    if cluster is not None:
        class_names[cluster] = label

# Clusters beyond the number of rules get a generic fallback name.
centers["Label"] = centers.index.map(lambda x: class_names.get(x, "Cluster {}".format(x)))
centers

Unnamed: 0,Arts & Entertainment,College & University,Food,Nightlife Spot,Outdoors & Recreation,Shop & Service,# of cities,Label
0,0.071743,0.037389,0.170973,0.047909,0.054162,0.308551,9,Shop
1,0.034909,0.008797,0.490592,0.036885,0.041085,0.159237,7,Eat
2,0.106579,0.017596,0.312427,0.048574,0.08675,0.14105,5,Look
3,0.058282,0.036757,0.199926,0.050758,0.076793,0.18954,14,Balanced
4,0.036701,0.014771,0.319613,0.063608,0.055729,0.23925,5,Dance


**Here is our classification**

In [66]:
# Replace numeric cluster ids by their label. Values that already are labels
# are kept, so re-running this cell does not fail.
known_labels = set(class_names.values())
df_cities["Class"] = df_cities["Class"].map(
    lambda x: x if x in known_labels else class_names[x])
df_cities

Unnamed: 0,city,Arts & Entertainment,College & University,Food,Nightlife Spot,Outdoors & Recreation,Shop & Service,Class
0,"Amsterdam, Netherlands",0.068851,0.014102,0.23849,0.088345,0.063874,0.194525,Balanced
1,"Antalya, Turkey",0.051909,0.012441,0.126984,0.028314,0.066495,0.345774,Shop
2,"Bangkok, Thailand",0.010443,0.020886,0.340852,0.032999,0.061404,0.273601,Dance
3,"Barcelona, Spain",0.052873,0.016214,0.297497,0.089884,0.076489,0.200564,Dance
4,"Beijing, China",0.163474,0.009354,0.381292,0.02049,0.068151,0.099332,Look
5,"Berlin, Germany",0.059406,0.048605,0.172367,0.039604,0.032403,0.225473,Balanced
6,"Budapest, Hungary",0.070539,0.021501,0.158431,0.061486,0.107507,0.174651,Balanced
7,"Dubai, United Arab Emirates",0.015744,0.022901,0.214218,0.031966,0.186069,0.240935,Balanced
8,"Florence, Italy",0.079353,0.029661,0.340524,0.072419,0.070108,0.179507,Look
9,"Guangzhou, China",0.045695,0.010101,0.521886,0.013949,0.029822,0.149591,Eat


In [69]:
# History of class assignments across clustering runs, kept in hist.csv.
HIST_CSV = "hist.csv"
try:
    # Continue from the saved history so the cell does not depend on a
    # df_hist variable left over from an earlier kernel session.
    df_hist = pd.read_csv(HIST_CSV, sep="|")
except FileNotFoundError:
    # First run: start the history with the city names only.
    df_hist = df_cities[["city"]].copy()

# Rows are matched by index, i.e. both frames must list the cities in the
# same order.
df_hist["Class5"] = df_cities["Class"]
df_hist.to_csv(HIST_CSV, sep="|", index=False)
df_hist = pd.read_csv(HIST_CSV, sep="|")
df_hist

Unnamed: 0,city,Class,Class1,Class2,Class3,Class4,Class5
0,"Amsterdam, Netherlands",Dance,Balanced,Dance,Dance,Look,Balanced
1,"Antalya, Turkey",Shop,Shop,Shop,Shop,Look,Shop
2,"Bangkok, Thailand",Shop,Balanced,Balanced,Balanced,Balanced,Dance
3,"Barcelona, Spain",Shop,Balanced,Balanced,Balanced,Dance,Dance
4,"Beijing, China",Look,Look,Look,Look,Eat,Look
5,"Berlin, Germany",Shop,Shop,Shop,Shop,Balanced,Balanced
6,"Budapest, Hungary",Look,Look,Look,Look,Shop,Balanced
7,"Dubai, United Arab Emirates",Shop,Shop,Shop,Shop,Look,Balanced
8,"Florence, Italy",Eat,Balanced,Balanced,Balanced,Look,Look
9,"Guangzhou, China",Eat,Eat,Eat,Eat,Eat,Eat


In [500]:
# Save the city table (category ratios and class) to a "|"-separated file.
df_cities.to_csv("cities4.csv", sep="|", index=False)

In [84]:
# List the cities of every class. City names contain ", " themselves
# ("Hong Kong, China"), so the entries are separated with "; ".
for class_name in class_names.values():
    print(class_name)
    members = df_cities.loc[df_cities["Class"] == class_name, "city"]
    print("; ".join(members))
    

Eat
Guangzhou, China, Ho Chi Minh City, Vietnam, Macau, Seoul, South Korea, Shanghai, China, Shenzhen, China, Taipei, Taiwan
Dance
Bangkok, Thailand, Barcelona, Spain, Hong Kong, China, Pattaya, Thailand, Singapore
Balanced
Amsterdam, Netherlands, Berlin, Germany, Budapest, Hungary, Dubai, United Arab Emirates, Johannesburg, South Africa, Kuala Lumpur, Malaysia, Lima, Peru, London, United Kingdom, Los Angeles, USA, Mecca, Saudi Arabia, Miami, USA, Orlando, USA, Phuket, Thailand, Vienna, Austria
Shop
Antalya, Turkey, Istanbul, Turkey, Las Vegas, USA, Milan, Italy, Moscow, Russia, New York City, USA, Paris, France, Prague, Czech Republic, Sofia, Bulgaria
Look
Beijing, China, Florence, Italy, Rome, Italy, Tokyo, Japan, Venice, Italy


In [83]:
# NOTE(review): prints the literal '1' and nothing else - looks like a
# leftover debugging cell; confirm and remove.
print('1')

1


# Find "concentration points" for food/entertainment/etc venues


In [70]:
def find_centers(city, category, df_venues, venues_per_cluster=5, min_cluster_size=5, top_n=3):
    """Find the densest spots of one venue category inside one city.

    Venues of `category` in `city` are clustered on (lat, lng) with KMeans,
    using one cluster per `venues_per_cluster` venues. Clusters with fewer
    than `min_cluster_size` venues are discarded, and the `top_n` largest of
    the rest are returned.

    Parameters:
        city: value to match in the "city" column.
        category: value to match in the "top_category" column.
        df_venues: DataFrame with "city", "top_category", "lat", "lng" columns.
        venues_per_cluster, min_cluster_size, top_n: tuning knobs; the
            defaults are the values that used to be hard-coded.

    Returns a DataFrame with columns "lat", "lng" (cluster centre) and "Cnt"
    (venues in the cluster), sorted by "Cnt" descending. It is empty when
    there are too few venues to form a single cluster.
    """
    in_scope = (df_venues.city == city) & (df_venues.top_category == category)
    # Copy so that adding the "Class" column never touches the caller's frame.
    df_to_cluster = df_venues.loc[in_scope, ["lat", "lng"]].copy()

    n_clusters = len(df_to_cluster) // venues_per_cluster
    if n_clusters < 1:
        # KMeans rejects n_clusters=0; report "no concentration points".
        return pd.DataFrame(columns=["lat", "lng", "Cnt"])

    kmeans = KMeans(n_clusters=n_clusters, random_state=0, n_init=10).fit(df_to_cluster)
    df_to_cluster["Class"] = kmeans.labels_

    df_clusters = pd.DataFrame(kmeans.cluster_centers_, columns=["lat", "lng"])
    # Row i of df_clusters is cluster i, so the per-label counts align by index.
    df_clusters["Cnt"] = df_to_cluster.groupby("Class")["lat"].count()
    df_clusters = df_clusters[df_clusters.Cnt >= min_cluster_size]
    return df_clusters.sort_values("Cnt", ascending=False).head(top_n)

In [90]:
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
# set color scheme for the clusters: one colour per venue category shown on the map
cs = ["Arts & Entertainment", "Food", "Shop & Service", "Nightlife Spot"]
k = len(cs)
# k evenly spaced samples of the rainbow colormap, converted to hex strings.
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(i) for i in colors_array]

In [91]:
# Choose the city to inspect by its position in `cities` and compute the
# concentration points of one sample category for it.
city_number = 25
city_name, city_location = cities[city_number]
category = "Shop & Service"
df_centers = find_centers(city_name, category, df_venues)

In [94]:
# Map of the selected city with the main concentration points of every
# category in `cs`, one colour per category.
city_map = folium.Map(location=[city_location.latitude, city_location.longitude], zoom_start=14)
for i, category in enumerate(cs):
    color = rainbow[i]
    df_centers = find_centers(city_name, category, df_venues)
    for center in df_centers.itertuples():
        label = '{} - {} venues'.format(category, center.Cnt)
        folium.CircleMarker(
            [center.lat, center.lng],
            radius=8,
            # Attach the Popup object itself; parse_html=True escapes
            # characters such as "&" that occur in the category names.
            popup=folium.Popup(label, parse_html=True),
            color='black',
            fill=True,
            fill_color=color,
            fill_opacity=0.7).add_to(city_map)

city_map

In [74]:
import matplotlib.cm as cm
import matplotlib.colors as colors



# Map of every "Food" venue of the selected city, coloured by its KMeans cluster.
FOOD_CATEGORY = "Food"
food_mask = (df_venues.city == city_name) & (df_venues.top_category == FOOD_CATEGORY)
# Copy so that adding the "Class" column does not write into a slice of df_venues.
df_to_cluster = df_venues.loc[food_mask, ["lat", "lng"]].copy()

# Separate names keep this cell from overwriting the city-level `kmeans`
# model and the per-category `rainbow` palette used by earlier cells.
food_kmeans = KMeans(n_clusters=max(1, len(df_to_cluster) // 5), random_state=0, n_init=10).fit(df_to_cluster)
df_to_cluster["Class"] = food_kmeans.labels_

# set color scheme for the clusters: one colour per cluster id
k = df_to_cluster.Class.max() + 1
colors_array = cm.rainbow(np.linspace(0, 1, k))
cluster_colors = [colors.rgb2hex(i) for i in colors_array]

city_map = folium.Map(location=[city_location.latitude, city_location.longitude], zoom_start=12)
for venue in df_to_cluster.itertuples():
    # The number shown is the cluster id, not a venue count.
    label = '{} - cluster {}'.format(FOOD_CATEGORY, venue.Class)
    marker_color = cluster_colors[venue.Class]
    folium.CircleMarker(
        [venue.lat, venue.lng],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(city_map)

city_map
    

In [489]:
# Number of "Food" venues recorded for the selected city.
is_city_food = (df_venues.city == city_name) & (df_venues.top_category == "Food")
df_to_cluster = df_venues.loc[is_city_food, ["lat", "lng"]]
len(df_to_cluster)

200

0. Посчитать дельту широты и долготы на один километр
0. Задавать координаты квадратом
0. Сканировать несколько квадратов для одного города
0. Сохранять результаты в csv и читать их оттуда
0. Список интересных городов и их координат
0. Кластеризовать отдельные категории в одном городе и изобразить центры кластеров
0. One-hot encoding для категорий точек
0. кластеризовать города