In [1]:
# First we import some of the libraries that we need to complete the assignment

import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis

!conda install -c conda-forge geopy --yes #Install Geopy which will be used in the second part of the assignment
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules for plotting purposes
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage to perform clustering ML
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes #Install Folium to visualise our clusters
import folium # map rendering library

# BeautifulSoup WebScraper Installation
!conda install -c conda-forge beautifulsoup4 --yes

# BeautifulSoup WebScraper
from bs4 import BeautifulSoup

print('Libraries imported.')

Solving environment: done

## Package Plan ##

  environment location: /home/jupyterlab/conda

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geopy-1.18.1               |             py_0          51 KB  conda-forge
    openssl-1.0.2p             |       h470a237_2         3.1 MB  conda-forge
    geographiclib-1.49         |             py_0          32 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         3.2 MB

The following NEW packages will be INSTALLED:

    geographiclib: 1.49-py_0         conda-forge
    geopy:         1.18.1-py_0       conda-forge

The following packages will be UPDATED:

    openssl:       1.0.2p-h470a237_1 conda-forge --> 1.0.2p-h470a237_2 conda-forge


Downloading and Extracting Packages
geopy-1.18.1         | 51 KB     | #############

  'The soupsieve package is not installed. CSS selectors cannot be used.'


In [2]:
# Setting up cities of interest

For this project I will be using 

## Scraping website for top 150 cities in the world in terms of population 

In [3]:
page = requests.get("https://www.worldatlas.com/citypops.htm").text
soup = BeautifulSoup(page, "lxml")

In [4]:
table = soup.find('table')
table_rows = table.find_all('tr')

In [5]:
rows = []
for tr in table_rows:
    td = tr.find_all('td')
    row = [tr.text for tr in td]
    rows.append(row)
top_cities_df=pd.DataFrame(rows, columns=["Rank","City","Country", "Population"])


In [6]:
top_cities_df.head()

Unnamed: 0,Rank,City,Country,Population
0,,,,
1,1.0,Tokyo,Japan,38001000.0
2,2.0,Delhi,India,25703168.0
3,3.0,Shanghai,China,23740778.0
4,4.0,Sao Paulo,Brazil,21066245.0


In [7]:
#Grab latitudes and longitudes of cities

In [9]:
#geolocations_df = pd.DataFrame(columns=["City","Latitude", "Longitude"])

geolocations = []

for i in top_cities_df['City']:
    
    geolocator = Nominatim()
    location = geolocator.geocode(i)
    latitude = location.latitude
    longitude = location.longitude
    lst=[i, latitude, longitude]
    
    geolocations.append(lst)

  import sys


In [10]:
geolocations_df = pd.DataFrame(columns=["City","Latitude", "Longitude"])

In [11]:
geolocations_df = geolocations_df.append(pd.DataFrame(geolocations, columns=['City','Latitude','Longitude']),ignore_index=True)

In [12]:
geolocations_df.head()

Unnamed: 0,City,Latitude,Longitude
0,,44.933143,7.540121
1,Tokyo,35.682839,139.759455
2,Delhi,28.651718,77.221939
3,Shanghai,31.225344,121.488892
4,Sao Paulo,-23.550651,-46.633382


In [13]:
#Join cities with loaded coordinates
geolocations_df=top_cities_df.join(geolocations_df.set_index('City'), on='City')

In [14]:
#Clean up population feature
geolocations_df['Population'] = geolocations_df.Population.str.replace(",", "")

In [15]:
geolocations_df.head()

Unnamed: 0,Rank,City,Country,Population,Latitude,Longitude
0,,,,,44.933143,7.540121
1,1.0,Tokyo,Japan,38001000.0,35.682839,139.759455
2,2.0,Delhi,India,25703168.0,28.651718,77.221939
3,3.0,Shanghai,China,23740778.0,31.225344,121.488892
4,4.0,Sao Paulo,Brazil,21066245.0,-23.550651,-46.633382


In [16]:
geolocations_df['Population']=pd.to_numeric(geolocations_df['Population'])

In [17]:
geolocations_df.dtypes

Rank           object
City           object
Country        object
Population    float64
Latitude      float64
Longitude     float64
dtype: object

In [18]:
#Convert population into million
geolocations_df['Population'] = geolocations_df['Population'].div(1000000).round(0)

In [19]:
#Rename header to reflect new unit of measure
geolocations_df.rename(columns={'Population': 'Population (Million)'}, inplace=True)

In [20]:
geolocations_df.head()

Unnamed: 0,Rank,City,Country,Population (Million),Latitude,Longitude
0,,,,,44.933143,7.540121
1,1.0,Tokyo,Japan,38.0,35.682839,139.759455
2,2.0,Delhi,India,26.0,28.651718,77.221939
3,3.0,Shanghai,China,24.0,31.225344,121.488892
4,4.0,Sao Paulo,Brazil,21.0,-23.550651,-46.633382


# Assignment Part 2 - Extracting Venues and attaching them to our cities

In [21]:
# Use credentials. Limit of returns set to 100

LIMIT = 100 #Top 10000 results from each city to make sure we cover all venues available

CLIENT_ID = '0VUEBCOET3CHPC5OZSMHTHQGN35HPUS45FMLXWYOE21APWQ2' # your Foursquare ID
CLIENT_SECRET = 'OWFRF4STOAY32AO5JBKZNSYULSPB0MOEYLPDLJZ5L20CUCN3' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 0VUEBCOET3CHPC5OZSMHTHQGN35HPUS45FMLXWYOE21APWQ2
CLIENT_SECRET:OWFRF4STOAY32AO5JBKZNSYULSPB0MOEYLPDLJZ5L20CUCN3


In [22]:
#Create longitude, latitude and name variables
neighbourhood_latitude = geolocations_df.loc[0, 'Latitude'] # City latitude value
neighbourhood_longitude = geolocations_df.loc[0, 'Longitude'] # City longitude value
city_name = geolocations_df.loc[0, 'City'] # City Name

In [23]:
# create map of world using latitude and longitude values of cities to visualise the spread of our sample
map_world = folium.Map(location=[latitude, longitude], zoom_start=2)

# add returned markers to map
for lat, lng, city in zip(geolocations_df['Latitude'], geolocations_df['Longitude'], geolocations_df['City']):
    label = '{}'.format(city)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_world)  
    
map_world

In [25]:
#Declare a function that sends a request to the Foursquare API, returning venues using the parameters specified 
def getNearbyVenues(names, latitudes, longitudes, radius=50000):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL. We feed in parameters defined earlier in the notebook.
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius,
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['City', 
                  'City Latitude', 
                  'City Latitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [27]:
# Get world venues using venues function

city_venues = getNearbyVenues(names=geolocations_df['City'],
                                   latitudes=geolocations_df['Latitude'],
                                   longitudes=geolocations_df['Longitude']
                                  )

None
Tokyo
Delhi
Shanghai
Sao Paulo
Mumbai
Mexico City 
Beijing
Osaka 
Cairo 
New York
Dhaka
Karachi
Buenos Aires
Kolkata
Istanbul
Chongqing
Lagos
Manila
Rio de Janeiro
Guangzhou
Los Angeles
Moscow 
Kinshasa
Tianjin
Paris
Shenzhen
Jakarta
London
Bangalore
Lima
Chennai 
Seoul
Bogotá
Nagoya 
Johannesburg
Bangkok
Hyderabad
Chicago
Lahore
Tehran
Wuhan
Chengdu
Dongguan
Nanjing
Ahmadabad
Hong Kong
Ho Chi Minh City 
Foshan
Kuala Lumpur
Baghdad
Santiago
Hangzhou
Riyadh 
Shenyang
Madrid
Xi'an
Toronto
Miami
Pune  
Belo Horizonte
Dallas
Surat
Houston
Singapore
Philadelphia
Kitakyushu
Luanda
Suzhou
Haerbin
Barcelona
Atlanta
Khartoum 
Dar es Salaam
Saint Petersburg 
Washington, D.C.
Abidjan
Guadalajara
Yangon
Alexandria 
Ankara
Kabul
Qingdao
Chittagong
Monterrey
Sydney
Dalian
Xiamen
Zhengzhou
Boston
Melbourne
Brasília
Jiddah
Phoenix
Ji'nan 
Montréal
Shantou
Nairobi
Medellín
Fortaleza
Kunming
Changchun
Changsha
Recife
Rome
Zhongshan
Cape Town
Detroit
Hanoi 
Tel Aviv 
Porto Alegre
Kano
Salvador
Faisa

# Explorative Analysis & Final Data Set Assembly

In [189]:
#Shape of city venues dataframe
city_venues=city_venues[city_venues.City.notnull()]
city_venues.shape

(12213, 7)

In [190]:
#Inspect city venues dataframe
city_venues.head()

Unnamed: 0,City,City Latitude,City Latitude.1,Venue,Venue Latitude,Venue Longitude,Venue Category
100,Tokyo,35.682839,139.759455,Palace Hotel Tokyo (パレスホテル東京),35.684644,139.761302,Hotel
101,Tokyo,35.682839,139.759455,Aman Tokyo (アマン東京),35.685515,139.765384,Hotel
102,Tokyo,35.682839,139.759455,Tsujihan (つじ半),35.680763,139.771563,Donburi Restaurant
103,Tokyo,35.682839,139.759455,Indian Curry (インデアンカレー),35.678395,139.765008,Japanese Curry Restaurant
104,Tokyo,35.682839,139.759455,KITTE Garden (屋上庭園 KITTEガーデン),35.679654,139.765169,Garden


In [188]:
city_venues.describe()

Unnamed: 0,City Latitude,City Latitude.1,Venue Latitude,Venue Longitude
count,12213.0,12213.0,12213.0,12213.0
mean,22.089086,27.78239,22.086862,27.780019
std,21.927362,81.957267,21.929287,81.964629
min,-37.814218,-122.419236,-37.832818,-122.449363
25%,13.800038,-51.230377,13.826522,-51.211309
50%,28.197948,34.780527,28.175516,34.773508
75%,36.297494,106.827183,36.305123,106.809673
max,59.938732,151.216454,59.96299,151.25113


In [216]:
city_venues.describe(include=['object'])

Unnamed: 0,City,Venue,Venue Category
count,12213,12213,12213
unique,150,10965,511
top,Tel Aviv,Starbucks (星巴克),Hotel
freq,100,186,1046


In [191]:
#Check number of venues per city. Expected is maximum 100 per city due to the Foursquare API limitation per call.
mylist=city_venues['City'].value_counts()
print(mylist)

Tel Aviv             100
Chennai              100
Recife               100
Puebla               100
Karachi              100
Detroit              100
Kolkata              100
Yangon               100
San Francisco        100
Kano                 100
Ankara               100
Medellín             100
Los Angeles          100
Santiago             100
Rome                 100
Casablanca           100
Singapore            100
Sydney               100
Bangkok              100
Johannesburg         100
Fortaleza            100
Chicago              100
Paris                100
Hyderabad            100
Ho Chi Minh City     100
Santo Domingo        100
Seattle              100
Barcelona            100
Miami                100
Hong Kong            100
                    ... 
Changsha              47
Abidjan               47
Kunming               46
Wuhan                 45
Guadalajara           41
Addis Ababa           40
Haerbin               39
Yaounde               39
Douala                38


In [233]:
#Check Frequency of venues across our whole data set
city_venues['Venue Category'].value_counts()

Hotel                       1046
Coffee Shop                  633
Café                         450
Park                         380
Shopping Mall                333
Italian Restaurant           268
Ice Cream Shop               252
Bakery                       240
Restaurant                   234
Pizza Place                  223
Bar                          183
Indian Restaurant            173
Historic Site                162
Bookstore                    156
Fast Food Restaurant         153
Plaza                        151
Seafood Restaurant           144
Theater                      143
Japanese Restaurant          126
Steakhouse                   123
Chinese Restaurant           117
History Museum               115
Burger Joint                 115
Dessert Shop                 111
BBQ Joint                    100
Art Museum                    99
Sandwich Place                99
French Restaurant             99
Art Gallery                   97
Scenic Lookout                95
          

In [165]:
#Check Frequency of venues across our whole data set
city_venues_grouped_df=(city_venues.groupby(['City', 'Venue Category']).size().rename('Count by city and venue category').reset_index())

Unnamed: 0,City,Venue Category,Count by city and venue category
0,Abidjan,African Restaurant,4
1,Abidjan,American Restaurant,1
2,Abidjan,BBQ Joint,1
3,Abidjan,Bakery,2
4,Abidjan,Bar,1
5,Abidjan,Basketball Stadium,1
6,Abidjan,Beach,1
7,Abidjan,Boutique,1
8,Abidjan,Brewery,1
9,Abidjan,Café,1


In [32]:
city_venues.shape

(12313, 7)

In [36]:
# one hot encoding for dummies
cities_onehot = pd.get_dummies(city_venues[['Venue Category']], prefix="", prefix_sep="")
cities_onehot.head()

Unnamed: 0,Accessories Store,Adult Boutique,Afghan Restaurant,African Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Wings Joint,Women's Store,Xinjiang Restaurant,Yakitori Restaurant,Yoga Studio,Yoshoku Restaurant,Yunnan Restaurant,Zhejiang Restaurant,Zoo,Zoo Exhibit
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [37]:
#Add City column back to dataframe
cities_onehot['City'] = city_venues['City'] 

# Move neighborhood column to the first column
fixed_columns = [cities_onehot.columns[-1]] + list(cities_onehot.columns[:-1])
cities_onehot = cities_onehot[fixed_columns]

In [38]:
#Group entries against city values
cities_grouped = cities_onehot.groupby('City').sum().reset_index()

In [54]:
cities_grouped.head()

Unnamed: 0,City,Zoo Exhibit,Accessories Store,Adult Boutique,Afghan Restaurant,African Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,...,Winery,Wings Joint,Women's Store,Xinjiang Restaurant,Yakitori Restaurant,Yoga Studio,Yoshoku Restaurant,Yunnan Restaurant,Zhejiang Restaurant,Zoo
0,Abidjan,0,0,0,0,4,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Addis Ababa,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Ahmadabad,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Aleppo,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Alexandria,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#Check top 10 venues for each city
num_top_venues = 10

for hood in cities_grouped['City']:
    print("----"+City+"----")
    temp = cities_grouped[cities_grouped['City'] == City].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

In [40]:
#Define function that returns most common venues
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [194]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['City']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
cities_venues_sorted = pd.DataFrame(columns=columns)
cities_venues_sorted['City'] = cities_grouped['City']

for ind in np.arange(cities_grouped.shape[0]):
    cities_venues_sorted.iloc[ind, 1:] = return_most_common_venues(cities_grouped.iloc[ind, :], num_top_venues)

In [195]:
cities_venues_sorted

Unnamed: 0,City,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Abidjan,Nightclub,Ice Cream Shop,African Restaurant,Shopping Mall,Hotel,Italian Restaurant,Bakery,Lounge,Pizza Place,Café
1,Addis Ababa,Hotel,Italian Restaurant,Ethiopian Restaurant,Restaurant,Pizza Place,Nightclub,Turkish Restaurant,Coffee Shop,Greek Restaurant,Grocery Store
2,Ahmadabad,Café,Indian Restaurant,Fast Food Restaurant,Hotel,Restaurant,Coffee Shop,Tea Room,Multiplex,Dessert Shop,Bakery
3,Aleppo,Mountain,RV Park,Market,General College & University,Forest,Café,Moving Target,Fast Food Restaurant,Field,Fish & Chips Shop
4,Alexandria,Coffee Shop,Café,Restaurant,Seafood Restaurant,Hotel,Juice Bar,Bakery,Sandwich Place,Bar,Historic Site
5,Ankara,History Museum,Theater,Dance Studio,Art Gallery,Seafood Restaurant,Café,Bookstore,Historic Site,Pizza Place,Dessert Shop
6,Athens,Bar,Mexican Restaurant,Pizza Place,Grocery Store,Music Venue,Liquor Store,Coffee Shop,New American Restaurant,Fast Food Restaurant,American Restaurant
7,Atlanta,Trail,Park,Brewery,Italian Restaurant,Southern / Soul Food Restaurant,Pizza Place,Ice Cream Shop,Mexican Restaurant,American Restaurant,Bar
8,Baghdad,Café,Shopping Mall,Hotel,Fast Food Restaurant,Middle Eastern Restaurant,Ice Cream Shop,Bakery,Burger Joint,Fried Chicken Joint,Coffee Shop
9,Bangalore,Hotel,Indian Restaurant,Pub,Ice Cream Shop,Lounge,Breakfast Spot,Bakery,Café,Burger Joint,Brewery


# Machine Learning - Clustering


In [196]:
# set number of clusters
kclusters = 5

cities_grouped_clustering = cities_grouped.drop('City', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(cities_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([2, 2, 0, 2, 4, 1, 1, 1, 4, 0, 3, 1, 3, 1, 1, 4, 1, 2, 1, 2, 4, 4,
       4, 4, 4, 2, 3, 3, 0, 1, 2, 3, 4, 2, 3, 1, 2, 0, 1, 4, 3, 2, 2, 4,
       3, 2, 2, 3, 4, 2, 3, 3, 2, 3, 2, 1, 0, 2, 1, 1, 3, 3, 2, 4, 3, 2,
       2, 2, 4, 4, 1, 2, 2, 0, 2, 2, 2, 4, 4, 3, 1, 4, 0, 4, 4, 4, 4, 4,
       1, 1, 1, 1, 4, 1, 0, 2, 4, 2, 2, 1, 3, 2, 1, 1, 1, 1, 1, 0, 2, 4,
       1, 4, 1, 1, 4, 1, 1, 2, 1, 4, 1, 1, 1, 3, 2, 3, 3, 2, 4, 3, 0, 3,
       4, 2, 1, 4, 3, 2, 4, 2, 1, 2, 2, 3, 3, 3, 3, 2, 2, 3], dtype=int32)

In [197]:
cities_merged_df = geolocations_df[0:150]
cities_merged_df
#add clustering labels
cities_merged_df['Cluster Labels'] = kmeans.labels_

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [198]:
# merge cities grouped with cities df to add latitude/longitude for each city
cities_merged_df = cities_merged_df.join(cities_venues_sorted.set_index('City'), on='City')
cities_merged_df.head() # check the last columns!

Unnamed: 0,Rank,City,Country,Population (Million),Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,,,,,44.933143,7.540121,2,,,,,,,,,,
1,1.0,Tokyo,Japan,38.0,35.682839,139.759455,2,Hotel,Tonkatsu Restaurant,BBQ Joint,Coffee Shop,Japanese Curry Restaurant,Art Museum,Wagashi Place,Japanese Restaurant,Garden,Concert Hall
2,2.0,Delhi,India,26.0,28.651718,77.221939,0,Indian Restaurant,Hotel,Café,Restaurant,Italian Restaurant,Monument / Landmark,Shopping Mall,Mediterranean Restaurant,Market,Bar
3,3.0,Shanghai,China,24.0,31.225344,121.488892,2,Hotel,Dumpling Restaurant,Lounge,Hotel Bar,Coffee Shop,Scenic Lookout,Chinese Restaurant,Italian Restaurant,Café,Spa
4,4.0,Sao Paulo,Brazil,21.0,-23.550651,-46.633382,4,Brazilian Restaurant,Pizza Place,Ice Cream Shop,Theater,Bookstore,Park,Cultural Center,Bar,Art Museum,Restaurant


In [199]:
# create map with colors for the different clusters
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(cities_merged_df['Latitude'], cities_merged_df['Longitude'], cities_merged_df['City'], cities_merged_df['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [200]:
cluster_1_df=cities_merged_df.loc[cities_merged_df['Cluster Labels'] == 0, cities_merged_df.columns[[1] + list(range(5, cities_merged_df.shape[1]))]]
cluster_1_df


Unnamed: 0,City,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,Delhi,77.221939,0,Indian Restaurant,Hotel,Café,Restaurant,Italian Restaurant,Monument / Landmark,Shopping Mall,Mediterranean Restaurant,Market,Bar
9,Cairo,31.243666,0,Hotel,Café,Historic Site,Italian Restaurant,Bakery,Sushi Restaurant,Lounge,Supermarket,Performing Arts Venue,Lebanese Restaurant
28,London,-0.127647,0,Hotel,Theater,Steakhouse,Park,Art Museum,Bookstore,Cocktail Bar,Boutique,Art Gallery,Plaza
37,Hyderabad,78.474629,0,Indian Restaurant,Bakery,Ice Cream Shop,Hotel,Coffee Shop,Diner,Café,BBQ Joint,Shopping Mall,Restaurant
56,Xi'an,108.942425,0,Hotel,Historic Site,History Museum,Coffee Shop,Chinese Restaurant,Fast Food Restaurant,Shopping Mall,Hostel,Plaza,Street Food Gathering
73,Dar es Salaam,39.29764,0,Hotel,Resort,African Restaurant,Restaurant,Bar,Café,Seafood Restaurant,Beach,Bakery,Ice Cream Shop
82,Qingdao,120.349719,0,Hotel,Shopping Mall,Coffee Shop,Beach,Department Store,Bar,Aquarium,Café,Museum,Harbor / Marina
94,Ji'nan,117.114004,0,Coffee Shop,Hotel,Shopping Mall,Park,Department Store,Trail,Train Station,Mountain,Plaza,Fast Food Restaurant
107,Detroit,-83.04664,0,Park,Bakery,Coffee Shop,Steakhouse,Diner,Italian Restaurant,Pizza Place,Brewery,History Museum,Farmers Market
130,Lucknow,80.9346,0,Indian Restaurant,Fast Food Restaurant,Hotel,Shopping Mall,Café,Pizza Place,Bakery,Ice Cream Shop,Multiplex,Market


In [201]:
cluster_venues_1_df=cluster_1_df.drop(['City', 'Longitude', 'Cluster Labels'], axis=1)
cluster_venues_1_df
result_1 = cluster_venues_1_df.apply(pd.value_counts).fillna(0)
result_1 = result_1.sum(axis=1) 
count_cities_cluster_1=len(cluster_1_df)
result_1 = result_1.divide(count_cities_cluster_1)
result_1.sort_values(axis=0, ascending=False, inplace=True, na_position='last')
result_1

Hotel                       0.9
Café                        0.6
Shopping Mall               0.6
Bakery                      0.5
Coffee Shop                 0.5
Ice Cream Shop              0.3
Plaza                       0.3
Italian Restaurant          0.3
Indian Restaurant           0.3
Fast Food Restaurant        0.3
Park                        0.3
Bar                         0.3
Restaurant                  0.3
Pizza Place                 0.2
Beach                       0.2
Steakhouse                  0.2
Department Store            0.2
Diner                       0.2
History Museum              0.2
Historic Site               0.2
Market                      0.2
Bookstore                   0.1
Cocktail Bar                0.1
Chinese Restaurant          0.1
Brewery                     0.1
Boutique                    0.1
BBQ Joint                   0.1
Art Museum                  0.1
Art Gallery                 0.1
Aquarium                    0.1
Farmers Market              0.1
Train St

In [202]:
cluster_2_df=cities_merged_df.loc[cities_merged_df['Cluster Labels'] == 1, cities_merged_df.columns[[1] + list(range(5, cities_merged_df.shape[1]))]]
cluster_2_df

Unnamed: 0,City,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
5,Mumbai,72.835335,1,Indian Restaurant,Café,Hotel,Scenic Lookout,Lounge,Restaurant,Deli / Bodega,Ice Cream Shop,Chinese Restaurant,Bakery
6,Mexico City,-99.133342,1,Ice Cream Shop,Art Museum,Mexican Restaurant,Hotel,Park,Museum,Seafood Restaurant,Concert Hall,History Museum,Coffee Shop
7,Beijing,116.391276,1,Historic Site,Hotel,Park,Chinese Restaurant,Dumpling Restaurant,Brewery,Shopping Mall,Yunnan Restaurant,Peking Duck Restaurant,Hotpot Restaurant
11,Dhaka,90.378814,1,Coffee Shop,Café,Nightclub,Pizza Place,Restaurant,Hotel,Market,Plaza,Shopping Mall,Resort
13,Buenos Aires,-58.437076,1,Ice Cream Shop,Hotel,Argentinian Restaurant,Garden,Speakeasy,Pizza Place,Gym / Fitness Center,Cocktail Bar,Bookstore,Japanese Restaurant
14,Kolkata,88.347602,1,Hotel,Café,Shopping Mall,Mughlai Restaurant,Bakery,Dhaba,Chinese Restaurant,Indian Sweet Shop,Indian Restaurant,Lounge
16,Chongqing,106.549282,1,Hotel,Coffee Shop,Shopping Mall,Fast Food Restaurant,Electronics Store,Historic Site,Public Art,Hostel,Pedestrian Plaza,German Restaurant
18,Manila,120.97997,1,Hotel,Japanese Restaurant,Coffee Shop,Shopping Mall,Filipino Restaurant,Café,Restaurant,Clothing Store,Church,Bakery
29,Bangalore,77.5913,1,Hotel,Indian Restaurant,Pub,Ice Cream Shop,Lounge,Breakfast Spot,Bakery,Café,Burger Joint,Brewery
35,Johannesburg,28.049722,1,Hotel,Coffee Shop,Shopping Mall,Pizza Place,Steakhouse,Indian Restaurant,Golf Course,Gym,Restaurant,Café


In [203]:
cluster_venues_2_df=cluster_2_df.drop(['City', 'Longitude', 'Cluster Labels'], axis=1)
cluster_venues_2_df
result_2 = cluster_venues_2_df.apply(pd.value_counts).fillna(0)
result_2 = result_2.sum(axis=1) 
count_cities_cluster_2=len(cluster_2_df)
result_2 = result_2.divide(count_cities_cluster_2)
result_2.sort_values(axis=0, ascending=False, inplace=True, na_position='last')
result_2

Hotel                            0.736842
Café                             0.657895
Coffee Shop                      0.526316
Park                             0.394737
Ice Cream Shop                   0.368421
Shopping Mall                    0.368421
Italian Restaurant               0.342105
Pizza Place                      0.315789
Restaurant                       0.289474
Bakery                           0.236842
Historic Site                    0.210526
Seafood Restaurant               0.210526
Plaza                            0.184211
Bar                              0.184211
Indian Restaurant                0.157895
Fast Food Restaurant             0.157895
Chinese Restaurant               0.157895
Lounge                           0.131579
History Museum                   0.131579
Steakhouse                       0.131579
Dessert Shop                     0.131579
Resort                           0.105263
Cocktail Bar                     0.105263
Japanese Restaurant              0

In [214]:
cluster_3_df=cities_merged_df.loc[cities_merged_df['Cluster Labels'] == 2, cities_merged_df.columns[[1] + list(range(5, cities_merged_df.shape[1]))]]
cluster_3_df


Unnamed: 0,City,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,,7.540121,2,,,,,,,,,,
1,Tokyo,139.759455,2,Hotel,Tonkatsu Restaurant,BBQ Joint,Coffee Shop,Japanese Curry Restaurant,Art Museum,Wagashi Place,Japanese Restaurant,Garden,Concert Hall
3,Shanghai,121.488892,2,Hotel,Dumpling Restaurant,Lounge,Hotel Bar,Coffee Shop,Scenic Lookout,Chinese Restaurant,Italian Restaurant,Café,Spa
17,Lagos,103.378253,2,Asian Restaurant,Cave,Restaurant,Gift Shop,Hotel,Factory,Falafel Restaurant,Farm,Farmers Market,Flea Market
19,Rio de Janeiro,-43.209373,2,Bookstore,Scenic Lookout,Beach,Park,Bar,Historic Site,Hotel,Church,Ice Cream Shop,Mountain
25,Paris,2.351499,2,Plaza,Hotel,Bookstore,Cocktail Bar,Wine Bar,French Restaurant,Art Museum,Italian Restaurant,Historic Site,Garden
30,Lima,-77.036526,2,Park,Café,Hotel,Restaurant,Peruvian Restaurant,Seafood Restaurant,Japanese Restaurant,Sushi Restaurant,Bakery,Italian Restaurant
33,Bogotá,-74.076103,2,Restaurant,French Restaurant,Asian Restaurant,Italian Restaurant,Park,Café,Pub,Bookstore,Latin American Restaurant,Coffee Shop
36,Bangkok,100.81608,2,Coffee Shop,Hotel,Park,Shopping Mall,Thai Restaurant,Noodle House,Japanese Restaurant,Golf Course,Bookstore,Asian Restaurant
41,Wuhan,114.298441,2,Coffee Shop,Hotel,Shopping Mall,Fast Food Restaurant,Plaza,Airport,Historic Site,Asian Restaurant,Art Museum,Concert Hall


In [205]:
cluster_venues_3_df=cluster_3_df.drop(['City', 'Longitude', 'Cluster Labels'], axis=1)
cluster_venues_3_df
result_3 = cluster_venues_3_df.apply(pd.value_counts).fillna(0)
result_3 = result_3.sum(axis=1) 
count_cities_cluster_3=len(cluster_3_df)
result_3 = result_3.divide(count_cities_cluster_3)
result_3.sort_values(axis=0, ascending=False, inplace=True, na_position='last')
result_3

Hotel                              0.804878
Coffee Shop                        0.634146
Park                               0.463415
Café                               0.439024
Italian Restaurant                 0.390244
Bakery                             0.341463
Shopping Mall                      0.341463
Pizza Place                        0.317073
Restaurant                         0.317073
Ice Cream Shop                     0.243902
Fast Food Restaurant               0.219512
Historic Site                      0.195122
Lounge                             0.195122
Bookstore                          0.170732
Art Museum                         0.170732
Japanese Restaurant                0.170732
Plaza                              0.146341
French Restaurant                  0.146341
BBQ Joint                          0.146341
Chinese Restaurant                 0.121951
Bar                                0.121951
Asian Restaurant                   0.097561
Indian Restaurant               

In [206]:
cluster_4_df=cities_merged_df.loc[cities_merged_df['Cluster Labels'] == 3, cities_merged_df.columns[[1] + list(range(5, cities_merged_df.shape[1]))]]
cluster_4_df

Unnamed: 0,City,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
10,New York,-73.987156,3,Park,Bakery,Italian Restaurant,Bookstore,Hotel,Cocktail Bar,Art Gallery,Seafood Restaurant,Gourmet Shop,Cycle Studio
12,Karachi,67.182193,3,Café,Coffee Shop,BBQ Joint,Pakistani Restaurant,Shopping Mall,Bakery,Fast Food Restaurant,Burger Joint,Asian Restaurant,Donut Shop
26,Shenzhen,114.054535,3,Hotel,Theme Park Ride / Attraction,Japanese Restaurant,Chinese Restaurant,Coffee Shop,Mountain,Park,Café,Dim Sum Restaurant,Dumpling Restaurant
27,Jakarta,106.827183,3,Hotel,Coffee Shop,Indonesian Restaurant,Shopping Mall,Sushi Restaurant,Restaurant,Japanese Restaurant,Bakery,Nightclub,Chinese Restaurant
31,Chennai,80.283833,3,Indian Restaurant,Hotel,Café,Beach,Ice Cream Shop,Sandwich Place,Multiplex,Italian Restaurant,BBQ Joint,Juice Bar
34,Nagoya,136.90448,3,Unagi Restaurant,Udon Restaurant,Ramen Restaurant,Café,Castle,Beer Bar,Thai Restaurant,Grocery Store,Spa,Rock Club
40,Tehran,51.401378,3,Bookstore,Park,Pastry Shop,History Museum,Flower Shop,Café,Art Gallery,Ice Cream Shop,Persian Restaurant,Theater
44,Nanjing,118.791646,3,Hotel,Park,Coffee Shop,Shopping Mall,Metro Station,Chinese Restaurant,Fast Food Restaurant,Train Station,Historic Site,Sandwich Place
47,Ho Chi Minh City,106.701756,3,Vietnamese Restaurant,Hotel,Café,French Restaurant,Pizza Place,Hotel Bar,Coffee Shop,Massage Studio,Bar,Sandwich Place
50,Baghdad,44.378799,3,Café,Shopping Mall,Hotel,Fast Food Restaurant,Middle Eastern Restaurant,Ice Cream Shop,Bakery,Burger Joint,Fried Chicken Joint,Coffee Shop


In [207]:
cluster_venues_4_df=cluster_4_df.drop(['City', 'Longitude', 'Cluster Labels'], axis=1)
cluster_venues_4_df
result_4 = cluster_venues_4_df.apply(pd.value_counts).fillna(0)
result_4 = result_4.sum(axis=1) 
count_cities_cluster_4=len(cluster_4_df)
result_4 = result_4.divide(count_cities_cluster_4)
result_4.sort_values(axis=0, ascending=False, inplace=True, na_position='last')
result_4

Hotel                            0.821429
Coffee Shop                      0.714286
Shopping Mall                    0.464286
Park                             0.428571
Café                             0.428571
Fast Food Restaurant             0.392857
Bakery                           0.392857
Ice Cream Shop                   0.285714
Pizza Place                      0.214286
Bar                              0.214286
Seafood Restaurant               0.178571
Sandwich Place                   0.178571
Italian Restaurant               0.178571
Restaurant                       0.178571
Train Station                    0.178571
Bookstore                        0.142857
Historic Site                    0.142857
Airport                          0.107143
Chinese Restaurant               0.107143
Juice Bar                        0.107143
Burger Joint                     0.107143
History Museum                   0.107143
Japanese Restaurant              0.107143
French Restaurant                0

In [208]:
cluster_5_df=cities_merged_df.loc[cities_merged_df['Cluster Labels'] == 4, cities_merged_df.columns[[1] + list(range(5, cities_merged_df.shape[1]))]]
cluster_5_df

Unnamed: 0,City,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,Sao Paulo,-46.633382,4,Brazilian Restaurant,Pizza Place,Ice Cream Shop,Theater,Bookstore,Park,Cultural Center,Bar,Art Museum,Restaurant
8,Osaka,135.501454,4,Theme Park Ride / Attraction,Ramen Restaurant,Hotel,Coffee Shop,Udon Restaurant,Park,Japanese Curry Restaurant,Scenic Lookout,Japanese Restaurant,Hot Spring
15,Istanbul,28.965165,4,Hotel,Historic Site,Art Gallery,History Museum,Soccer Stadium,Mosque,Park,Dance Studio,Bookstore,Theater
20,Guangzhou,113.259294,4,Hotel,Coffee Shop,Turkish Restaurant,Park,Shopping Mall,Chinese Restaurant,Café,Spa,Seafood Restaurant,Cocktail Bar
21,Los Angeles,-118.242767,4,Italian Restaurant,Trail,Sandwich Place,Ice Cream Shop,Park,Theater,Museum,Farmers Market,Climbing Gym,Sushi Restaurant
22,Moscow,37.617661,4,Park,Yoga Studio,Hotel,Theater,Art Gallery,Plaza,Road,Pizza Place,Art Museum,Bookstore
23,Kinshasa,15.312597,4,Hotel,Café,Restaurant,Fast Food Restaurant,Lounge,Resort,Plaza,Cocktail Bar,Pizza Place,Shopping Mall
24,Tianjin,117.198078,4,Hotel,Coffee Shop,Shopping Mall,Fast Food Restaurant,American Restaurant,Electronics Store,Bar,Pizza Place,Hotpot Restaurant,Department Store
32,Seoul,126.978291,4,BBQ Joint,Korean Restaurant,Park,Coffee Shop,Bakery,Historic Site,Hotel,Palace,Fried Chicken Joint,Japanese Restaurant
39,Lahore,74.314145,4,Café,Pakistani Restaurant,Shopping Mall,Coffee Shop,Fast Food Restaurant,Park,Burger Joint,Department Store,Italian Restaurant,Movie Theater


In [209]:
cluster_venues_5_df=cluster_5_df.drop(['City', 'Longitude', 'Cluster Labels'], axis=1)
cluster_venues_5_df
result_5 = cluster_venues_5_df.apply(pd.value_counts).fillna(0)
result_5 = result_5.sum(axis=1) 
count_cities_cluster_5=len(cluster_5_df)
result_5 = result_5.divide(count_cities_cluster_5)
result_5.sort_values(axis=0, ascending=False, inplace=True, na_position='last')
result_5

Hotel                            0.787879
Coffee Shop                      0.696970
Park                             0.606061
Shopping Mall                    0.484848
Café                             0.393939
Bar                              0.363636
Fast Food Restaurant             0.303030
Restaurant                       0.272727
Pizza Place                      0.272727
Italian Restaurant               0.242424
Historic Site                    0.181818
Theater                          0.181818
Seafood Restaurant               0.181818
Cocktail Bar                     0.151515
Grocery Store                    0.151515
Chinese Restaurant               0.151515
Ice Cream Shop                   0.151515
Bakery                           0.151515
Turkish Restaurant               0.151515
Plaza                            0.121212
Department Store                 0.121212
Japanese Restaurant              0.121212
American Restaurant              0.121212
Art Gallery                      0

In [217]:
cluster_venues_5_df


Unnamed: 0,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,Brazilian Restaurant,Pizza Place,Ice Cream Shop,Theater,Bookstore,Park,Cultural Center,Bar,Art Museum,Restaurant
8,Theme Park Ride / Attraction,Ramen Restaurant,Hotel,Coffee Shop,Udon Restaurant,Park,Japanese Curry Restaurant,Scenic Lookout,Japanese Restaurant,Hot Spring
15,Hotel,Historic Site,Art Gallery,History Museum,Soccer Stadium,Mosque,Park,Dance Studio,Bookstore,Theater
20,Hotel,Coffee Shop,Turkish Restaurant,Park,Shopping Mall,Chinese Restaurant,Café,Spa,Seafood Restaurant,Cocktail Bar
21,Italian Restaurant,Trail,Sandwich Place,Ice Cream Shop,Park,Theater,Museum,Farmers Market,Climbing Gym,Sushi Restaurant
22,Park,Yoga Studio,Hotel,Theater,Art Gallery,Plaza,Road,Pizza Place,Art Museum,Bookstore
23,Hotel,Café,Restaurant,Fast Food Restaurant,Lounge,Resort,Plaza,Cocktail Bar,Pizza Place,Shopping Mall
24,Hotel,Coffee Shop,Shopping Mall,Fast Food Restaurant,American Restaurant,Electronics Store,Bar,Pizza Place,Hotpot Restaurant,Department Store
32,BBQ Joint,Korean Restaurant,Park,Coffee Shop,Bakery,Historic Site,Hotel,Palace,Fried Chicken Joint,Japanese Restaurant
39,Café,Pakistani Restaurant,Shopping Mall,Coffee Shop,Fast Food Restaurant,Park,Burger Joint,Department Store,Italian Restaurant,Movie Theater


In [None]:
grouped_pivot=cluster_venues_5_df.pivot(index='drive-wheels',columns='body-style')
grouped_pivot

grouped_pivot=grouped_pivot.fillna(0) #fill missing values with 0
grouped_pivot

In [232]:
g = cluster_venues_1_df.groupby(['1st Most Common Venue', '2nd Most Common Venue', '3rd Most Common Venue', '4th Most Common Venue', '5th Most Common Venue', '6th Most Common Venue', '7th Most Common Venue', '8th Most Common Venue', '9th Most Common Venue', '10th Most Common Venue'])
g.count()

1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
Coffee Shop,Hotel,Shopping Mall,Park,Department Store,Trail,Train Station,Mountain,Plaza,Fast Food Restaurant
Hotel,Café,Historic Site,Italian Restaurant,Bakery,Sushi Restaurant,Lounge,Supermarket,Performing Arts Venue,Lebanese Restaurant
Hotel,Historic Site,History Museum,Coffee Shop,Chinese Restaurant,Fast Food Restaurant,Shopping Mall,Hostel,Plaza,Street Food Gathering
Hotel,Resort,African Restaurant,Restaurant,Bar,Café,Seafood Restaurant,Beach,Bakery,Ice Cream Shop
Hotel,Shopping Mall,Coffee Shop,Beach,Department Store,Bar,Aquarium,Café,Museum,Harbor / Marina
Hotel,Theater,Steakhouse,Park,Art Museum,Bookstore,Cocktail Bar,Boutique,Art Gallery,Plaza
Indian Restaurant,Bakery,Ice Cream Shop,Hotel,Coffee Shop,Diner,Café,BBQ Joint,Shopping Mall,Restaurant
Indian Restaurant,Fast Food Restaurant,Hotel,Shopping Mall,Café,Pizza Place,Bakery,Ice Cream Shop,Multiplex,Market
Indian Restaurant,Hotel,Café,Restaurant,Italian Restaurant,Monument / Landmark,Shopping Mall,Mediterranean Restaurant,Market,Bar
Park,Bakery,Coffee Shop,Steakhouse,Diner,Italian Restaurant,Pizza Place,Brewery,History Museum,Farmers Market
