In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl

In [2]:
## Create a dataframe of cities
City_data = {'City': ['New York', 'London','Edinburgh', 'Toronto', 'Sydney', 
                      'Singapore', 'Melbourne', 'Hong Kong', 'Los Angeles',
                      'Chicago', 'Boston', 'San Francisco', 'Dublin', 'Washington','Beijing',
                      'Shanghai','Guangzhou', 'Shenzhen', 'Mumbai', 'Tokyo', 'Seoul–Incheon','Moscow','Paris',
                      'Taipei', 'Berlin', 'Jakarta', 'Mexico City','Delhi','Kolkata']}
City_df = pd.DataFrame(City_data)

## Add the 'Latitude', 'Longitude' and 'Country' columns.
## Latitude and longitude start as zeros and are filled in later by geocoding.
## The placeholder length follows the number of cities instead of a hard-coded 29,
## so adding or removing a city above does not break the inserts.
City_df.insert(1, 'Latitude', np.zeros(len(City_df)))
City_df.insert(2, 'Longitude', np.zeros(len(City_df)))
## The country list is positional: entry i belongs to city i above.
City_df.insert(3, 'Country', ['US', 'UK', 'UK', 'Canada', 'Australia', 'Singapore', 'Australia', 'China',
                              'US', 'US', 'US', 'US', 'Ireland', 'US', 'China', 'China', 'China', 'China',
                              'India', 'Japan', 'South Korea', 'Russia', 'France', 'China', 'Germany', 'Indonesia', 'Mexico','India','India'])

In [3]:
## Display the first row of the city table
City_df.iloc[0]

City         New York
Latitude            0
Longitude           0
Country            US
Name: 0, dtype: object

In [4]:
## Display up to the first 29 rows of the city table
City_df.head(29)

Unnamed: 0,City,Latitude,Longitude,Country
0,New York,0.0,0.0,US
1,London,0.0,0.0,UK
2,Edinburgh,0.0,0.0,UK
3,Toronto,0.0,0.0,Canada
4,Sydney,0.0,0.0,Australia
5,Singapore,0.0,0.0,Singapore
6,Melbourne,0.0,0.0,Australia
7,Hong Kong,0.0,0.0,China
8,Los Angeles,0.0,0.0,US
9,Chicago,0.0,0.0,US


In [5]:
## Import necessary libraries
import geopy
from geopy.geocoders import Nominatim

## One geocoder client is enough for all lookups, so it is created once outside the loop
geolocator = Nominatim(user_agent = "explorer2")

## use geolocation package to retrieve location features (lat & lng) into the dataframe 
for index, row in City_df.iterrows():
    city = row['City']
    location_city = geolocator.geocode(str(city))
    ## geocode() returns None when nothing is found; fail with the city name
    ## instead of an AttributeError on None
    if location_city is None:
        raise ValueError('Could not geocode city: {}'.format(city))
    City_df.loc[index, 'Latitude'] = location_city.latitude
    City_df.loc[index, 'Longitude'] = location_city.longitude
    
City_df.head(29)

Unnamed: 0,City,Latitude,Longitude,Country
0,New York,40.712728,-74.006015,US
1,London,51.507322,-0.127647,UK
2,Edinburgh,55.953346,-3.188375,UK
3,Toronto,43.653482,-79.383935,Canada
4,Sydney,-33.854816,151.216454,Australia
5,Singapore,1.340863,103.830392,Singapore
6,Melbourne,-37.814218,144.963161,Australia
7,Hong Kong,22.279328,114.162813,China
8,Los Angeles,34.053691,-118.242767,US
9,Chicago,41.875562,-87.624421,US


In [None]:
## Install relevant packages for visualization
## (shell escape: installs folium pinned to version 0.5.0 from the conda-forge channel; --yes skips the confirmation prompt)
!conda install -c conda-forge folium=0.5.0 --yes

Solving environment: / 

In [None]:
## import necessary lib
import folium

## start from an empty world map
world_map = folium.Map()

## put one small circle marker per city on the map; the popup shows the city name
city_points = City_df[['Latitude', 'Longitude', 'City']]
for lati, lngi, city in city_points.itertuples(index=False, name=None):
    label = folium.Popup(str(city), parse_html=True)
    marker = folium.CircleMarker(
        [lati, lngi],
        radius = 3,
        popup = label,
        color = 'blue',
        fill = True,
        fill_color = '#3186cc',
        fill_opacity = 0.6,
        parse_html = False
    )
    marker.add_to(world_map)
    
world_map

In [None]:
## import necessary packages
import os
import warnings

import requests

## Client Information for Foursquare
## The credentials are read from the environment so that secrets are never stored in the notebook.
## Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before starting the kernel.
CLIENT_ID = os.environ.get("FOURSQUARE_CLIENT_ID", "")
CLIENT_SECRET = os.environ.get("FOURSQUARE_CLIENT_SECRET", "")
if not (CLIENT_ID and CLIENT_SECRET):
    warnings.warn("FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; Foursquare requests will fail.")

## API version date and per-request result limit sent with every Foursquare call
VERSION = '20190829'
LIMIT = 500

In [None]:
## Create a function to repeat process for all neighborhoods
def getNearbyVenues(names, latitudes, longitudes, radius=10000, timeout=30):
    """Query the Foursquare explore endpoint around each location and tabulate the venues.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Label and coordinates of each place to explore; they are consumed in parallel.
    radius : number, optional
        Value sent as the ``radius`` query parameter (default 10000).
    timeout : number, optional
        Seconds to wait for each HTTP response before giving up (default 30).

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'City', 'City Latitude', 'City Longitude',
        'Venue', 'Venue Latitude', 'Venue Longitude', 'Venue Category'.
        The frame is empty (but still has these columns) when no venue is found.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.

    Uses the module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT settings.
    """
    columns = ['City', 
               'City Latitude', 
               'City Longitude', 
               'Venue', 
               'Venue Latitude', 
               'Venue Longitude', 
               'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):

        # let requests build and escape the query string
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=timeout)
        # surface HTTP errors directly instead of a KeyError on the payload
        response.raise_for_status()

        groups = response.json().get('response', {}).get('groups', [])
        results = groups[0]['items'] if groups else []

        # keep only relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            categories = venue.get('categories') or []
            if not categories:
                # a venue without any category cannot be one-hot encoded later; skip it
                continue
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name']))

    # passing the column names to the constructor keeps the schema even with zero rows
    nearby_venues = pd.DataFrame(venue_rows, columns=columns)
    return nearby_venues

In [None]:
 ## Pass the location information of the cities to the function and get back one dataframe of venues for all cities
world_venues = getNearbyVenues(names = City_df['City'], latitudes = City_df['Latitude'], longitudes = City_df['Longitude'])
world_venues.head()

In [None]:
## Check out the size of the dataset: (number of rows, number of columns)
world_venues.shape

In [None]:
## One-hot encode the venue categories: one indicator column per category, named after the category itself
venue_categories = world_venues['Venue Category']
world_onehot = pd.get_dummies(venue_categories, prefix='', prefix_sep='')
world_onehot.head()

In [None]:
## Put the city of each venue back next to its one-hot encoded category
world_onehot['City'] = world_venues['City']

# rotate the column order so that the last column comes first
column_order = list(world_onehot.columns)
fixed_columns = column_order[-1:] + column_order[:-1]
world_onehot_city = world_onehot[fixed_columns]

world_onehot_city.head()

In [None]:
## Average the indicator columns per city, i.e. the share of each venue category among that city's venues
venues_by_city = world_onehot_city.groupby('City')
world_grouped = venues_by_city.mean().reset_index()
world_grouped.head()

In [None]:
## Define a function that ranks the venue categories of one row

def return_most_common_venues(row, num_top_venues):
    """Return the labels of the `num_top_venues` largest entries of `row`.

    The first element of `row` is skipped and the remaining entries are
    ordered from largest to smallest value.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    top_labels = ranked.index.values
    return top_labels[:num_top_venues]

In [None]:
## create columns according to number of top venues
num_top_venues = 10
## ordinal suffixes for 1st, 2nd and 3rd; every later rank uses 'th'
indicators = ['st', 'nd', 'rd']

columns = ['City']
for ind in range(num_top_venues):
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind + 1, suffix))

## Create a dataframe
## set up the column names for the dataframe
City_venue_sorted = pd.DataFrame(columns = columns)

## set the column of "City" 
City_venue_sorted['City'] = world_grouped['City']

## Set the other column values -- the top venue categories of each city, most common first
for ind in range(world_grouped.shape[0]):
    City_venue_sorted.iloc[ind, 1:] = return_most_common_venues(world_grouped.iloc[ind, :], num_top_venues)

City_venue_sorted

In [None]:
## Import necessary lib
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

## scrape datasets from website -- wikipedia page table
res = requests.get("https://en.wikipedia.org/wiki/List_of_cities_by_GDP", timeout = 30)
## stop here on an HTTP error instead of parsing an error page
res.raise_for_status()
soup = BeautifulSoup(res.content, 'lxml')
## the GDP table is taken to be the first <table> on the page
table = soup.find_all('table')[0]
## read_html is given a file-like object; passing the HTML text itself is deprecated in recent pandas
City_GDP = pd.read_html(StringIO(str(table)))
## keep only the city name and the Brookings GDP estimate
## NOTE(review): these header strings must match the live page exactly (including the footnote marker) — verify if this raises KeyError
City_GDP_dp = City_GDP[0][['City proper /Metropolitan area', 'Brookings Institution[5]2014 est.PPP-adjustedGDP ($BN)']]
City_GDP_dp.head()

In [None]:
## Use short column names from here on
City_GDP_dp.columns = ['City', 'GDP']

## the city names present in world_grouped, as a plain list
city_list = world_grouped['City'].tolist()

## Keep a [city, GDP] pair for every scraped row whose city is in city_list
gdp_filtered = [
    [gdp_city, gdp_value]
    for gdp_city, gdp_value in zip(City_GDP_dp['City'], City_GDP_dp['GDP'])
    if gdp_city in city_list
]

## show the matched cities >> two cities turn out not to match by name: Seoul–Incheon & Washington
gdp_filtered

In [None]:
## Manually overwrite entries 4-25 of gdp_filtered with [city, GDP] pairs (GDP kept as a string).
## NOTE(review): the index assignments assume the filtering step produced at least 26 entries,
## otherwise they raise IndexError; entries 0-3 are left exactly as scraped — TODO confirm they are correct.
gdp_filtered[4]=['Delhi', '293.6']
gdp_filtered[5]=['Dublin', '90.1']
gdp_filtered[6]=['Edinburgh', '32.5']
gdp_filtered[7]=['Guangzhou', '380.3']
gdp_filtered[8]=['Hong Kong', '416.0']
gdp_filtered[9]=['Jakarta', '321.3']
gdp_filtered[10]=['Kolkata', '150.36']
gdp_filtered[11]=['London', '835.7']
gdp_filtered[12]=['Los Angeles', '860.5']
gdp_filtered[13]=['Melbourne', '178.4']
gdp_filtered[14]=['Mexico City', '403.6']
gdp_filtered[15]=['Moscow', '553.3']
gdp_filtered[16]=['Mumbai', '150.9']
gdp_filtered[17]=['New York', '1403.0']
gdp_filtered[18]=['Paris', '715.1']
gdp_filtered[19]=['San Francisco', '331.0']
gdp_filtered[20]=['Shanghai', '594.0']
gdp_filtered[21]=['Shenzhen', '363.2']
gdp_filtered[22]=['Singapore', '365.9']
gdp_filtered[23]=['Sydney', '223.4']
gdp_filtered[24]=['Taipei', '327.3']
gdp_filtered[25]=['Tokyo', '1617.0']
## Append three more cities; note the names 'Seoul' and 'Washington, DC' used here
gdp_filtered.append(['Toronto','276.3'])
gdp_filtered.append(['Seoul','845.9'])
gdp_filtered.append(['Washington, DC','442.2'])

In [None]:
## Display the current contents of gdp_filtered
gdp_filtered

In [None]:

## Turn the list of [city, GDP] pairs into a two-column dataframe
gdp_pairs_df = pd.DataFrame(gdp_filtered)
gdp_pairs_df.columns = ['City', 'GDP']

## remove rows that are exact repeats, then parse the GDP column as numbers
gdp_filtered_df = (
    gdp_pairs_df
    .drop_duplicates()
    .assign(GDP = lambda frame: pd.to_numeric(frame['GDP']))
)

In [None]:
## (number of rows, number of columns) of the GDP table
gdp_filtered_df.shape

In [None]:
## Display up to the first 29 rows of the GDP table
gdp_filtered_df.head(29)

In [None]:
## Order the cities from highest to lowest GDP and show the top 10
gdp_filtered_sorted = gdp_filtered_df.sort_values(by = 'GDP', ascending = False)
gdp_filtered_sorted.head(n = 10)

In [None]:
## The last 5 cities in the GDP ranking (gdp_filtered_sorted is ordered from highest to lowest GDP)
gdp_filtered_sorted.tail(5)

In [None]:
## Horizontal bar chart of the GDP ranking, one bar per city
import matplotlib as mpl
gdp_visualize = gdp_filtered_sorted.set_index('City')
gdp_visualize.plot(kind = 'barh', figsize = (10, 10))

In [None]:
## Renumber the rows of the sorted GDP table from 0 and discard the old index
gdp_filtered_sorted1 = gdp_filtered_sorted.reset_index(drop = True)

In [None]:
## import necessary libs
from sklearn import preprocessing

## Rescale the GDP values: preprocessing.normalize scales the whole GDP vector to unit L2 norm
## (this is vector normalization, not z-score standardization, so no StandardScaler is needed)
gdp_array = np.array(gdp_filtered_sorted['GDP'])
gdp_normalized_array = preprocessing.normalize([gdp_array])

## add the normalized GDP back into the dataframe
## gdp_filtered_sorted1 has the same row order as gdp_filtered_sorted, so the values line up by position
gdp_column = gdp_normalized_array[0]
## drop a column left over from a previous run so the cell can be re-executed
if 'Normalized GDP' in gdp_filtered_sorted1.columns:
    gdp_filtered_sorted1 = gdp_filtered_sorted1.drop(columns = ['Normalized GDP'])
gdp_filtered_sorted1.insert(1, 'Normalized GDP', gdp_column)
gdp_filtered_sorted1.head()

In [None]:
## scrape datasets from website -- wikipedia page table
from io import StringIO

import requests
from bs4 import BeautifulSoup
import pandas as pd

## scrape data from the Wikipedia average temperature page
res1 = requests.get("https://en.wikipedia.org/wiki/List_of_cities_by_average_temperature", timeout = 30)
## stop here on an HTTP error instead of parsing an error page
res1.raise_for_status()
soup1 = BeautifulSoup(res1.content, 'lxml')

## look the tables up once; the regional tables are taken by position on the page
## NOTE(review): positions 1-4 are assumed to be Asia, Europe, America, Australia — verify against the live page
all_tables = soup1.find_all('table')

## scrape Asia table
table_Asia = all_tables[1]
Asia_temp = pd.read_html(StringIO(str(table_Asia)))[0]

## scrape Europe table
table_Europe = all_tables[2]
Europe_temp = pd.read_html(StringIO(str(table_Europe)))[0]

## scrape America table
table_America = all_tables[3]
America_temp = pd.read_html(StringIO(str(table_America)))[0]

## scrape Australia table
table_Australia = all_tables[4]
Australia_temp = pd.read_html(StringIO(str(table_Australia)))[0]

Australia_temp.tail()

In [None]:
## collect [city, value of the 'Year' column] pairs for the cities of interest
temp_list = []

## go through the regional tables in the order Asia, Europe, America, Australia
## and keep every row whose city is in city_list
for region_temp in (Asia_temp, Europe_temp, America_temp, Australia_temp):
    for index, row in region_temp.iterrows():
        if row['City'] in city_list:
            temp_list.append([row['City'], row['Year']])

## check if data for all cities are successfully extracted
len(temp_list)

In [None]:
## Display the collected [city, temperature] pairs to see which cities are missing
temp_list

In [None]:
## add the cities that were not matched by name
## Seoul: listed as 'Seoul' in the Asia table, stored under 'Seoul–Incheon'
seoul_years = Asia_temp.loc[Asia_temp['City'] == 'Seoul', 'Year']
temp_list.extend(['Seoul–Incheon', year_value] for year_value in seoul_years)

## New York: listed as 'New York City' in the America table, stored under 'New York'
new_york_years = America_temp.loc[America_temp['City'] == 'New York City', 'Year']
temp_list.extend(['New York', year_value] for year_value in new_york_years)

## The remaining cities are entered by hand from online sources
temp_list.extend([
    ['San Francisco', '14.6()'],
    ['Washinton DC', '14.6()'],
    ['Shenzhen', '22.9()'],
    ['Guangzhou', '22.2()'],
    ['Delhi', '29.2()'],
])

len(temp_list)

In [None]:
## convert temp_list into a dataframe
temp_df = pd.DataFrame(temp_list)
temp_df.columns = ['City', 'Temperature']

## keep only the part before '(' (drops the bracketed value)
## This is done on the whole column: writing to the rows yielded by iterrows()
## is not guaranteed to change the dataframe.
temperature_text = temp_df['Temperature'].astype(str).str.split('(').str[0].str.strip()
## a typographic minus sign would not parse as a number; map it to the ASCII one
temperature_text = temperature_text.str.replace('\u2212', '-', regex = False)

## convert temperature values into numbers
temp_df['Temperature'] = pd.to_numeric(temperature_text)

temp_df.head()

In [None]:
## Order the cities from warmest to coldest and show the top 10
temp_sorted = temp_df.sort_values(by = 'Temperature', ascending = False)
temp_sorted.head(n = 10)

In [None]:
## The last 10 cities in the temperature ranking
temp_sorted.tail(10)

In [None]:
## Horizontal bar chart of the temperature ranking, one bar per city
temp_visualize = temp_sorted.set_index('City')
temp_visualize.plot(kind = 'barh', figsize = (10, 10))

In [None]:
## Renumber the rows of the sorted temperature table from 0 and discard the old index
temp_sorted = temp_sorted.reset_index(drop = True)

In [None]:
## Rescale the temperature values: preprocessing.normalize scales the whole vector to unit L2 norm
## (this is vector normalization, not z-score standardization, so no StandardScaler is needed)
temp_array = np.array(temp_sorted['Temperature'])
temp_normalized_array = preprocessing.normalize([temp_array])

## add the normalized temperature back into the dataframe (values line up with the rows by position)
temp_column = temp_normalized_array[0]
## drop a column left over from a previous run so the cell can be re-executed
if 'Normalized Temperature' in temp_sorted.columns:
    temp_sorted = temp_sorted.drop(columns = ['Normalized Temperature'])
temp_sorted.insert(1, 'Normalized Temperature', temp_column)
temp_sorted.head()

In [None]:
## make sure the city names are the same in every dataset
## Each (old, new) pair is applied to all three tables, so a spelling variant is fixed wherever it appears.
city_name_fixes = [
    ('Seoul-Incheon', 'Seoul'),      # written with an ASCII hyphen
    ('Seoul–Incheon', 'Seoul'),      # written with an en dash, as in the city list
    ('Washington, DC', 'Washington'),
    ('Washinton DC', 'Washington'),  # spelling used for the manually added temperature row
]
for old_name, new_name in city_name_fixes:
    gdp_filtered_sorted1 = gdp_filtered_sorted1.replace(old_name, new_name)
    temp_sorted = temp_sorted.replace(old_name, new_name)
    world_grouped = world_grouped.replace(old_name, new_name)

In [None]:
## Attach the GDP columns, then the temperature columns, to the per-city venue shares.
## Both joins match on the city name.
world_merged_cluster = (
    world_grouped
    .join(gdp_filtered_sorted1.set_index('City'), on = 'City')
    .join(temp_sorted.set_index('City'), on = 'City')
)
world_merged_cluster.head()

In [None]:
## Remove the raw GDP and Temperature columns; their normalized versions stay
world_merged_cluster = world_merged_cluster.drop(columns = ['GDP', 'Temperature'])
world_merged_cluster.head()

In [None]:
## Multiply the normalized GDP and temperature columns by a weight of 1.5
## NOTE: this updates the columns in place, so re-running the cell applies the weight again
feature_weight = 1.5
for weighted_column in ['Normalized GDP', 'Normalized Temperature']:
    world_merged_cluster[weighted_column] = world_merged_cluster[weighted_column] * feature_weight
world_merged_cluster

In [None]:
## Keep a single row per city: when a city appears more than once, the last occurrence wins
world_merged_cluster = world_merged_cluster.drop_duplicates(subset = 'City', keep = 'last')

In [None]:
## Display the clustering table
world_merged_cluster

In [None]:
## Drop out the city column of the grouped data for model training
world_grouped_clustering = world_merged_cluster.drop(['City'], axis = 1)

## import machine learning packages
import sklearn
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline 
from sklearn.cluster import KMeans
distortions = []
K = range(1,10)
for k in K:
    kmeanModel = KMeans(n_clusters=k)
    kmeanModel.fit(world_grouped_clustering)
    distortions.append(kmeanModel.inertia_)
plt.figure(figsize=(16,8))
plt.plot(K, distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k')
plt.show()

In [None]:
## import machine learning packages
import sklearn
from sklearn.cluster import KMeans

## Feature matrix for training: every column except the city name
world_grouped_clustering = world_merged_cluster.drop(columns = ['City'])

## Fit a k-means model with 7 clusters and a fixed random state
model_kmeans = KMeans(n_clusters = 7, random_state = 0).fit(world_grouped_clustering)

## The cluster label assigned to each row
kmeans_labels = model_kmeans.labels_
kmeans_labels

In [None]:
## Put the k-means labels in front of the top-venue table as the first column
City_venue_sorted.insert(loc = 0, column = 'Cluster Labels', value = kmeans_labels)
City_venue_sorted.head()

In [None]:
## Check out the shape of City_venue_sorted: (number of rows, number of columns)
City_venue_sorted.shape

In [None]:
## (number of rows, number of columns) of the city table, for comparison
City_df.shape

In [None]:
## Attach the cluster label and top venues of each city to its location data.
## Both tables use 'Seoul' for Seoul–Incheon before they are joined on the city name.
City_venue_sorted = City_venue_sorted.replace('Seoul–Incheon', 'Seoul')
venues_by_city_name = City_venue_sorted.set_index('City')
world_merged = City_df.replace('Seoul–Incheon', 'Seoul').join(venues_by_city_name, on = 'City')

world_merged

In [None]:
## import necessary lib and packages
import matplotlib.cm as cm
import matplotlib.colors as colors

## Create map
map_clusters = folium.Map()

## Only cities that actually received a cluster label can be coloured
labelled_cities = world_merged.dropna(subset = ['Cluster Labels'])

## set color scheme for the clusters: one colour per label 0..max, so no two clusters share a colour
kclusters = 0 if labelled_cities.empty else int(labelled_cities['Cluster Labels'].max()) + 1
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, city, country, cluster in zip(labelled_cities['Latitude'], labelled_cities['Longitude'],
                                            labelled_cities['City'], labelled_cities['Country'],
                                            labelled_cities['Cluster Labels']):
    cluster = int(cluster)
    label = folium.Popup(str(city) + ',' + str(country) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=3,
        popup=label,
        # index by the label itself; label - 1 would send cluster 0 to the last colour
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

In [None]:
## Cities with cluster label 0, joined with their top venues, GDP and temperature data
in_cluster0 = world_merged['Cluster Labels'] == 0
Cluster0 = world_merged.loc[in_cluster0, ['City', 'Country']].replace('Seoul–Incheon', 'Seoul')
## each table is matched on the city name
for cluster_details in (City_venue_sorted, gdp_filtered_sorted1, temp_sorted):
    Cluster0 = Cluster0.join(cluster_details.set_index('City'), on = 'City')
Cluster0

This cluster has 4 cities. Cafés, coffee shops, hotels, bakeries and pubs are the most popular venues there.

In [None]:
## Cities with cluster label 1, joined with their top venues, GDP and temperature data
City_venue_sorted = City_venue_sorted.replace('Seoul–Incheon', 'Seoul')
cluster1_rows = world_merged[world_merged['Cluster Labels'] == 1]
Cluster1 = pd.DataFrame(cluster1_rows[['City', 'Country']]).replace('Seoul–Incheon','Seoul')
Cluster1.columns = ['City', 'Country']
## each table is matched on the city name
Cluster1 = (
    Cluster1
    .join(City_venue_sorted.set_index('City'), on = 'City')
    .join(gdp_filtered_sorted1.set_index('City'), on = 'City')
    .join(temp_sorted.set_index('City'), on = 'City')
)
Cluster1

This cluster has 7 cities. Coffee shops, bakeries, pubs and cafés are the most popular venues there.

In [None]:
## Cities with cluster label 2, joined with their top venues, GDP and temperature data
in_cluster2 = world_merged['Cluster Labels'] == 2
Cluster2 = world_merged.loc[in_cluster2, ['City', 'Country']].replace('Seoul–Incheon', 'Seoul')
## each table is matched on the city name
for cluster_details in (City_venue_sorted, gdp_filtered_sorted1, temp_sorted):
    Cluster2 = Cluster2.join(cluster_details.set_index('City'), on = 'City')
Cluster2

These 6 cities are spread across the world. One feature they share is that theaters are quite popular. London, Chicago, and Moscow are three of the four cities overall that have theaters among their top 3 most common venues, and they all have well-developed arts and entertainment industries. In addition, hotels are popular in these cities, and all 6 cities have similar GDPs, nearly 2 times higher than those of the cluster 0 cities. Their climates, however, are quite different.

In [None]:
## Cities with cluster label 3, joined with their top venues, GDP and temperature data
in_cluster3 = world_merged['Cluster Labels'] == 3
Cluster3 = world_merged.loc[in_cluster3, ['City', 'Country']].replace('Seoul–Incheon', 'Seoul')
## each table is matched on the city name
for cluster_details in (City_venue_sorted, gdp_filtered_sorted1, temp_sorted):
    Cluster3 = Cluster3.join(cluster_details.set_index('City'), on = 'City')
Cluster3

This cluster has only 2 cities. These two cities have very different cultures, so setting up a business there would carry risks specific to each city.

In [None]:
## Cities with cluster label 4, joined with their top venues, GDP and temperature data
in_cluster4 = world_merged['Cluster Labels'] == 4
Cluster4 = world_merged.loc[in_cluster4, ['City', 'Country']].replace('Seoul–Incheon', 'Seoul')
## each table is matched on the city name
for cluster_details in (City_venue_sorted, gdp_filtered_sorted1, temp_sorted):
    Cluster4 = Cluster4.join(cluster_details.set_index('City'), on = 'City')
Cluster4

In this cluster, 5 of the 7 cities are in China. Their popular venues include hotels, coffee shops and Chinese restaurants, which suggests that people in these cities enjoy a rather slow-paced life. These cities also have similar GDPs, slightly higher than those of cluster 0.

In [None]:
## Cities with cluster label 5, joined with their top venues, GDP and temperature data
in_cluster5 = world_merged['Cluster Labels'] == 5
Cluster5 = world_merged.loc[in_cluster5, ['City', 'Country']].replace('Seoul–Incheon', 'Seoul')
## each table is matched on the city name
for cluster_details in (City_venue_sorted, gdp_filtered_sorted1, temp_sorted):
    Cluster5 = Cluster5.join(cluster_details.set_index('City'), on = 'City')
Cluster5

These 3 cities are all located in southern Asia and India, with similar climates and temperatures. Their GDPs are close too, and are lower than those of the cluster 1 cities. 6 of them have hotels as the most common venue, and coffee shops and cafés are very popular too. This suggests that tourism might be an essential source of income for these cities.

In [None]:
## Cities with cluster label 6, joined with their top venues, GDP and temperature data
in_cluster6 = world_merged['Cluster Labels'] == 6
Cluster6 = world_merged.loc[in_cluster6, ['City', 'Country']].replace('Seoul–Incheon', 'Seoul')
## each table is matched on the city name
for cluster_details in (City_venue_sorted, gdp_filtered_sorted1, temp_sorted):
    Cluster6 = Cluster6.join(cluster_details.set_index('City'), on = 'City')
Cluster6

This cluster has only two cities, one in the US and the other in Korea, with totally different lifestyles. Coffee shops, bakeries and fast food restaurants are popular there, and a new one could be profitable if its quality is much better than that of its competitors.