# Finding the most suitable european cities to immigrate to
### Clustering the cities based on several geographic and socio-economic criteria

## 0- Import libraries

In [679]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
import requests
%matplotlib inline

## 1- Defining the candidate Cities

The European candidate cities were calculated based on three criteria :
- The population of the city
- The living's standard of the country where the city is situated
- The willingness to receive immigrants

### 1-1: population of European cities

#### *Load the data*

In [680]:
#data source : https://worldpopulationreview.com/continents/cities-in-europe
df_pop=pd.read_csv(r'data/data_european_cities_population.csv')
df_pop.head()

Unnamed: 0,asciiname,country,population
0,Moscow,Russia,10381222
1,London,United Kingdom,7556900
2,Saint Petersburg,Russia,5028000
3,Berlin,Germany,3426354
4,Madrid,Spain,3255944


#### *change the name of city's column*

In [683]:
df_pop.rename(columns={'asciiname':'city'},inplace=True)
df_pop.head()         

Unnamed: 0,city,country,population
0,Moscow,Russia,10381222
1,London,United Kingdom,7556900
2,Saint Petersburg,Russia,5028000
3,Berlin,Germany,3426354
4,Madrid,Spain,3255944


#### *Show the number of cities by country*

In [684]:
df_pop_byCountry = df_pop.groupby(['country']).count().sort_values(by='city', ascending=False)[0:10]
df_pop_byCountry

Unnamed: 0_level_0,city,population
country,Unnamed: 1_level_1,Unnamed: 2_level_1
Russia,127,127
Germany,60,60
United Kingdom,56,56
Spain,52,52
Ukraine,34,34
Poland,25,25
Romania,20,20
Italy,19,19
France,18,18
Netherlands,10,10


###	1-2 : Standard of living

#### *Load the data*

In [685]:
url_gdp='https://en.wikipedia.org/wiki/List_of_sovereign_states_in_Europe_by_GDP_(PPP)_per_capita'

In [686]:
df_url1=pd.read_html(url_gdp)
len(df_url1)

2

In [687]:
df_gdp=df_url1[1]
df_gdp.head()

Unnamed: 0,Rank,Country,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,1,Luxembourg,90661,92969,92102,94823,99738,101255,103286,106373,110870,112622
1,2,Ireland,43291,45359,46058,47422,52133,65656,69248,75538,79925,81686
2,3,Norway,61601,62656,64699,65673,67377,68795,69807,71830,74065,76620
3,4,Switzerland,53263,54769,55728,57098,58808,59423,60365,61421,63380,66780
4,5,Netherlands,44839,46309,46491,47015,48363,49780,51248,53634,56435,59105


#### *organize the data table*

In [688]:
df_gdp.drop(['Rank','2010','2011','2012','2013','2014','2015','2016','2017','2018'],axis=1, inplace=True)
df_gdp.rename(columns={'Country':'country','2019':'gdp'},inplace=True)
df_gdp.head()

Unnamed: 0,country,gdp
0,Luxembourg,112622
1,Ireland,81686
2,Norway,76620
3,Switzerland,66780
4,Netherlands,59105


###	1-3 : willingness to receive immigrants

#### *Load the data*

In [689]:
url_img='https://en.wikipedia.org/wiki/Immigration_to_Europe#2013_data'

In [690]:
df_url=pd.read_html(url_img)
len(df_url)

8

In [691]:
df_img=df_url[2]
df_img.head()

Unnamed: 0,Country,Number of immigrants,Percentage oftotal number ofimmigrantsin the world,Immigrants aspercentage ofnational population
0,Russia,11048064,4.8,7.7
1,Germany,9845244,4.3,11.9
2,United Kingdom,7824131,3.4,12.4
3,France,7439086,3.2,11.6
4,Spain,5891208,2.8,9.6 (2016)


#### *organize the table data*

In [692]:
df_img['pc_immigrants']=df_img.iloc[:,3]
df_img.head()

Unnamed: 0,Country,Number of immigrants,Percentage oftotal number ofimmigrantsin the world,Immigrants aspercentage ofnational population,pc_immigrants
0,Russia,11048064,4.8,7.7,7.7
1,Germany,9845244,4.3,11.9,11.9
2,United Kingdom,7824131,3.4,12.4,12.4
3,France,7439086,3.2,11.6,11.6
4,Spain,5891208,2.8,9.6 (2016),9.6 (2016)


In [693]:
df_img['pc_immigrants'] = df_img['pc_immigrants'].str.replace('(\(\d\d\d\d\))', '')
df_img=df_img.iloc[:,[0,1,4]]
df_img = df_img.astype({"pc_immigrants": float})
df_img.rename(columns={'Country':'country'},inplace=True)
df_img.head()

Unnamed: 0,country,Number of immigrants,pc_immigrants
0,Russia,11048064,7.7
1,Germany,9845244,11.9
2,United Kingdom,7824131,12.4
3,France,7439086,11.6
4,Spain,5891208,9.6


### 1-4 : Combining the three criteria

#### *merge the table of the three criteria*

In [694]:
df=pd.merge(df_pop,df_gdp)
df=pd.merge(df,df_img)
df.head()

Unnamed: 0,city,country,population,gdp,Number of immigrants,pc_immigrants
0,Moscow,Russia,10381222,30682,11048064,7.7
1,Saint Petersburg,Russia,5028000,30682,11048064,7.7
2,Novosibirsk,Russia,1419007,30682,11048064,7.7
3,Yekaterinburg,Russia,1349772,30682,11048064,7.7
4,Nizhniy Novgorod,Russia,1284164,30682,11048064,7.7


#### *Caculate the score of each city*

##### Data normalization

In [695]:
from sklearn import preprocessing

In [696]:
Coln=df[['population','gdp','Number of immigrants','pc_immigrants']]

In [697]:
Coln= preprocessing.MinMaxScaler().fit(Coln).transform(Coln)
Coln

array([[1.00000000e+00, 3.12151045e-01, 1.00000000e+00, 2.50883392e-01],
       [4.76456687e-01, 3.12151045e-01, 1.00000000e+00, 2.50883392e-01],
       [1.23498384e-01, 3.12151045e-01, 1.00000000e+00, 2.50883392e-01],
       ...,
       [8.06847228e-04, 7.98975051e-01, 2.09695228e-01, 1.00000000e+00],
       [1.13430006e-02, 4.22184761e-01, 1.90565564e-02, 3.78091873e-01],
       [4.32411435e-03, 4.59015509e-01, 1.67000654e-02, 6.21908127e-01]])

In [698]:
dfn = pd.DataFrame(Coln,columns=["pop_n", "gdp_n","nb_immigrants_n","pc_immigrants_n"])

In [699]:
df["pop_n"]=dfn["pop_n"]
df["gdp_n"]=dfn["gdp_n"]
df["pc_immigrants_n"]=dfn["pc_immigrants_n"]

#### adding an adjusting factor of language
Africans often migrate to countries that have occupied their countries in the past

In [700]:
df['language']= 0
df.loc[df['country'] == 'France', 'language'] = 0.15
df.loc[df['country'] == 'United Kingdom', 'language'] = 0.1
df.loc[df['country'] == 'Spain', 'language'] = 0.15
df.loc[df['country'] == 'Italy', 'language'] = 0.15
df.loc[df['country'] == 'Belgium', 'language'] = 0.12

#### add and caculate the score column

In [701]:
df["score"]=(df["pop_n"]+df["gdp_n"]+df["pc_immigrants_n"]+df['language'])/4
df.head()

Unnamed: 0,city,country,population,gdp,Number of immigrants,pc_immigrants,pop_n,gdp_n,pc_immigrants_n,language,score
0,Moscow,Russia,10381222,30682,11048064,7.7,1.0,0.312151,0.250883,0.0,0.390759
1,Saint Petersburg,Russia,5028000,30682,11048064,7.7,0.476457,0.312151,0.250883,0.0,0.259873
2,Novosibirsk,Russia,1419007,30682,11048064,7.7,0.123498,0.312151,0.250883,0.0,0.171633
3,Yekaterinburg,Russia,1349772,30682,11048064,7.7,0.116727,0.312151,0.250883,0.0,0.16994
4,Nizhniy Novgorod,Russia,1284164,30682,11048064,7.7,0.110311,0.312151,0.250883,0.0,0.168336


###	1-4 : exctracting the 100's top cities

In [702]:
topCities=df.sort_values(by='score', ascending=False)[0:100]
topCities

Unnamed: 0,city,country,population,gdp,Number of immigrants,pc_immigrants,pop_n,gdp_n,pc_immigrants_n,language,score
493,Zurich,Switzerland,341730,66780,2335059,28.9,0.018141,0.798975,1.000000,0.0,0.454279
494,Geneve,Switzerland,183981,66780,2335059,28.9,0.002713,0.798975,1.000000,0.0,0.450422
495,Basel,Switzerland,164488,66780,2335059,28.9,0.000807,0.798975,1.000000,0.0,0.449945
127,London,United Kingdom,7556900,47042,7824131,12.4,0.723782,0.532785,0.416961,0.1,0.443382
447,Dublin,Ireland,1024027,81686,735535,15.9,0.084869,1.000000,0.540636,0.0,0.406376
...,...,...,...,...,...,...,...,...,...,...,...
152,Derby,United Kingdom,270468,47042,7824131,12.4,0.011172,0.532785,0.416961,0.1,0.265229
440,Copenhagen,Denmark,1153615,54564,556825,9.9,0.097543,0.634228,0.328622,0.0,0.265098
153,Plymouth,United Kingdom,260203,47042,7824131,12.4,0.010168,0.532785,0.416961,0.1,0.264978
154,Luton,United Kingdom,258018,47042,7824131,12.4,0.009954,0.532785,0.416961,0.1,0.264925


#### *show the ditribution of top citites by country*

In [703]:
topCities_byCountry = topCities.groupby(['country']).count()
topCities_byCountry

Unnamed: 0_level_0,city,population,gdp,Number of immigrants,pc_immigrants,pop_n,gdp_n,pc_immigrants_n,language,score
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Austria,6,6,6,6,6,6,6,6,6,6
Belgium,2,2,2,2,2,2,2,2,2,2
Cyprus,1,1,1,1,1,1,1,1,1,1
Denmark,1,1,1,1,1,1,1,1,1,1
France,18,18,18,18,18,18,18,18,18,18
Germany,19,19,19,19,19,19,19,19,19,19
Ireland,3,3,3,3,3,3,3,3,3,3
Italy,1,1,1,1,1,1,1,1,1,1
Netherlands,10,10,10,10,10,10,10,10,10,10
Norway,2,2,2,2,2,2,2,2,2,2


In [704]:
topCities=topCities[['city','country','population','gdp','pc_immigrants']]
topCities.head()

Unnamed: 0,city,country,population,gdp,pc_immigrants
493,Zurich,Switzerland,341730,66780,28.9
494,Geneve,Switzerland,183981,66780,28.9
495,Basel,Switzerland,164488,66780,28.9
127,London,United Kingdom,7556900,47042,12.4
447,Dublin,Ireland,1024027,81686,15.9


#### *adding longitude and latitude to cities*

In [823]:
# convert an address into latitude and longitude values
from geopy.geocoders import Nominatim
import warnings
warnings.filterwarnings('ignore')

In [824]:
# create the address column
topCities['adress']=topCities['city']+ ", "+topCities['country']
topCities.head()

Unnamed: 0,city,country,population,gdp,pc_immigrants,adress,long,lat
493,Zurich,Switzerland,341730,66780,28.9,"Zurich, Switzerland",8.542333,47.372394
494,Geneve,Switzerland,183981,66780,28.9,"Geneve, Switzerland",6.146601,46.201756
495,Basel,Switzerland,164488,66780,28.9,"Basel, Switzerland",7.587826,47.558108
127,London,United Kingdom,7556900,47042,12.4,"London, United Kingdom",-0.127647,51.507322
447,Dublin,Ireland,1024027,81686,15.9,"Dublin, Ireland",-6.260273,53.349764


In [744]:
#Function that calculates the long and lat coordinates from adress
geolocator = Nominatim(user_agent="myexplorer")
def longlat_adr(adr):
    location = geolocator.geocode(adr, timeout=3)
    longitude = location.longitude
    latitude = location.latitude
    return(longitude,latitude)

In [825]:
topCities['long'] = topCities['adress'].apply(lambda x: longlat_adr(x)[0])
topCities['lat'] = topCities['adress'].apply(lambda x: longlat_adr(x)[1])
topCities.head()

Unnamed: 0,city,country,population,gdp,pc_immigrants,adress,long,lat
493,Zurich,Switzerland,341730,66780,28.9,"Zurich, Switzerland",8.542333,47.372394
494,Geneve,Switzerland,183981,66780,28.9,"Geneve, Switzerland",6.146601,46.201756
495,Basel,Switzerland,164488,66780,28.9,"Basel, Switzerland",7.587826,47.558108
127,London,United Kingdom,7556900,47042,12.4,"London, United Kingdom",-0.127647,51.507322
447,Dublin,Ireland,1024027,81686,15.9,"Dublin, Ireland",-6.260273,53.349764


In [747]:
topCities

Unnamed: 0,city,country,population,gdp,pc_immigrants,adress,long,lat
493,Zurich,Switzerland,341730,66780,28.9,"Zurich, Switzerland",8.542333,47.372394
494,Geneve,Switzerland,183981,66780,28.9,"Geneve, Switzerland",6.146601,46.201756
495,Basel,Switzerland,164488,66780,28.9,"Basel, Switzerland",7.587826,47.558108
127,London,United Kingdom,7556900,47042,12.4,"London, United Kingdom",-0.127647,51.507322
447,Dublin,Ireland,1024027,81686,15.9,"Dublin, Ireland",-6.260273,53.349764
...,...,...,...,...,...,...,...,...
152,Derby,United Kingdom,270468,47042,12.4,"Derby, United Kingdom",-1.476149,52.921262
440,Copenhagen,Denmark,1153615,54564,9.9,"Copenhagen, Denmark",12.570072,55.686724
153,Plymouth,United Kingdom,260203,47042,12.4,"Plymouth, United Kingdom",-4.142566,50.371266
154,Luton,United Kingdom,258018,47042,12.4,"Luton, United Kingdom",-0.415284,51.878439


In [748]:
#setting a city name as the index for topcities table and save it new dataframe (dfc)
dfc=topCities.set_index('city')
dfc.head()

Unnamed: 0_level_0,country,population,gdp,pc_immigrants,adress,long,lat
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Zurich,Switzerland,341730,66780,28.9,"Zurich, Switzerland",8.542333,47.372394
Geneve,Switzerland,183981,66780,28.9,"Geneve, Switzerland",6.146601,46.201756
Basel,Switzerland,164488,66780,28.9,"Basel, Switzerland",7.587826,47.558108
London,United Kingdom,7556900,47042,12.4,"London, United Kingdom",-0.127647,51.507322
Dublin,Ireland,1024027,81686,15.9,"Dublin, Ireland",-6.260273,53.349764


###	1-5 : visualizing the 100's top cities

In [749]:
# map rendering library
import folium 

#### *Create a map of top cities*

In [848]:
# create map of Europe centered on median point of cities
long_c=dfc['long'].median()
lat_c=dfc['lat'].median()
map_europe = folium.Map(location=[lat_c, long_c], zoom_start=4)

# add markers to map
for lat, long, city in zip(dfc['lat'], dfc['long'], dfc['adress']):
    label = '{}'.format(city)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, long],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_europe)  
    
map_europe

# 2- Using Foursquare API to get the venues of each city

In [725]:
#Foursquare Credentials
CLIENT_ID = 'YZB3VJMI22ZJTC2QTHUI4RR0XMW2ULDKQIW151NEQJM4MU5R'
CLIENT_SECRET = '535ILXEU4MOZBEPTXWD3RX4FWI0S5RXU0ZOBBSNQHGADJL4D'
VERSION = '20200101'

## 2-1 defining the categories of venues 

In [711]:
categ={
        'University' : '4bf58dd8d48988d1ae941735',
        'Hospital' : '4bf58dd8d48988d196941735',
        'Business Center' : '56aa371be4b08b9a8d573517',
        'Cultural Center' : '52e81612bcbc57f1066b7a32',
        'Embassy Consulate' : '4bf58dd8d48988d12c951735',
        'Bank' : '4bf58dd8d48988d10a951735',
        'Train Station' : '4bf58dd8d48988d129951735',
        'Gym Center' : '4bf58dd8d48988d175941735'
    }

## 2-2 getting the venues for the cities

In [712]:
# the exploring of venues is done based on the city's adress using the parameter 'near'
def count_categ_city(city_adr,categ_id):
    LIMIT = 300 # limit of number of venues returned by Foursquare API
# create URL
    url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&limit={}&near={}&categoryId={}'.format(
        CLIENT_ID, 
        CLIENT_SECRET, 
        VERSION,
        LIMIT,
        city_adr,
        categ_id
    )
    results = requests.get(url).json()
    try :
        return results['response']['totalResults']
    except:
        return 0

In [713]:
cities_adr=dfc['adress'].tolist()

In [726]:
#function to get the number of venues for each city
dv=dict()
for city_adr in cities_adr:
    city=city_adr.split(',')[0]
    dv[city]=[]
    for category in categ.keys():
        dv[city].append((count_categ_city(city,categ[category])))

In [751]:
#converting the results to dataframe
df_data=pd.DataFrame.from_dict(dv, orient='index',columns=list(categ.keys()))
df_data.head()

Unnamed: 0,University,Hospital,Business Center,Cultural Center,Embassy Consulate,Bank,Train Station,Gym Center
Zurich,32,33,4,10,11,66,79,88
Geneve,17,17,11,5,5,58,18,44
Basel,9,12,11,8,8,44,26,26
London,147,122,45,18,97,185,257,217
Dublin,54,15,10,1,13,46,12,79


## 2-3 Merging the venues properties with the socio-economic caracteristics

In [754]:
df_data=df_data.reset_index()
df_data.rename(columns={'index':'city'},inplace=True)
dfdata=pd.merge(df_data,dfc)

Unnamed: 0,city,University,Hospital,Business Center,Cultural Center,Embassy Consulate,Bank,Train Station,Gym Center
0,Zurich,32,33,4,10,11,66,79,88
1,Geneve,17,17,11,5,5,58,18,44
2,Basel,9,12,11,8,8,44,26,26
3,London,147,122,45,18,97,185,257,217
4,Dublin,54,15,10,1,13,46,12,79


In [756]:
dfdata=pd.merge(df_data,topCities)
dfdata.head()

Unnamed: 0,city,University,Hospital,Business Center,Cultural Center,Embassy Consulate,Bank,Train Station,Gym Center,country,population,gdp,pc_immigrants,adress,long,lat
0,Zurich,32,33,4,10,11,66,79,88,Switzerland,341730,66780,28.9,"Zurich, Switzerland",8.542333,47.372394
1,Geneve,17,17,11,5,5,58,18,44,Switzerland,183981,66780,28.9,"Geneve, Switzerland",6.146601,46.201756
2,Basel,9,12,11,8,8,44,26,26,Switzerland,164488,66780,28.9,"Basel, Switzerland",7.587826,47.558108
3,London,147,122,45,18,97,185,257,217,United Kingdom,7556900,47042,12.4,"London, United Kingdom",-0.127647,51.507322
4,Dublin,54,15,10,1,13,46,12,79,Ireland,1024027,81686,15.9,"Dublin, Ireland",-6.260273,53.349764


# 3- Clustering cities

### Organizing the dataframe of cities containing the main features to use for clustering

In [757]:
dfdata.columns

Index(['city', 'University', 'Hospital', 'Business Center', 'Cultural Center',
       'Embassy Consulate', 'Bank', 'Train Station', 'Gym Center', 'country',
       'population', 'gdp', 'pc_immigrants', 'adress', 'long', 'lat'],
      dtype='object')

In [758]:
data_cl=dfdata[['city', 'University', 'Hospital', 'Business Center', 'Cultural Center',
       'Embassy Consulate', 'Bank', 'Train Station', 'Gym Center', 'population', 'gdp', 'pc_immigrants']]
data_cl.head()

Unnamed: 0,city,University,Hospital,Business Center,Cultural Center,Embassy Consulate,Bank,Train Station,Gym Center,population,gdp,pc_immigrants
0,Zurich,32,33,4,10,11,66,79,88,341730,66780,28.9
1,Geneve,17,17,11,5,5,58,18,44,183981,66780,28.9
2,Basel,9,12,11,8,8,44,26,26,164488,66780,28.9
3,London,147,122,45,18,97,185,257,217,7556900,47042,12.4
4,Dublin,54,15,10,1,13,46,12,79,1024027,81686,15.9


In [759]:
data_cl.set_index('city',inplace=True)
data_cl.head()

Unnamed: 0_level_0,University,Hospital,Business Center,Cultural Center,Embassy Consulate,Bank,Train Station,Gym Center,population,gdp,pc_immigrants
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Zurich,32,33,4,10,11,66,79,88,341730,66780,28.9
Geneve,17,17,11,5,5,58,18,44,183981,66780,28.9
Basel,9,12,11,8,8,44,26,26,164488,66780,28.9
London,147,122,45,18,97,185,257,217,7556900,47042,12.4
Dublin,54,15,10,1,13,46,12,79,1024027,81686,15.9


### Data Normalization

In [799]:
data_cln= preprocessing.MinMaxScaler().fit(data_cl).transform(data_cl)
df_data_cl = pd.DataFrame(data_cln,index=list_city , columns=['University', 'Hospital', 'Business Center', 'Cultural Center',
       'Embassy Consulate', 'Bank', 'Train Station', 'Gym Center', 'population', 'gdp', 'pc_immigrants'])
df_data_cl.head()

Unnamed: 0,University,Hospital,Business Center,Cultural Center,Embassy Consulate,Bank,Train Station,Gym Center,population,gdp,pc_immigrants
Zurich,0.217687,0.130631,0.008621,0.140845,0.084615,0.234875,0.307393,0.389381,0.017919,0.707748,1.0
Geneve,0.115646,0.058559,0.038793,0.070423,0.038462,0.206406,0.070039,0.19469,0.002488,0.707748,1.0
Basel,0.061224,0.036036,0.038793,0.112676,0.061538,0.156584,0.101167,0.115044,0.000581,0.707748,1.0
London,1.0,0.531532,0.185345,0.253521,0.746154,0.658363,1.0,0.960177,0.72372,0.320759,0.221698
Dublin,0.367347,0.04955,0.034483,0.014085,0.1,0.163701,0.046693,0.349558,0.084662,1.0,0.386792


### Applying the k-means Clustering

In [785]:
# import k-means from clustering
from sklearn.cluster import KMeans

In [786]:
# set number of clusters
kclusters = 5

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(df_data_cl)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([2, 2, 2, 1, 2, 1, 2, 2, 2, 2, 4, 3, 3, 4, 3, 0, 3, 0, 0, 0, 0, 3,
       0, 3, 3, 0, 0, 3, 3, 4, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0,
       0, 0, 0, 3, 0, 0, 0, 0, 0, 3, 0, 0, 3, 0, 0, 0, 3, 0, 3, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0,
       0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 0, 0])

In [800]:
df_data_cl['Cluster']=kmeans.labels_
df_data_cl.head()

Unnamed: 0,University,Hospital,Business Center,Cultural Center,Embassy Consulate,Bank,Train Station,Gym Center,population,gdp,pc_immigrants,Cluster
Zurich,0.217687,0.130631,0.008621,0.140845,0.084615,0.234875,0.307393,0.389381,0.017919,0.707748,1.0,2
Geneve,0.115646,0.058559,0.038793,0.070423,0.038462,0.206406,0.070039,0.19469,0.002488,0.707748,1.0,2
Basel,0.061224,0.036036,0.038793,0.112676,0.061538,0.156584,0.101167,0.115044,0.000581,0.707748,1.0,2
London,1.0,0.531532,0.185345,0.253521,0.746154,0.658363,1.0,0.960177,0.72372,0.320759,0.221698,1
Dublin,0.367347,0.04955,0.034483,0.014085,0.1,0.163701,0.046693,0.349558,0.084662,1.0,0.386792,2


### Visualizing the resulting clusters

In [801]:
df_data_cl.reset_index(inplace=True)
df_data_cl.rename(columns={'index':'city'},inplace=True)


Unnamed: 0,city,University,Hospital,Business Center,Cultural Center,Embassy Consulate,Bank,Train Station,Gym Center,population,gdp,pc_immigrants,Cluster
0,Zurich,0.217687,0.130631,0.008621,0.140845,0.084615,0.234875,0.307393,0.389381,0.017919,0.707748,1.0,2
1,Geneve,0.115646,0.058559,0.038793,0.070423,0.038462,0.206406,0.070039,0.19469,0.002488,0.707748,1.0,2
2,Basel,0.061224,0.036036,0.038793,0.112676,0.061538,0.156584,0.101167,0.115044,0.000581,0.707748,1.0,2
3,London,1.0,0.531532,0.185345,0.253521,0.746154,0.658363,1.0,0.960177,0.72372,0.320759,0.221698,1
4,Dublin,0.367347,0.04955,0.034483,0.014085,0.1,0.163701,0.046693,0.349558,0.084662,1.0,0.386792,2


In [839]:
dfdatacl=pd.merge(df_data_cl,topCities[['city','long','lat','country']],on='city')
dfdatacl.head()

Unnamed: 0,city,University,Hospital,Business Center,Cultural Center,Embassy Consulate,Bank,Train Station,Gym Center,population,gdp,pc_immigrants,Cluster,long,lat,country
0,Zurich,0.217687,0.130631,0.008621,0.140845,0.084615,0.234875,0.307393,0.389381,0.017919,0.707748,1.0,2,8.542333,47.372394,Switzerland
1,Geneve,0.115646,0.058559,0.038793,0.070423,0.038462,0.206406,0.070039,0.19469,0.002488,0.707748,1.0,2,6.146601,46.201756,Switzerland
2,Basel,0.061224,0.036036,0.038793,0.112676,0.061538,0.156584,0.101167,0.115044,0.000581,0.707748,1.0,2,7.587826,47.558108,Switzerland
3,London,1.0,0.531532,0.185345,0.253521,0.746154,0.658363,1.0,0.960177,0.72372,0.320759,0.221698,1,-0.127647,51.507322,United Kingdom
4,Dublin,0.367347,0.04955,0.034483,0.014085,0.1,0.163701,0.046693,0.349558,0.084662,1.0,0.386792,2,-6.260273,53.349764,Ireland


In [847]:
# create map
map_clusters = folium.Map(location=[lat_c, long_c], zoom_start=4)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, city, cluster in zip(dfdatacl['lat'], dfdatacl['long'], dfdatacl['city'], dfdatacl['Cluster']):
    label = folium.Popup(str(city) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(cluster-1)],
        fill=True,
        fill_color=rainbow[int(cluster-1)],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

# 4- Resulting Clusters

### List of Cities of Cluster 4

In [840]:
dfdatacl.loc[dfdatacl['Cluster'] == 4][['city','country']]

Unnamed: 0,city,country
10,Berlin,Germany
13,Paris,France
29,The Hague,Netherlands
86,Bradford,United Kingdom


### List of Cities of Cluster 3

In [841]:
dfdatacl.loc[dfdatacl['Cluster'] == 3][['city','country']]

Unnamed: 0,city,country
11,Vienna,Austria
12,Stockholm,Sweden
14,Madrid,Spain
16,Hamburg,Germany
21,Donaustadt,Austria
23,Munich,Germany
24,Amsterdam,Netherlands
27,Rome,Italy
28,Brussels,Belgium
32,Liverpool,United Kingdom


### List of Cities of Cluster 2

In [842]:
dfdatacl.loc[dfdatacl['Cluster'] == 2][['city','country']]

Unnamed: 0,city,country
0,Zurich,Switzerland
1,Geneve,Switzerland
2,Basel,Switzerland
4,Dublin,Ireland
6,Cork,Ireland
7,Dun Laoghaire,Ireland
8,Oslo,Norway
9,Bergen,Norway


### List of Cities of Cluster 1

In [844]:
dfdatacl.loc[dfdatacl['Cluster'] == 1][['city','country']]

Unnamed: 0,city,country
3,London,United Kingdom
5,Moscow,Russia


### List of Cities of Cluster 0

In [846]:
dfdatacl.loc[dfdatacl['Cluster'] == 0]['city'].tolist()


['Goeteborg',
 'Malmoe',
 'Graz',
 'Linz',
 'Favoriten',
 'Floridsdorf',
 'Rotterdam',
 'Birmingham',
 'Koeln',
 'Marseille',
 'Nottingham',
 'Utrecht',
 'Sheffield',
 'Tilburg',
 'Groningen',
 'Bristol',
 'Breda',
 'Nijmegen',
 'Frankfurt am Main',
 'Glasgow',
 'Nicosia',
 'Lyon',
 'Stuttgart',
 'Dortmund',
 'Duesseldorf',
 'Toulouse',
 'Leicester',
 'Bremen',
 'Edinburgh',
 'Leeds',
 'Leipzig',
 'Duisburg',
 'Nuernberg',
 'Nice',
 'Manchester',
 'Stoke-on-Trent',
 'Antwerpen',
 'Coventry',
 'Nantes',
 'Wandsbek',
 'Strasbourg',
 'Sunderland',
 'Bochum',
 'Montpellier',
 'Birkenhead',
 'Bochum-Hordel',
 'Islington',
 'Reading',
 'Copenhagen',
 'Kingston upon Hull',
 'Preston',
 'Bordeaux',
 'Lille',
 'Newport',
 'Wuppertal',
 'Swansea',
 'Southend-on-Sea',
 'Rennes',
 'Reims',
 'Belfast',
 'Derby',
 'Le Havre',
 'Cergy-Pontoise',
 'Saint-Etienne']

# The END!