# Cities Scoring - Clustering

This application aims to compare major European cities on different parameters. 

Choose the parameters to use for the comparison in the Features Selector menu. 
You have to select at least two of them. When three or more parameters are selected, a dimension reduction technique (PCA) is applied so that the cities can be visualized in a 3D space. The second Features Selector menu gathers the parameters that are related to competitive pressure. Some of these parameters (#scooter service, #moped service, #bike service) are binary and should be chosen with care.

A clustering is then performed. The quality of this clustering is measured with a silhouette score, which ranges from -1 (worst) to 1 (best). You can choose the number of clusters yourself. Otherwise, the cities are partitioned into k clusters, where k is the value between 2 and 7 that gives the best score.

We also provide, for each cluster, the average value of the Nb. of Loc, the average Rotation and the max Rotation of the cities present in this cluster. If there is no operator in any of the cities in the cluster, target values are NaN.

The raw data is given at the end.

## Hold down the Control key to select several parameters:

In [23]:
import pandas as pd
import numpy as np
import voila
import ipywidgets as widgets
import IPython
from ipywidgets import interact, interactive, fixed, interact_manual
pd.options.mode.chained_assignment = None  # default='warn'
pd.set_option('display.max_columns', None)
import warnings
warnings.filterwarnings("ignore")

import seaborn as sns
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import plotly.express as px
import plotly.graph_objects as go

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN
from sklearn.mixture import GaussianMixture
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier

from sklearn.metrics import silhouette_score
from scipy.signal import argrelextrema


def normalize_data(df):
    """Return a rescaled copy of ``df`` that keeps its row and column labels.

    Every column is passed through a scikit-learn ``MinMaxScaler`` created
    with its default settings; the scaler is fitted on ``df`` itself.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose columns can all be handled by ``MinMaxScaler``.

    Returns
    -------
    pandas.DataFrame
        New frame with the scaled values, indexed and labelled like ``df``.
    """
    scaled_values = MinMaxScaler().fit_transform(df)
    # fit_transform returns a bare array, so the labels are restored here.
    return pd.DataFrame(scaled_values, index=df.index, columns=df.columns)

# Columns of the raw sheet that are removed before any processing
# (passed to ``data_preprocessing``). Each label is listed exactly once;
# the spelling (double spaces, trailing spaces) must match the sheet headers.
drop_columns = [
    "Statut",
    "Tiering",
    "how to drive",
    "relief",
    "part de l'environnement",
    "ville innovante",
    "Annual income per capita",
    "implementation period",
    "condition of admissions",
    "Number of  yearly trip per inhabitant in PT 2018",
    "Commute distance by public transit (km)",
    "Year 1st moped service",
    "Mopeds / active capita",
    "motorization rate (#vehicles per 100 inhab.)",
    "Commute time by public transit (min)",
    "Average price of 2h parking ",
    "average precipitation per year (mm)",
]

def data_preprocessing(df, drop_columns):
    """Clean the raw city table and return it indexed by city.

    The input frame is not modified; all work is done on derived frames.

    Steps, in order:

    1. Drop the columns named in ``drop_columns``, then every column and
       every row that is entirely empty.
    2. Encode the three ``Yes``/``No`` service columns as 1/0; anything
       else (including missing values) becomes 0.
    3. Replace missing values with 0 in the competition count columns.
    4. Replace missing values with the column mean in the tourist, road
       quality, bicycle index and sharing score columns.
    5. Rename a few columns to their display names and use ``City`` as
       the index.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table containing at least the columns referenced below and a
        ``City`` column.
    drop_columns : list of str
        Column labels to remove first. A label that is absent from ``df``
        makes ``DataFrame.drop`` raise ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        The cleaned table, indexed by ``City``.
    """
    df = df.drop(drop_columns, axis=1)
    df = df.dropna(axis=1, how="all")
    df = df.dropna(how="all")

    # Assign the filled column back instead of calling
    # ``df[col].fillna(..., inplace=True)``: the in-place form operates on a
    # column selection, which is not guaranteed to write through to ``df``
    # (it does not under pandas Copy-on-Write).
    yes_no_codes = {'Yes': 1, 'No': 0}
    for column in ("#scooter service", "#bike service", "#moped service"):
        df[column] = df[column].map(yes_no_codes).fillna(0)

    for column in ("#competitor mopeds ", "#competitors",
                   "#mobility service available"):
        df[column] = df[column].fillna(0)

    # Mean imputation: each column is filled with its own mean.
    for column in ("tourist by year 2019", 'Road Quality  (coya)',
                   'Coya Bicycle Index', 'Coya Sharing Score'):
        df[column] = df[column].fillna(df[column].mean())

    df = df.rename({'Country Environmental Awareness Indicator':"Country Environmental Awareness",
                   "Covid Stringency Mean Index (by country)":"Covid Stringency Index",
                   'density (capita/square)':'Density (capita/square)',
                   'income per capita per month (net€)':'Income per capita per month (net€)',
                   'congestion rate (2019)':'Congestion rate (2019)',
                   'average temperature':'Average temperature',
                   'tourist by year 2019':'Tourist by year (2019)'}, axis='columns')

    df = df.set_index('City')

    return df

# NOTE(review): the spreadsheet load below is commented out, so `df_excel`
# must already exist in the session before this section runs — confirm how
# it is provided when the notebook is served.
#df_excel = pd.read_excel('..\Downloads\City Scoring Dashboard - Steve (4).xlsx', sheet_name='Critérisation')

# `df_init` starts as an alias of `df_excel` (no copy), so the header
# assignment on the next line also relabels `df_excel`'s columns.
df_init = df_excel
# The row at position 1 supplies the column labels; rows labelled 2 and
# onward are kept as data.
df_init.columns = df_init.iloc[1]
df_init = df_init.loc[2:]
df_init = data_preprocessing(df_init, drop_columns)

# Keep the three target columns aside before removing them from the feature
# table; `get_clusters` reads these module-level series.
Y_avg = df_init["Average in 2020"]
Y_max = df_init["Max 2020"]
Y_tot = df_init["Total 2020-2021"]

df_init.drop(["Average in 2020","Max 2020","Total 2020-2021"],axis=1,inplace=True)
# Features offered in the second selector; every remaining column goes to
# the first selector.
adversarial_features = ["#scooter service","#bike service","#moped service","#competitor mopeds ","#competitors","#mobility service available"]
other_features = list(df_init.drop(adversarial_features,axis=1).columns)


# Rescaled copy of the feature table; `run_your_clustering` selects its
# columns from this frame.
df_normal = normalize_data(df_init)

def reduce_dim(df,dim):
    """Project ``df`` onto ``dim`` principal components.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to reduce.
    dim : int
        Value passed to ``PCA`` as its number of components.

    Returns
    -------
    pandas.DataFrame
        The PCA output, one row per row of ``df`` and sharing its index;
        columns carry the default integer labels.
    """
    projector = PCA(dim, svd_solver='full')
    components = projector.fit_transform(df)
    return pd.DataFrame(components, index=df.index)

def _kmeans_labels_and_score(data, nb_clusters):
    """Fit K-Means with ``nb_clusters`` clusters; return ``(labels, silhouette score)``."""
    kmeans = KMeans(nb_clusters, random_state=42)
    kmeans.fit(data)
    labels = kmeans.predict(data)
    return labels, silhouette_score(data, labels, metric='euclidean')


def get_clusters(df,nb_features,k_selected,features_selected):
    """Cluster the rows of ``df`` with K-Means and report the result.

    The function prints the silhouette score and the members of each
    cluster, displays a table of per-cluster averages, and returns the
    labelled frame used for plotting.

    It reads the module-level ``Y_avg``, ``Y_max``, ``Y_tot`` and ``df_init``
    objects and relies on a ``display`` callable being available in the
    running environment (it is not imported in this file).

    Parameters
    ----------
    df : pandas.DataFrame
        Coordinates to cluster, one row per city. The caller's frame is left
        untouched; a copy is relabelled and extended.
    nb_features : int
        Number of coordinate columns in ``df``; these columns (relabelled
        ``0 .. nb_features - 1``) are left out of the averages table.
    k_selected : int or "Default"
        Number of clusters. With ``"Default"``, every k from 2 to 7 is tried
        and the one with the highest silhouette score is kept.
    features_selected : list of str
        Columns of ``df_init`` to append, with their original values, to the
        returned frame.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with integer coordinate labels plus the columns
        ``Label``, ``City``, ``Rotation avg``, ``Rotation max``, ``Nb. Loc``
        and the selected features.
    """
    # Work on a copy so the relabelling below never leaks into the caller.
    df = df.copy()
    df.columns = list(range(len(df.columns)))

    if k_selected == "Default":
        candidates = [_kmeans_labels_and_score(df, nb_clusters)
                      for nb_clusters in np.arange(2,8)]
        # argmax keeps the first (smallest) k when several scores tie.
        best_position = int(np.argmax([score for _, score in candidates]))
        best_labels, max_score = candidates[best_position]
    else:
        best_labels, max_score = _kmeans_labels_and_score(df, k_selected)

    df["Label"] = best_labels
    df["City"] = df.index
    df["Rotation avg"] = Y_avg.astype("float64")
    df["Rotation max"] = Y_max.astype("float64")
    df["Nb. Loc"] = Y_tot.astype("float64")
    df[features_selected] = df_init[features_selected].astype("float64")

    # numeric_only=True leaves the text column "City" out of the mean;
    # without it, recent pandas versions raise TypeError on that column.
    clusters_target_values = (
        df.groupby("Label")
        .mean(numeric_only=True)
        .drop(columns=list(range(nb_features)))
        .round(2)
    )
    clusters_target_values["Nb. Loc"] = clusters_target_values["Nb. Loc"].apply(lambda x : "{:e}".format(x))
    # Name each row after its actual cluster label so the table always lines
    # up with the "- cluster N" lines printed below.
    clusters_target_values.index = [f' Cluster {label}' for label in clusters_target_values.index]

    print(f'Clustering score {round(max_score,3)}')
    print()

    # Sorted so clusters are always listed in ascending label order.
    for label in sorted(set(best_labels)):
        cluster = list(df.index[df["Label"] == label])
        print(f'- cluster {label} : {cluster}')
        print()

    display(clusters_target_values)

    return (df)

def visualize_3D(df):
    """Show a 3D scatter plot of the clustered cities.

    ``df`` must provide the coordinate columns ``0``, ``1`` and ``2`` plus
    the ``Label`` (point colour) and ``City`` (point text) columns.
    Nothing is returned; the figure is shown directly.
    """
    scatter_options = dict(
        x=0,
        y=1,
        z=2,
        color="Label",
        text="City",
        width=1000,
        height=600,
    )
    # update_layout hands back the figure, so the calls can be chained.
    px.scatter_3d(df, **scatter_options).update_layout(showlegend=False).show()
    
def visualize_2D(df):
    """Show a 2D scatter plot of the clustered cities.

    The first two columns of ``df`` are used as the x and y axes; the
    ``Label`` column sets the point colour and ``City`` the point text,
    which is placed above each marker. Nothing is returned; the figure is
    shown directly.
    """
    x_column, y_column = df.columns[0], df.columns[1]
    figure = px.scatter(
        df,
        x=x_column,
        y=y_column,
        color="Label",
        text="City",
        width=1000,
        height=600,
    )
    # Both update_* methods hand back the figure, so the calls can be chained.
    figure.update_traces(textposition='top center').update_layout(showlegend=False).show()
        
def run_your_clustering(features_selected_1,features_selected_2,k_selected):
    """Callback of the interactive widgets: cluster and plot the cities.

    Parameters
    ----------
    features_selected_1, features_selected_2 : iterable of str
        Column names picked in the two feature selectors; they are
        concatenated in that order.
    k_selected : int or "Default"
        Cluster count forwarded to ``get_clusters``.

    With fewer than two features a message is printed and nothing else
    happens. With exactly two, the clustering runs on the two normalised
    columns and is drawn in 2D. With three or more, the normalised columns
    are first reduced to three dimensions and the result is drawn in 3D.
    """
    # Blank lines printed ahead of the report.
    print('\n' * 3)

    features_selected = [*features_selected_1, *features_selected_2]
    selection_size = len(features_selected)

    if selection_size < 2:
        print("Choose at least 2 parameters")
        return

    df_select = df_normal[features_selected]

    if selection_size == 2:
        cluster_df = get_clusters(df_select,nb_features = 2,k_selected=k_selected,features_selected=features_selected)
        visualize_2D(cluster_df)
        return

    PCA_df = reduce_dim(df_select,3)
    cluster_df = get_clusters(PCA_df,nb_features = 3,k_selected=k_selected,features_selected=features_selected)
    visualize_3D(cluster_df)
    
    

# First selector: the features that are not competition-related, with three
# of them pre-selected.
general_features_picker = widgets.SelectMultiple(
    description='Features selected',
    options=other_features,
    value=['Density (capita/square)','Income per capita per month (net€)','Congestion rate (2019)'],
    rows=16,
    disabled=False,
)

# Second selector: the competition-related features, none pre-selected.
competition_features_picker = widgets.SelectMultiple(
    description='Features selected',
    options=adversarial_features,
    value=[],
    rows=6,
    disabled=False,
)

# Cluster count: "Default" lets the clustering routine pick k itself.
cluster_count_picker = widgets.Dropdown(
    description='K Value:',
    options=["Default",2,3,4,5,6],
    value="Default",
    disabled=False,
)

# Re-run the clustering whenever one of the widgets changes.
interact(
    run_your_clustering,
    features_selected_1=general_features_picker,
    features_selected_2=competition_features_picker,
    k_selected=cluster_count_picker,
);


# Raw (non-normalised) feature table shown after the widgets.
display(df_init)

interactive(children=(SelectMultiple(description='Features selected', index=(0, 1, 2), options=('Density (capi…

1,Density (capita/square),Income per capita per month (net€),Congestion rate (2019),% active population,Average temperature,#mobility service available,#scooter service,#bike service,#moped service,#competitors,#competitor mopeds,Tourist by year (2019),Covid Stringency Index,Superficie (km2),Raining days,Avg cost of a monthly pass for PT,Safety score,Coya Sharing Score,Coya Bicycle Index,Road Quality (coya),Country Environmental Awareness
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
amsterdam,5200,3501.5,0.26,0.82,10.7,2,0.0,0.0,1.0,2,371.010753,8835400.0,58.0,219,132,97.5,66.32,33.0,60.24,61.71,64.7
rotterdam,1996,3227.75,0.25,0.794,10.8,3,0.0,1.0,1.0,3,918.774194,1200000.0,58.0,324,131,80.0,64.13,30.703704,35.786296,46.467778,64.7
bordeaux,5225,3072.25,0.32,0.737,13.8,3,1.0,1.0,1.0,2,193.60215,1490000.0,59.5,49,173,45.0,56.62,77.0,45.42,59.4,53.4
barcelona,16043,2871.83,0.29,0.806,15.5,3,1.0,1.0,1.0,10,5281.930108,7016600.0,62.1,102,55,40.0,54.49,32.0,33.7,54.15,50.6
paris,20386,3862.58,0.39,0.815,11.7,4,1.0,1.0,1.0,2,2961.0,19087900.0,59.5,105,111,75.0,46.81,81.0,37.53,58.51,53.4
berlin,4207,3048.0,0.32,0.839,10.1,4,1.0,1.0,1.0,3,1188.77957,6195800.0,60.9,892,106,82.0,58.86,17.0,42.59,50.79,69.8
the_hague,5441,3778.67,0.28,0.794,10.9,3,1.0,1.0,1.0,3,317.483871,1000000.0,58.0,98,198,52.5,74.84,30.703704,35.786296,46.467778,64.7
rome,2239,2791.92,0.38,0.739,15.8,3,1.0,0.0,1.0,4,1373.11828,10317000.0,67.2,1285,78,35.0,47.87,2.0,27.03,34.17,46.1
eindhoven,2533,3148.5,0.22,0.804,10.9,1,0.0,0.0,1.0,2,101.994624,770000.0,58.0,89,154,89.5,75.95,30.703704,35.786296,46.467778,64.7
hamburg,2414,3337.67,0.34,0.859,9.8,4,1.0,1.0,1.0,2,331.537634,5558457.0,60.9,755,129,91.5,56.74,27.0,44.97,54.9,69.8
