# Cities Scoring - Clustering

## Hold down the Control (Ctrl) key to select multiple features:

In [78]:
import pandas as pd
import numpy as np
import voila
import ipywidgets as widgets
import IPython
from ipywidgets import interact, interactive, fixed, interact_manual
pd.options.mode.chained_assignment = None  # default='warn'

import seaborn as sns
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import plotly.express as px
import plotly.graph_objects as go

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN
from sklearn.mixture import GaussianMixture
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier

from sklearn.metrics import silhouette_score
from scipy.signal import argrelextrema

def normalize_data(df):
    """Rescale every column of ``df`` with a freshly fitted ``MinMaxScaler``.

    The scaler is created with its default settings and fitted on ``df``
    itself.  The result is a new DataFrame that carries the scaled values
    under the same column labels and the same row index as ``df``.
    """
    scaled_values = MinMaxScaler().fit_transform(df)
    return pd.DataFrame(scaled_values, columns=df.columns, index=df.index)

# Column labels removed from the raw sheet before clustering (handed to
# ``data_preprocessing``).  Each label is matched literally against the frame's
# column labels, so the doubled and trailing spaces in some of them are
# significant and must be kept.
drop_columns = [
    "Statut",
    "Tiering",
    "how to drive",
    "relief",
    "part de l'environnement",
    "ville innovante",
    "Annual income per capita",
    "implementation period",
    "condition of admissions",
    "#moped service",
    "Number of  yearly trip per inhabitant in PT 2018",
    "Commute distance by public transit (km)",
    "Year 1st moped service",
    "Mopeds / active capita",
    "#scooter service",
    "#bike service",
    "motorization rate (#vehicles per 100 inhab.)",
    "#competitor mopeds ",
    "#competitors",
    "#mobility service available",
    "Commute time by public transit (min)",
    "Average price of 2h parking ",
    "average precipitation per year (mm)",
]

def data_preprocessing(df, drop_columns):
    """Clean the raw city table and index it by city.

    Steps, in order:

    1. remove the columns listed in ``drop_columns``;
    2. remove columns, then rows, that contain only missing values;
    3. fill the remaining gaps of four score columns with that column's mean;
    4. rename several columns to their display labels;
    5. make the ``City`` column the index.

    Args:
        df: Raw DataFrame.  It must contain ``City``, the four mean-imputed
            columns and every label in ``drop_columns``.
        drop_columns: Column labels to remove.

    Returns:
        A new, cleaned DataFrame indexed by ``City``; ``df`` is not modified.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    mean_imputed_columns = (
        "tourist by year 2019",
        'Road Quality  (coya)',
        'Coya Bicycle Index',
        'Coya Sharing Score',
    )
    display_names = {
        'Country Environmental Awareness Indicator': "Country Environmental Awareness",
        "Covid Stringency Mean Index (by country)": "Covid Stringency Index",
        'density (capita/square)': 'Density (capita/square)',
        'income per capita per month (net€)': 'Income per capita per month (net€)',
        'congestion rate (2019)': 'Congestion rate (2019)',
        'average temperature': 'Average temperature',
        'tourist by year 2019': 'Tourist by year (2019)',
    }

    df = df.drop(drop_columns, axis=1)
    df = df.dropna(axis=1, how="all")
    df = df.dropna(how="all")

    # Fill through DataFrame.fillna rather than
    # `df[col].fillna(..., inplace=True)`: the in-place form acts on a
    # temporary column object and is not guaranteed to write back into `df`.
    column_means = {column: df[column].mean() for column in mean_imputed_columns}
    df = df.fillna(column_means)

    df = df.rename(display_names, axis='columns')
    return df.set_index('City')

# Load the criteria sheet.  The path is a raw string because it contains
# backslashes: "\D" and "\C" are not valid escape sequences in a normal
# string literal.
df_excel = pd.read_excel(r'..\Downloads\City Scoring Dashboard - Steve (4).xlsx', sheet_name='Critérisation')

# The row at position 1 of the loaded frame holds the column labels, and the
# rows from label 2 onwards are kept as data.
# NOTE: `df` is the same object as `df_excel` here, so assigning the labels
# also relabels `df_excel`.
df = df_excel
df.columns = df.iloc[1]
df = df.loc[2:]
df = data_preprocessing(df, drop_columns)

# Target columns, kept aside for the per-cluster summary in `get_clusters`.
Y_avg = df["Average in 2020"]
Y_max = df["Max 2020"]
Y_tot = df["Total 2020-2021"]

# The targets are removed so they are not part of the features to cluster on.
df.drop(["Average in 2020","Max 2020","Total 2020-2021"],axis=1,inplace=True)


df_normal = normalize_data(df)

def reduce_dim(df,dim):
    """Project ``df`` onto ``dim`` principal components.

    A PCA with ``svd_solver='full'`` is fitted on ``df``.  The transformed
    coordinates are returned as a DataFrame with default integer column
    labels and the row index of ``df``.
    """
    pca = PCA(n_components=dim, svd_solver='full')
    components = pca.fit_transform(df)
    return pd.DataFrame(components, index=df.index)

def get_clusters(df, nb_features=None):
    """Cluster the rows of ``df`` with K-means and report the best partition.

    K-means (``random_state=42``) is fitted for every cluster count from 2
    to 9, capped at ``len(df) - 1`` because the silhouette score needs fewer
    clusters than rows.  The partition with the highest silhouette score is
    kept.  The function prints that score and the members of each cluster,
    displays the per-cluster means of the target values, and returns the
    labelled data.

    The target values come from the module-level series ``Y_avg``, ``Y_max``
    and ``Y_tot``, which are aligned to ``df`` on its index.

    Args:
        df: Numeric feature DataFrame, one row per city.  It is not modified.
        nb_features: Number of leading feature columns left out of the
            displayed summary table.  Defaults to every column of ``df``.

    Returns:
        A copy of ``df`` whose feature columns are relabelled ``0..n-1``,
        extended with the columns "Label", "City" (the index), "Rotation avg",
        "Rotation max" and "Nb. Loc".

    Raises:
        ValueError: If ``df`` has fewer than 3 rows.
    """
    # Work on a copy so the caller's frame keeps its own labels and columns.
    df = df.copy()
    df.columns = list(range(len(df.columns)))

    if nb_features is None:
        nb_features = len(df.columns)

    max_clusters = min(9, len(df) - 1)
    if max_clusters < 2:
        raise ValueError("get_clusters needs at least 3 rows to compare clusterings")

    labels = []
    scores = []
    for nb_clusters in range(2, max_clusters + 1):
        kmeans = KMeans(nb_clusters, random_state=42)
        label = kmeans.fit(df).predict(df)
        labels.append(label)
        scores.append(silhouette_score(df, label, metric='euclidean'))

    max_idx = int(np.argmax(scores))
    max_score = scores[max_idx]
    best_labels = labels[max_idx]

    df["Label"] = best_labels
    df["City"] = df.index
    df["Rotation avg"] = Y_avg.astype("float64")
    df["Rotation max"] = Y_max.astype("float64")
    df["Nb. Loc"] = Y_tot.astype("float64")

    # numeric_only=True keeps the text "City" column out of the mean, which
    # recent pandas versions refuse to average.
    clusters_target_values = (
        df.groupby("Label")
        .mean(numeric_only=True)
        .drop(columns=list(range(nb_features)))
    )
    clusters_target_values.index = [
        f' Cluster {i}' for i in range(len(clusters_target_values.index))
    ]

    print(f'Clustering score {round(max_score,3)}')
    print()

    # np.unique yields the distinct labels in ascending order.
    for label in np.unique(best_labels):
        cluster = list(df.index[df["Label"] == label])
        print(f'- cluster {label} : {cluster}')
        print()

    display(clusters_target_values)

    return df

def visualize_3D(df):
    """Show a 3-D scatter plot of the clustered cities.

    Columns ``0``, ``1`` and ``2`` of ``df`` are used as the axes.  Points are
    coloured by the "Label" column and annotated with the "City" column; the
    legend is hidden.
    """
    scatter_options = dict(
        x=0, y=1, z=2, color="Label", text="City", width=1000, height=600
    )
    figure = px.scatter_3d(df, **scatter_options)
    figure.update_layout(showlegend=False)
    figure.show()
    
def visualize_2D(df):
    """Show a 2-D scatter plot of the clustered cities.

    The first two columns of ``df`` are used as the axes.  Points are coloured
    by the "Label" column and annotated above the marker with the "City"
    column; the legend is hidden.
    """
    x_column = df.columns[0]
    y_column = df.columns[1]
    figure = px.scatter(
        df, x=x_column, y=y_column, color="Label", text="City", width=1000, height=600
    )
    figure.update_traces(textposition='top center')
    figure.update_layout(showlegend=False)
    figure.show()
        
    
def run_your_clustering(features_selected):
    """Cluster the cities on the selected features and plot the result.

    With exactly two features the normalised columns are clustered directly
    and drawn in 2-D.  With three or more, the selection is first reduced to
    three principal components and drawn in 3-D.  With fewer than two
    features only a hint is printed.

    The feature values are read from the module-level ``df_normal``.
    """
    nb_selected = len(features_selected)

    # Nothing to cluster on: tell the user and stop before touching the data.
    if nb_selected < 2:
        print("Choose at least 2 parameters")
        return

    selection = df_normal[list(features_selected)]

    if nb_selected == 2:
        visualize_2D(get_clusters(selection, nb_features=2))
    else:
        projected = reduce_dim(selection, 3)
        visualize_3D(get_clusters(projected, nb_features=3))
    
    

# Multi-select list of the normalised feature columns; three are pre-selected.
_feature_picker = widgets.SelectMultiple(
    options=list(df_normal.columns),
    value=['Density (capita/square)','Income per capita per month (net€)','Congestion rate (2019)'],
    rows=16,
    description='Features selected',
    disabled=False,
)

# Re-run the clustering with the current selection of the picker.
interact(run_your_clustering, features_selected=_feature_picker);

# Show the prepared (not normalised) feature table.
display(df)

interactive(children=(SelectMultiple(description='Features selected', index=(0, 1, 2), options=('Density (capi…

# For k-means and Concurrence Parameters

In [59]:
class SelectMultipleInteract(widgets.HBox):
    """Two side-by-side selectors that re-run the clustering on every change.

    ``W1`` offers a number of clusters and ``W2`` the features to cluster on.
    Both are observed, but only the ``W2`` selection is read by the callback;
    the ``W1`` choice is not passed on to the clustering.
    """

    def __init__(self):
        self.W1 = widgets.SelectMultiple(
            options=['Automatic', '1', '2'],
            value=['Automatic'],
            description='Number of Clusters',
            disabled=False
        )

        self.W2 = widgets.SelectMultiple(
            options=list(df_normal.columns),
            value=['Density (capita/square)','Income per capita per month (net€)','Congestion rate (2019)'],
            rows=16,
            description='Features selected',
            disabled=False
        )

        self.selectors = [self.W1, self.W2]
        super().__init__(children=self.selectors)
        self._set_observes()

    def _set_observes(self):
        """Register the clustering callback on the value of every selector."""
        for widg in self.selectors:
            widg.observe(self.run_your_clustering, names='value')

    def run_your_clustering(self, widg):
        """Observer callback: cluster on the features selected in ``W2`` and plot.

        Args:
            widg: Change notification supplied by ``observe``; not used.
        """
        features_selected = self.W2.get_interact_value()
        nb_selected = len(features_selected)

        if nb_selected >= 3:
            df_select = df_normal[list(features_selected)]
            PCA_df = reduce_dim(df_select, 3)
            # get_clusters is told how many feature columns to leave out of
            # its summary table, exactly as the function-level caller does.
            cluster_df = get_clusters(PCA_df, nb_features=3)
            visualize_3D(cluster_df)

        elif nb_selected == 2:
            df_select = df_normal[list(features_selected)]
            cluster_df = get_clusters(df_select, nb_features=2)
            visualize_2D(cluster_df)

        else:
            print("Choose at least 2 parameters")
        

# Build the selector widget; as the last expression of the cell, the notebook
# shows the resulting object.
SelectMultipleInteract()

SelectMultipleInteract(children=(SelectMultiple(description='Number of Clusters', index=(0,), options=('Automa…

Choose at least 2 parameters
