In [None]:
import numpy as np
import pandas as pd
import pandas_profiling

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns;

from sklearn.preprocessing import StandardScaler

from sklearn.cluster import KMeans, SpectralClustering

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

from sklearn.pipeline import Pipeline

import umap

In [None]:
# Allow up to 200 columns to be shown when a frame is displayed.
pd.set_option( 'display.max_columns', 200 )

# Data Loading - Old Data

In [None]:
# Load the legacy CSV, then keep only the three naming columns.
salurbal_old_df = pd.read_csv( '../data/datos.csv' )
salurbal_old_df = salurbal_old_df[ [ 'Country', 'L1Name', 'L2Namev2' ] ]

In [None]:
# Display the (rows, columns) shape of the legacy frame.
salurbal_old_df.shape

In [None]:
#salurbal_old_df.head()

# Data Loading

In [None]:
# Read the workbook '../data/modelos L2.xlsx' into the main working frame.
salurbal_df = pd.read_excel( '../data/modelos L2.xlsx' )

In [None]:
# Display the (rows, columns) shape right after loading.
salurbal_df.shape

In [None]:
#salurbal_df.head()

# Merging to get L1Names

In [None]:
# Copy the L1Name column from the legacy frame into the working frame.
# NOTE(review): this assignment aligns rows by index label, not by a key column;
# it presumably relies on both files listing the same records in the same order — confirm.
salurbal_df[ 'L1Name' ] = salurbal_old_df[ 'L1Name' ]

In [None]:
# Display the shape after adding L1Name.
salurbal_df.shape

In [None]:
#salurbal_df.head()

# Cleaning Data

In [None]:
# Drop columns that are of no use here.
salurbal_df.drop( columns = [ 'salid2', 'bicycle lane' ], inplace = True )

In [None]:
# Drop the columns of the full ("completo") model, which will no longer be used.
salurbal_df.drop(
    columns = [ 'clase-completo', 'prob1-completo', 'prob2-completo', 'prob3-completo' ],
    inplace = True
)

In [None]:
# Build the rename mapping in three parts: identifier columns, the '-BE' model
# columns (-> MBE_*) and the Transport model columns (-> MT_*).
column_names = { 'L2Namev2': 'L2NAME', 'Country': 'COUNTRY', 'L1Name': 'L1NAME' }
column_names.update( {
    'clase-BE': 'MBE_CLUSTER',
    'prob1-BE': 'MBE_PROB1',
    'prob2-BE': 'MBE_PROB2',
    'prob3-BE': 'MBE_PROB3',
    'prob4-BE': 'MBE_PROB4',
    'prob5-BE': 'MBE_PROB5',
} )
column_names.update( {
    'clase_Transport': 'MT_CLUSTER',
    'prob1-Transport': 'MT_PROB1',
    'prob2-Transport': 'MT_PROB2',
    'prob3-Transport': 'MT_PROB3',
    'prob4-Transport': 'MT_PROB4',
    'prob5-Transport': 'MT_PROB5',
} )
salurbal_df.rename( columns = column_names, inplace = True )

In [None]:
# Display the shape after dropping and renaming columns.
salurbal_df.shape

In [None]:
#salurbal_df.dtypes

In [None]:
# Count rows per MT_CLUSTER value, including missing values.
salurbal_df[ 'MT_CLUSTER' ].value_counts( dropna = False )

In [None]:
def _mt_assigned_prob( row ):
    """Return the value of the MT_PROB<k> column, where k is the row's MT_CLUSTER."""
    return row[ 'MT_PROB' + str( row[ 'MT_CLUSTER' ] ) ]

# Probability of the cluster each row was assigned to.
salurbal_df[ 'MT_PROB' ] = salurbal_df.apply( _mt_assigned_prob, axis = 1 )

In [None]:
# Count, per MT_CLUSTER value, the rows whose MT_PROB is at least 0.9.
salurbal_df.loc[ salurbal_df[ 'MT_PROB' ] >= .9, 'MT_CLUSTER' ].value_counts( dropna = False )

In [None]:
# Share of L2s with membership probability greater than or equal to 0.9:
# rows passing the threshold divided by the total number of rows.
( salurbal_df[ 'MT_PROB' ] >= .9 ).sum() / salurbal_df.shape[ 0 ]

In [None]:
# Count rows per MBE_CLUSTER value, including missing values.
salurbal_df[ 'MBE_CLUSTER' ].value_counts( dropna = False )

In [None]:
def _mbe_assigned_prob( row ):
    """Return the value of the MBE_PROB<k> column, where k is the row's MBE_CLUSTER."""
    return row[ 'MBE_PROB' + str( row[ 'MBE_CLUSTER' ] ) ]

# Probability of the cluster each row was assigned to.
salurbal_df[ 'MBE_PROB' ] = salurbal_df.apply( _mbe_assigned_prob, axis = 1 )

In [None]:
# Count, per MBE_CLUSTER value, the rows whose MBE_PROB is at least 0.9.
salurbal_df.loc[ salurbal_df[ 'MBE_PROB' ] >= .9, 'MBE_CLUSTER' ].value_counts( dropna = False )

In [None]:
# Share of L2s with membership probability greater than or equal to 0.9:
# rows passing the threshold divided by the total number of rows.
( salurbal_df[ 'MBE_PROB' ] >= .9 ).sum() / salurbal_df.shape[ 0 ]

In [None]:
# Add a column with the number of missing values found in each row.
salurbal_df[ 'missings' ] = salurbal_df.isna().sum( axis = 'columns' )

In [None]:
# Remove records with more than 2 missing values  => 3 records removed
# .copy() makes the filtered result an independent frame: later cells assign into
# salurbal_df, and writing into a filtered selection triggers SettingWithCopyWarning.
salurbal_df = salurbal_df.loc[ salurbal_df[ 'missings' ] <= 2 ].copy()

In [None]:
# Count rows per number of missing values.
salurbal_df[ 'missings' ].value_counts( dropna = False )

In [None]:
# Show the two columns with the most missing values.
salurbal_df.isnull().sum( axis = 0 ).sort_values( ascending = False ).head( 2 )

In [None]:
# Replace missing BECAWMNNNGHL2 values with the column median.
salurbal_df[ 'BECAWMNNNGHL2' ] = salurbal_df[ 'BECAWMNNNGHL2' ].fillna( salurbal_df[ 'BECAWMNNNGHL2' ].median() )

In [None]:
# Remove the helper column now that filtering and imputation are done.
del salurbal_df[ 'missings' ]

In [None]:
# Complete list of feature columns.
features = [
    'BECTUAREAL2', 'BECPCTURBANL2', 'BECNURBPTCHL2', 'BECPTCHDENSL2',
    'BECAVGPTCHAREAL2', 'BECCVPTCHAREAL2', 'BECAWMNSHPINDXL2', 'BECAWMNNNGHL2',
    'BECEDGDENSL2', 'BECEFFMESHSIZEL2', 'BECAWAVGPTCHAREAL2', 'BECAWEDGDENSL2',
    'BECTPOPL2', 'BECPOPDENSL2', 'BECPOPCONCL2', 'BECTPOPADJL2',
    'BECPOPDENSADJL2', 'BECPRSBRTL2', 'BECPRSSUBWAYL2', 'BECBYLANELNGTHL2',
    'BECADAREAL2', 'BECADCRCTYAVGL2', 'BECADINTDENS3L2', 'BECADINTDENS4L2',
    'BECADINTDENSL2', 'BECADLRDENSL2', 'BECADSTTDENSL2', 'BECADSTTPNODEAVGL2',
    'BECADSTTPNODESDL2', 'BECADSTTLGAVGL2', 'BECURBTRVDELAYINDEXL2',
]

In [None]:
# Feature columns used for the MBE clustering, projections and plots.
mbe_features = [
    'BECAWMNSHPINDXL2',
    'BECEFFMESHSIZEL2',
    'BECAWMNNNGHL2',
    'BECAWEDGDENSL2',
    'BECPTCHDENSL2',
    'BECNURBPTCHL2',
    'BECAWAVGPTCHAREAL2',
]

In [None]:
# Feature columns used for the MT clustering, projections and plots.
mt_features = [
    'BECPRSBRTL2',
    'BECPRSSUBWAYL2',
    'BECADSTTDENSL2',
    'BECADSTTLGAVGL2',
    'BECADINTDENSL2',
    'BECADCRCTYAVGL2',
    'BECADSTTPNODEAVGL2',
]

In [None]:
# Preview the first rows of the cleaned frame.
salurbal_df.head()

# Data Exploration

In [None]:
# Pairwise scatter plots of the MBE features, coloured by MBE_CLUSTER.
sns.pairplot( salurbal_df, vars = mbe_features, hue = 'MBE_CLUSTER' )

In [None]:
# Pairwise scatter plots of the MT features, coloured by MT_CLUSTER.
sns.pairplot( salurbal_df, vars = mt_features, hue = 'MT_CLUSTER' )

# Complementary Clustering

### K-Means

In [None]:
# K-Means (5 clusters) on standardised features.
# - `n_jobs` is not passed: scikit-learn deprecated it for KMeans in 0.23 and removed
#   it in 1.0, so passing it raises a TypeError on current releases.
# - `n_init = 10` pins the number of initialisations so results do not depend on the
#   default of the installed scikit-learn release.
kmeans_pipeline = Pipeline( [
    ( 'scaling', StandardScaler() ),
    ( 'clustering', KMeans( n_clusters = 5, n_init = 10, random_state = 1 ) )
] )

In [None]:
# Fit K-Means on the MBE features and shift the labels by one, so they are
# 1-based like the range( 1, 6 ) used by the plotting cells.
kmeans_mbe_salurbal = kmeans_pipeline.fit_predict( salurbal_df[ mbe_features ] ) + 1

In [None]:
# Refit the same pipeline on the MT features and shift the labels by one, so they
# are 1-based like the range( 1, 6 ) used by the plotting cells.
kmeans_mt_salurbal = kmeans_pipeline.fit_predict( salurbal_df[ mt_features ] ) + 1

In [None]:
# Store the K-Means labels as new columns (MBE first, then MT).
for _column, _labels in (
    ( 'MBE_CLUSTER_KMEANS', kmeans_mbe_salurbal ),
    ( 'MT_CLUSTER_KMEANS', kmeans_mt_salurbal ),
):
    salurbal_df[ _column ] = _labels

In [None]:
# Count rows per K-Means cluster (MBE features).
salurbal_df[ 'MBE_CLUSTER_KMEANS' ].value_counts( dropna = False )

In [None]:
# Count rows per K-Means cluster (MT features).
salurbal_df[ 'MT_CLUSTER_KMEANS' ].value_counts( dropna = False )

In [None]:
# Pairwise scatter plots of the MBE features, coloured by the K-Means labels.
sns.pairplot( salurbal_df, vars = mbe_features, hue = 'MBE_CLUSTER_KMEANS' )

In [None]:
# Pairwise scatter plots of the MT features, coloured by the K-Means labels.
sns.pairplot( salurbal_df, vars = mt_features, hue = 'MT_CLUSTER_KMEANS' )

### Spectral Clustering

In [None]:
# Spectral clustering (5 clusters) on standardised features.
spectral_pipeline = Pipeline( steps = [
    ( 'scaling', StandardScaler() ),
    ( 'clustering', SpectralClustering( n_clusters = 5, random_state = 1, n_jobs = -1 ) ),
] )

In [None]:
# Fit spectral clustering on the MBE features and shift the labels by one, so they
# are 1-based like the range( 1, 6 ) used by the plotting cells.
spectral_mbe_salurbal = spectral_pipeline.fit_predict( salurbal_df[ mbe_features ] ) + 1

In [None]:
# Refit the same pipeline on the MT features and shift the labels by one, so they
# are 1-based like the range( 1, 6 ) used by the plotting cells.
spectral_mt_salurbal = spectral_pipeline.fit_predict( salurbal_df[ mt_features ] ) + 1

In [None]:
# Store the spectral-clustering labels as new columns (MBE first, then MT).
for _column, _labels in (
    ( 'MBE_CLUSTER_SPECTRAL', spectral_mbe_salurbal ),
    ( 'MT_CLUSTER_SPECTRAL', spectral_mt_salurbal ),
):
    salurbal_df[ _column ] = _labels

In [None]:
# Count rows per spectral cluster (MBE features).
salurbal_df[ 'MBE_CLUSTER_SPECTRAL' ].value_counts( dropna = False )

In [None]:
# Count rows per spectral cluster (MT features).
salurbal_df[ 'MT_CLUSTER_SPECTRAL' ].value_counts( dropna = False )

In [None]:
# Pairwise scatter plots of the MBE features, coloured by the spectral labels.
sns.pairplot( salurbal_df, vars = mbe_features, hue = 'MBE_CLUSTER_SPECTRAL' )

In [None]:
# Pairwise scatter plots of the MT features, coloured by the spectral labels.
sns.pairplot( salurbal_df, vars = mt_features, hue = 'MT_CLUSTER_SPECTRAL' )

# Dimensionality Reduction

### T-SNE

In [None]:
# t-SNE projection of standardised features, seeded for repeatability.
tsne_pipeline = Pipeline( steps = [
    ( 'scaling', StandardScaler() ),
    ( 'dimred', TSNE( random_state = 1 ) ),
] )

In [None]:
# Fit the t-SNE pipeline on the MBE features and keep the projected coordinates.
tsne_mbe_salurbal = tsne_pipeline.fit_transform( salurbal_df[ mbe_features ] )

In [None]:
# Refit the same pipeline on the MT features and keep the projected coordinates.
tsne_mt_salurbal = tsne_pipeline.fit_transform( salurbal_df[ mt_features ] )

In [None]:
# Store the first two t-SNE coordinates as columns (target column, source array, axis).
for _column, _embedding, _axis in (
    ( 'TSNE_MBE_X', tsne_mbe_salurbal, 0 ),
    ( 'TSNE_MBE_Y', tsne_mbe_salurbal, 1 ),
    ( 'TSNE_MT_X', tsne_mt_salurbal, 0 ),
    ( 'TSNE_MT_Y', tsne_mt_salurbal, 1 ),
):
    salurbal_df[ _column ] = _embedding[ :, _axis ]

### PCA

In [None]:
# Two-component PCA of standardised features.
pca_pipeline = Pipeline( steps = [
    ( 'scaling', StandardScaler() ),
    ( 'dimred', PCA( random_state = 1, n_components = 2 ) ),
] )

In [None]:
# Fit the PCA pipeline on the MBE features and keep the projected coordinates.
pca_mbe_salurbal = pca_pipeline.fit_transform( salurbal_df[ mbe_features ] )
# Display the explained-variance ratio of the fitted PCA step.
pca_pipeline.named_steps[ 'dimred' ].explained_variance_ratio_

In [None]:
# Refit the same pipeline on the MT features and keep the projected coordinates.
pca_mt_salurbal = pca_pipeline.fit_transform( salurbal_df[ mt_features ] )
# Display the explained-variance ratio of the refitted PCA step.
pca_pipeline.named_steps[ 'dimred' ].explained_variance_ratio_

In [None]:
# Store the two PCA coordinates as columns (target column, source array, axis).
for _column, _embedding, _axis in (
    ( 'PCA_MBE_X', pca_mbe_salurbal, 0 ),
    ( 'PCA_MBE_Y', pca_mbe_salurbal, 1 ),
    ( 'PCA_MT_X', pca_mt_salurbal, 0 ),
    ( 'PCA_MT_Y', pca_mt_salurbal, 1 ),
):
    salurbal_df[ _column ] = _embedding[ :, _axis ]

### UMAP

In [None]:
# UMAP projection of standardised features.
# `random_state = 1` seeds UMAP like the other pipelines in this notebook, so the
# embedding (and the exported UMAP_* columns) can be reproduced on a re-run.
umap_pipeline = Pipeline( [
    ( 'scaling', StandardScaler() ),
    ( 'dimred', umap.UMAP( random_state = 1 ) )
] )

In [None]:
# Fit the UMAP pipeline on the MBE features and keep the projected coordinates.
umap_mbe_salurbal = umap_pipeline.fit_transform( salurbal_df[ mbe_features ] )

In [None]:
# Refit the same pipeline on the MT features and keep the projected coordinates.
umap_mt_salurbal = umap_pipeline.fit_transform( salurbal_df[ mt_features ] )

In [None]:
# Store the first two UMAP coordinates as columns (target column, source array, axis).
for _column, _embedding, _axis in (
    ( 'UMAP_MBE_X', umap_mbe_salurbal, 0 ),
    ( 'UMAP_MBE_Y', umap_mbe_salurbal, 1 ),
    ( 'UMAP_MT_X', umap_mt_salurbal, 0 ),
    ( 'UMAP_MT_Y', umap_mt_salurbal, 1 ),
):
    salurbal_df[ _column ] = _embedding[ :, _axis ]

# Visualization

In [None]:
# PCA / t-SNE / UMAP projections of the MBE features, coloured by MBE_CLUSTER.
fig, axes = plt.subplots( 1, 3, figsize = ( 25, 7 ) )

model = 'MBE'
clustering = ''
methods = dict( zip( ( 'PCA', 'TSNE', 'UMAP' ), axes ) )
cluster_column = model + '_CLUSTER' + clustering
for method, ax in methods.items():
    for c in range( 1, 6 ):
        # Rows belonging to cluster c, selected once and reused for both axes.
        members = salurbal_df.loc[ salurbal_df[ cluster_column ] == c ]
        ax.scatter(
            members[ method + '_' + model + '_X' ],
            members[ method + '_' + model + '_Y' ],
            label = str( c ), edgecolor = 'black', alpha = 0.5
        )
    ax.set_title( method )
    ax.legend()
plt.show()

In [None]:
# PCA / t-SNE / UMAP projections of the MBE features, coloured by MBE_CLUSTER_KMEANS.
fig, axes = plt.subplots( 1, 3, figsize = ( 25, 7 ) )

model = 'MBE'
clustering = '_KMEANS'
methods = dict( zip( ( 'PCA', 'TSNE', 'UMAP' ), axes ) )
cluster_column = model + '_CLUSTER' + clustering
for method, ax in methods.items():
    for c in range( 1, 6 ):
        # Rows belonging to cluster c, selected once and reused for both axes.
        members = salurbal_df.loc[ salurbal_df[ cluster_column ] == c ]
        ax.scatter(
            members[ method + '_' + model + '_X' ],
            members[ method + '_' + model + '_Y' ],
            label = str( c ), edgecolor = 'black', alpha = 0.5
        )
    ax.set_title( method )
    ax.legend()
plt.show()

In [None]:
# PCA / t-SNE / UMAP projections of the MBE features, coloured by MBE_CLUSTER_SPECTRAL.
fig, axes = plt.subplots( 1, 3, figsize = ( 25, 7 ) )

model = 'MBE'
clustering = '_SPECTRAL'
methods = dict( zip( ( 'PCA', 'TSNE', 'UMAP' ), axes ) )
cluster_column = model + '_CLUSTER' + clustering
for method, ax in methods.items():
    for c in range( 1, 6 ):
        # Rows belonging to cluster c, selected once and reused for both axes.
        members = salurbal_df.loc[ salurbal_df[ cluster_column ] == c ]
        ax.scatter(
            members[ method + '_' + model + '_X' ],
            members[ method + '_' + model + '_Y' ],
            label = str( c ), edgecolor = 'black', alpha = 0.5
        )
    ax.set_title( method )
    ax.legend()
plt.show()

In [None]:
# PCA / t-SNE / UMAP projections of the MT features, coloured by MT_CLUSTER.
fig, axes = plt.subplots( 1, 3, figsize = ( 25, 7 ) )

model = 'MT'
clustering = ''
methods = dict( zip( ( 'PCA', 'TSNE', 'UMAP' ), axes ) )
cluster_column = model + '_CLUSTER' + clustering
for method, ax in methods.items():
    for c in range( 1, 6 ):
        # Rows belonging to cluster c, selected once and reused for both axes.
        members = salurbal_df.loc[ salurbal_df[ cluster_column ] == c ]
        ax.scatter(
            members[ method + '_' + model + '_X' ],
            members[ method + '_' + model + '_Y' ],
            label = str( c ), edgecolor = 'black', alpha = 0.5
        )
    ax.set_title( method )
    ax.legend()
plt.show()

In [None]:
# PCA / t-SNE / UMAP projections of the MT features, coloured by MT_CLUSTER_KMEANS.
fig, axes = plt.subplots( 1, 3, figsize = ( 25, 7 ) )

model = 'MT'
clustering = '_KMEANS'
methods = dict( zip( ( 'PCA', 'TSNE', 'UMAP' ), axes ) )
cluster_column = model + '_CLUSTER' + clustering
for method, ax in methods.items():
    for c in range( 1, 6 ):
        # Rows belonging to cluster c, selected once and reused for both axes.
        members = salurbal_df.loc[ salurbal_df[ cluster_column ] == c ]
        ax.scatter(
            members[ method + '_' + model + '_X' ],
            members[ method + '_' + model + '_Y' ],
            label = str( c ), edgecolor = 'black', alpha = 0.5
        )
    ax.set_title( method )
    ax.legend()
plt.show()

In [None]:
# PCA / t-SNE / UMAP projections of the MT features, coloured by MT_CLUSTER_SPECTRAL.
fig, axes = plt.subplots( 1, 3, figsize = ( 25, 7 ) )

model = 'MT'
clustering = '_SPECTRAL'
methods = dict( zip( ( 'PCA', 'TSNE', 'UMAP' ), axes ) )
cluster_column = model + '_CLUSTER' + clustering
for method, ax in methods.items():
    for c in range( 1, 6 ):
        # Rows belonging to cluster c, selected once and reused for both axes.
        members = salurbal_df.loc[ salurbal_df[ cluster_column ] == c ]
        ax.scatter(
            members[ method + '_' + model + '_X' ],
            members[ method + '_' + model + '_Y' ],
            label = str( c ), edgecolor = 'black', alpha = 0.5
        )
    ax.set_title( method )
    ax.legend()
plt.show()

In [None]:
# Preview the first rows, now including the cluster and projection columns.
salurbal_df.head()

In [None]:
# Write the frame to CSV with default separators, without the index column.
salurbal_df.to_csv( '../data/modelos_l2.csv', index = False )

In [None]:
# Second export using ';' as field separator and ',' as decimal mark, without the index column.
# (A comma was missing between the `decimal` and `index` arguments, which was a SyntaxError.)
salurbal_df.to_csv( '../data/modelos_l2_comma.csv', sep = ';', decimal = ',', index = False )

# Explanations

In [None]:
import lime
import lime.lime_tabular

In [None]:
# Kept first so the cell is also valid when exported as a plain Python script,
# where `from __future__` imports must precede every other statement.
from __future__ import print_function

import numpy as np
import sklearn
import sklearn.datasets
import sklearn.ensemble
# Imported explicitly because later cells call sklearn.metrics.accuracy_score and
# sklearn.model_selection.train_test_split.
import sklearn.metrics
import sklearn.model_selection
import lime
import lime.lime_tabular

# Seed NumPy's global random generator.
np.random.seed(1)

In [None]:
# Load the iris dataset bundled with scikit-learn.
iris = sklearn.datasets.load_iris()

In [None]:
# Split the iris data and targets, using 80% of the samples for training.
train, test, labels_train, labels_test = sklearn.model_selection.train_test_split(
    iris.data,
    iris.target,
    train_size=0.80,
)

In [None]:
# Random forest with 500 trees, fitted on the training split.
rf = sklearn.ensemble.RandomForestClassifier(n_estimators=500)
rf.fit(train, labels_train)

In [None]:
# Accuracy of the forest's predictions on the test split.
sklearn.metrics.accuracy_score(labels_test, rf.predict(test))

In [None]:
# Display the class names of the iris dataset.
iris.target_names

In [None]:
# LIME tabular explainer built from the iris training split.
explainer = lime.lime_tabular.LimeTabularExplainer(
    train,
    feature_names=iris.feature_names,
    class_names=iris.target_names,
    discretize_continuous=True,
)

In [None]:
# Pick a random row index of the test split.
i = np.random.randint(0, test.shape[0])
# Explain the forest's prediction for that row, requesting 2 features and the top label.
exp = explainer.explain_instance(test[i], rf.predict_proba, num_features=2, top_labels=1)

In [None]:
# Render the explanation inline in the notebook.
exp.show_in_notebook(show_table=True, show_all=False)

In [None]:
# Display the shape of the iris test split.
test.shape

In [None]:
#salurbal_df[ [ 'MT_PROB1', 'MT_PROB2', 'MT_PROB3', 'MT_PROB4', 'MT_PROB5' ] ].sum( axis = 1 )

In [None]:
# Display the dtypes of the MT feature columns.
salurbal_df[ mt_features ].dtypes

In [None]:
# MT feature columns as a plain array, used as input for LIME below.
salurbal_mx = salurbal_df[ mt_features ].values
# Display the array's shape.
salurbal_mx.shape

In [None]:
# Prediction function handed to LIME in a later cell.
# NOTE(review): the lambda ignores its argument `x` and always returns the stored
# MT_PROB1..MT_PROB5 values for every row of salurbal_df. LIME presumably expects one
# row of class probabilities per sample it passes in — confirm against the lime
# documentation, and consider replacing this with a real model's predict_proba.
salurbal_mx_probs = lambda x: salurbal_df[ [ 'MT_PROB1', 'MT_PROB2', 'MT_PROB3', 'MT_PROB4', 'MT_PROB5' ] ].values
# Display the function object.
salurbal_mx_probs

In [None]:
# Call the function with a dummy argument; the argument is ignored and the full
# array of stored MT probabilities is returned.
salurbal_mx_probs( 1 )

In [None]:
# LIME tabular explainer built from the MT feature array, with classes named '1'..'5'.
explainer = lime.lime_tabular.LimeTabularExplainer(
    salurbal_mx,
    feature_names = mt_features,
    class_names = [ '1' , '2', '3', '4', '5' ],
)

In [None]:
# Pick a random row index of the MT feature array and display it.
i = np.random.randint( 0, salurbal_mx.shape[ 0 ] )
i

In [None]:
# Explain row i using all 7 MT features.
# NOTE(review): salurbal_mx_probs ignores the samples it receives and returns the stored
# probabilities for all rows, so this explanation may fail or be meaningless — confirm.
exp = explainer.explain_instance( salurbal_mx[ i ], salurbal_mx_probs, num_features = 7 )