In [1]:
import os

import requests
import numpy as np
import pandas as pd
import pandas_profiling

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns;

import umap

from sklearn.cluster import KMeans, SpectralClustering
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.manifold import TSNE
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
# Show up to 200 columns when a frame is displayed, so wide tables are not truncated.
pd.set_option( 'display.max_columns', 200 )

In [17]:
# Column-name suffix used further down to pick out this dataset's indicator columns.
dataset_sufix = 'L2'

# Data Loading

In [3]:
# Load the base table from the Excel workbook (path is relative to the notebook's folder).
salurbal_df = pd.read_excel( '../data/base_l2.xlsx' )

In [4]:
# Remove, in place, the SALID* columns and the six 'urb-pmp-*' / 'stt-pmp-*' columns.
columns_to_remove = [
    'SALID1', 'SALID2',
    'urb-pmp-1', 'urb-pmp-2', 'urb-pmp-3', 'urb-pmp-4', 'urb-pmp-5', 'urb-pmp-6',
    'stt-pmp-1', 'stt-pmp-2', 'stt-pmp-3', 'stt-pmp-4', 'stt-pmp-5', 'stt-pmp-6',
]
salurbal_df.drop( columns = columns_to_remove, inplace = True )

In [5]:
# (rows, columns) of the frame after the column removal.
salurbal_df.shape

(1351, 20)

In [6]:
# Column types: the three name columns are objects, the indicators and profiles are numeric.
salurbal_df.dtypes

Country                   object
L1Name                    object
L2Namev2                  object
BECNURBPTCHL2              int64
BECPTCHDENSL2            float64
BECAWMNSHPINDXL2         float64
BECAWMNNNGHL2            float64
BECEFFMESHSIZEL2         float64
BECAWAVGPTCHAREAL2       float64
BECAWEDGDENSL2           float64
BECPRSBRTL2                int64
BECPRSSUBWAYL2             int64
BECADCRCTYAVGL2          float64
BECADINTDENSL2           float64
BECADSTTDENSL2           float64
BECADSTTPNODEAVGL2       float64
BECADSTTLGAVGL2          float64
BECURBTRVDELAYINDEXL2    float64
urb-perfil                 int64
stt-perfil                 int64
dtype: object

In [7]:
# Preview the first rows of the loaded table.
salurbal_df.head()

Unnamed: 0,Country,L1Name,L2Namev2,BECNURBPTCHL2,BECPTCHDENSL2,BECAWMNSHPINDXL2,BECAWMNNNGHL2,BECEFFMESHSIZEL2,BECAWAVGPTCHAREAL2,BECAWEDGDENSL2,BECPRSBRTL2,BECPRSSUBWAYL2,BECADCRCTYAVGL2,BECADINTDENSL2,BECADSTTDENSL2,BECADSTTPNODEAVGL2,BECADSTTLGAVGL2,BECURBTRVDELAYINDEXL2,urb-perfil,stt-perfil
0,Argentina,Bahia Blanca,Bahía Blanca,487,0.2131,5.1337,76.6491,77.6599,2644.8742,0.420828,0,0,1.032761,3.672485,939.287799,3.411006,144.004054,0.104629,1,6
1,Argentina,Buenos Aires,CABA - Comuna 12,1,0.0642,1.4735,,1557.18,1557.18,14.988633,1,1,1.010627,102.376535,18590.830836,3.619105,99.89161,0.168965,6,5
2,Argentina,Buenos Aires,CABA - Comuna 13,13,0.8836,2.5395,60.0462,1264.0834,1359.9129,24.739527,1,1,1.026108,90.571453,16060.259206,3.473831,98.454629,0.240852,2,5
3,Argentina,Buenos Aires,CABA - Comuna 4,8,0.3611,2.2122,67.1673,1990.2022,2091.1138,18.119355,1,1,1.027101,79.577418,14502.014548,3.361758,104.423094,0.15642,2,5
4,Argentina,Buenos Aires,CABA - Comuna 6,1,0.146,1.2914,,684.9,684.9,19.798511,0,1,1.005764,83.147041,15379.25856,3.525963,106.14551,0.248183,6,5


# Data Cleaning

In [8]:
# Add a helper column holding the number of missing values found in each row
salurbal_df[ 'missings' ] = salurbal_df.isna().sum( axis = 'columns' )

In [9]:
# How many rows have 0, 1, ... missing values.
salurbal_df[ 'missings' ].value_counts( dropna = False )

0    1294
1      57
Name: missings, dtype: int64

In [10]:
# The per-row missing-value count was only needed for the tally; remove it again.
salurbal_df.drop( columns = [ 'missings' ], inplace = True )

In [11]:
#features = salurbal_df.columns[ 2:-2 ].tolist()

In [12]:
# Give the two profile columns their final names (in place).
profile_column_names = { 'urb-perfil' : 'UL_PROFILE', 'stt-perfil' : 'TR_PROFILE' }
salurbal_df.rename( columns = profile_column_names, inplace = True )

In [18]:
# List the indicator names with the dataset suffix removed. Only the trailing suffix is cut
# off: str.replace would also delete the same text anywhere else inside a column name.
[ feature[ : len( feature ) - len( dataset_sufix ) ] for feature in salurbal_df.columns.tolist() if feature.endswith( dataset_sufix ) ]

['BECNURBPTCH',
 'BECPTCHDENS',
 'BECAWMNSHPINDX',
 'BECAWMNNNGH',
 'BECEFFMESHSIZE',
 'BECAWAVGPTCHAREA',
 'BECAWEDGDENS',
 'BECPRSBRT',
 'BECPRSSUBWAY',
 'BECADCRCTYAVG',
 'BECADINTDENS',
 'BECADSTTDENS',
 'BECADSTTPNODEAVG',
 'BECADSTTLGAVG',
 'BECURBTRVDELAYINDEX']

In [19]:
# Feature columns of the UL model. The frame's columns still carry the dataset suffix
# (e.g. 'BECPTCHDENSL2'), so it is appended here; the bare names would raise a KeyError
# as soon as the list is used to index salurbal_df.
ul_features = [ name + dataset_sufix for name in [ 'BECAWMNSHPINDX', 'BECEFFMESHSIZE', 'BECAWMNNNGH', 'BECAWEDGDENS', 'BECPTCHDENS', 'BECAWAVGPTCHAREA' ] ]

In [20]:
# Feature columns of the TR model. The frame's columns still carry the dataset suffix
# (e.g. 'BECADSTTDENSL2'), so it is appended here; the bare names would raise a KeyError
# as soon as the list is used to index salurbal_df.
tr_features = [ name + dataset_sufix for name in [ 'BECADSTTDENS', 'BECADSTTLGAVG', 'BECADINTDENS', 'BECADCRCTYAVG', 'BECADSTTPNODEAVG' ] ]

In [21]:
# Preview the frame again; the profile columns now appear as UL_PROFILE / TR_PROFILE.
salurbal_df.head()

Unnamed: 0,Country,L1Name,L2Namev2,BECNURBPTCHL2,BECPTCHDENSL2,BECAWMNSHPINDXL2,BECAWMNNNGHL2,BECEFFMESHSIZEL2,BECAWAVGPTCHAREAL2,BECAWEDGDENSL2,BECPRSBRTL2,BECPRSSUBWAYL2,BECADCRCTYAVGL2,BECADINTDENSL2,BECADSTTDENSL2,BECADSTTPNODEAVGL2,BECADSTTLGAVGL2,BECURBTRVDELAYINDEXL2,UL_PROFILE,TR_PROFILE
0,Argentina,Bahia Blanca,Bahía Blanca,487,0.2131,5.1337,76.6491,77.6599,2644.8742,0.420828,0,0,1.032761,3.672485,939.287799,3.411006,144.004054,0.104629,1,6
1,Argentina,Buenos Aires,CABA - Comuna 12,1,0.0642,1.4735,,1557.18,1557.18,14.988633,1,1,1.010627,102.376535,18590.830836,3.619105,99.89161,0.168965,6,5
2,Argentina,Buenos Aires,CABA - Comuna 13,13,0.8836,2.5395,60.0462,1264.0834,1359.9129,24.739527,1,1,1.026108,90.571453,16060.259206,3.473831,98.454629,0.240852,2,5
3,Argentina,Buenos Aires,CABA - Comuna 4,8,0.3611,2.2122,67.1673,1990.2022,2091.1138,18.119355,1,1,1.027101,79.577418,14502.014548,3.361758,104.423094,0.15642,2,5
4,Argentina,Buenos Aires,CABA - Comuna 6,1,0.146,1.2914,,684.9,684.9,19.798511,0,1,1.005764,83.147041,15379.25856,3.525963,106.14551,0.248183,6,5


# Data Exploration

In [None]:
# Pairwise scatter matrix of the UL features, coloured by UL profile.
sns.pairplot( data = salurbal_df, vars = ul_features, hue = 'UL_PROFILE' )

In [None]:
# Pairwise scatter matrix of the TR features, coloured by TR profile.
sns.pairplot( data = salurbal_df, vars = tr_features, hue = 'TR_PROFILE' )

# Geocoding

In [None]:
# API key is read from the environment (KeyError if GOOGLE_API is not set), never hard-coded.
GOOGLE_API = os.environ[ 'GOOGLE_API' ]
# Endpoint of the Google Geocoding web service (JSON output).
URL = 'https://maps.googleapis.com/maps/api/geocode/json'

In [None]:
# Geocode every row through the geocoding endpoint and store the first hit's coordinates.
# The loop stays best-effort (a row that cannot be geocoded is skipped and keeps missing
# coordinates), but failures are collected and reported instead of silently discarded.
GEOCODE_TIMEOUT_SECONDS = 10  # without a timeout one stalled request blocks the whole loop
geocoding_failures = []

for index, row in salurbal_df.iterrows():
    try:
        # Raises TypeError when one of the name fields is not a string (e.g. NaN)
        address = row[ 'L2Namev2' ] + ', ' + row[ 'L1Name' ] + ', ' + row[ 'Country' ]
        response = requests.get( URL, params = { 'key' : GOOGLE_API, 'address' : address }, timeout = GEOCODE_TIMEOUT_SECONDS )
        response.raise_for_status()
        # IndexError: empty result list; KeyError: payload without the expected fields
        location = response.json()[ 'results' ][ 0 ][ 'geometry' ][ 'location' ]
        lat, lng = location[ 'lat' ], location[ 'lng' ]
    except ( IndexError, KeyError, TypeError, ValueError, requests.RequestException ) as error:
        geocoding_failures.append( ( index, repr( error ) ) )
        continue
    salurbal_df.loc[ index, 'CentLatitude' ] = lat
    salurbal_df.loc[ index, 'CentLongitude' ] = lng

print( '{} of {} rows could not be geocoded'.format( len( geocoding_failures ), len( salurbal_df ) ) )

# Attach the stored coordinates from the geocoding workbook to the main frame.
salurbal_geo_df = pd.read_excel( '../data/SALURBAL_L2_geocoding.xlsx' )

salurbal_geo_df.shape

merge_keys = [ 'Country', 'L2Namev2' ]
coordinate_columns = [ 'CentLongitude', 'CentLatitude' ]

# If the frame already has coordinate columns (live geocoding was run, or this cell is
# re-run), the merge would rename them to *_x / *_y and later lookups of 'CentLongitude'
# would fail. Dropping them first makes the workbook the single source and the cell re-runnable.
# NOTE(review): a left merge repeats rows when the workbook has duplicate
# (Country, L2Namev2) pairs — confirm the keys are unique there.
salurbal_df = salurbal_df.drop( columns = coordinate_columns, errors = 'ignore' ).merge( salurbal_geo_df[ merge_keys + coordinate_columns ], on = merge_keys, how = 'left' )

In [None]:
# Preview the frame with the coordinate columns attached.
salurbal_df.head()

In [None]:
# Row count should match the count before the merge; a higher number means duplicated keys.
salurbal_df.shape

In [None]:
# Number of rows left without a longitude after the merge.
salurbal_df[ 'CentLongitude' ].isnull().sum()

# Complementary Clustering

### K-Means

# K-Means with 5 clusters on each feature group; labels are shifted to run from 1 to 5.
# NOTE(review): mbe_features and mt_features are not defined in the visible part of this
# notebook — presumably earlier names of the UL / TR feature lists; confirm before running.
kmeans_pipeline = Pipeline( [
    # KMeans rejects NaN input. Median imputation keeps rows that have a missing feature
    # (the cleaning section counts 57 such rows). NOTE(review): confirm the imputation choice.
    ( 'imputing', SimpleImputer( strategy = 'median' ) ),
    ( 'scaling', StandardScaler() ),
    # n_jobs is no longer passed: current scikit-learn KMeans does not accept it.
    # n_init is pinned to the historical default so results do not shift across versions.
    ( 'clustering', KMeans( n_clusters = 5, n_init = 10, random_state = 1 ) )
] )

# fit_predict returns 0-based labels; adding 1 to the whole array replaces the element loop
kmeans_mbe_salurbal = kmeans_pipeline.fit_predict( salurbal_df[ mbe_features ] ) + 1

kmeans_mt_salurbal = kmeans_pipeline.fit_predict( salurbal_df[ mt_features ] ) + 1

salurbal_df[ 'MBE_CLUSTER_KMEANS' ] = kmeans_mbe_salurbal

salurbal_df[ 'MT_CLUSTER_KMEANS' ] = kmeans_mt_salurbal

salurbal_df[ 'MBE_CLUSTER_KMEANS' ].value_counts( dropna = False )

salurbal_df[ 'MT_CLUSTER_KMEANS' ].value_counts( dropna = False )

sns.pairplot( salurbal_df, vars = mbe_features, hue = 'MBE_CLUSTER_KMEANS' )

sns.pairplot( salurbal_df, vars = mt_features, hue = 'MT_CLUSTER_KMEANS' )

### Spectral Clustering

# Spectral clustering with 5 clusters on each feature group; labels are shifted to run from 1 to 5.
# NOTE(review): mbe_features and mt_features are not defined in the visible part of this
# notebook — presumably earlier names of the UL / TR feature lists; confirm before running.
spectral_pipeline = Pipeline( [
    # The clustering step rejects NaN input. Median imputation keeps rows that have a missing
    # feature (the cleaning section counts 57 such rows). NOTE(review): confirm the choice.
    ( 'imputing', SimpleImputer( strategy = 'median' ) ),
    ( 'scaling', StandardScaler() ),
    ( 'clustering', SpectralClustering( n_clusters = 5, n_jobs = -1, random_state = 1 ) )
] )

# fit_predict returns 0-based labels; adding 1 to the whole array replaces the element loop
spectral_mbe_salurbal = spectral_pipeline.fit_predict( salurbal_df[ mbe_features ] ) + 1

spectral_mt_salurbal = spectral_pipeline.fit_predict( salurbal_df[ mt_features ] ) + 1

salurbal_df[ 'MBE_CLUSTER_SPECTRAL' ] = spectral_mbe_salurbal

salurbal_df[ 'MT_CLUSTER_SPECTRAL' ] = spectral_mt_salurbal

salurbal_df[ 'MBE_CLUSTER_SPECTRAL' ].value_counts( dropna = False )

salurbal_df[ 'MT_CLUSTER_SPECTRAL' ].value_counts( dropna = False )

sns.pairplot( salurbal_df, vars = mbe_features, hue = 'MBE_CLUSTER_SPECTRAL' )

sns.pairplot( salurbal_df, vars = mt_features, hue = 'MT_CLUSTER_SPECTRAL' )

# Dimensionality Reduction

### T-SNE

In [None]:
# Impute, standardize, then embed with t-SNE (fixed seed for repeatable layouts).
tsne_pipeline = Pipeline( [
    # StandardScaler passes NaN through and TSNE rejects it; the UL features contain missing
    # values (57 rows per the cleaning section), so they are filled with the column median.
    # NOTE(review): median imputation is a modelling choice — confirm.
    ( 'imputing', SimpleImputer( strategy = 'median' ) ),
    ( 'scaling', StandardScaler() ),
    # NOTE(review): newer scikit-learn releases rename n_iter to max_iter — check the installed version.
    ( 'dimred', TSNE( n_iter = 2000, random_state = 1 ) )
] )

In [None]:
# Fit the t-SNE pipeline on the UL features and keep the embedded coordinates.
tsne_ul_salurbal = tsne_pipeline.fit_transform( salurbal_df[ ul_features ] )

In [None]:
# Re-fit the same pipeline object on the TR features (this replaces the UL fit inside it).
tsne_tr_salurbal = tsne_pipeline.fit_transform( salurbal_df[ tr_features ] )

In [None]:
# Store the first two t-SNE coordinates of each model as columns of the main frame.
tsne_ul_x, tsne_ul_y = tsne_ul_salurbal[ :, :2 ].T
salurbal_df[ 'TSNE_UL_X' ] = tsne_ul_x
salurbal_df[ 'TSNE_UL_Y' ] = tsne_ul_y

tsne_tr_x, tsne_tr_y = tsne_tr_salurbal[ :, :2 ].T
salurbal_df[ 'TSNE_TR_X' ] = tsne_tr_x
salurbal_df[ 'TSNE_TR_Y' ] = tsne_tr_y

### PCA

In [None]:
# Impute, standardize, then project onto the first two principal components.
pca_pipeline = Pipeline( [
    # StandardScaler passes NaN through and PCA rejects it; the UL features contain missing
    # values (57 rows per the cleaning section), so they are filled with the column median.
    # NOTE(review): median imputation is a modelling choice — confirm.
    ( 'imputing', SimpleImputer( strategy = 'median' ) ),
    ( 'scaling', StandardScaler() ),
    ( 'dimred', PCA( n_components = 2, random_state = 1 ) )
] )

In [None]:
# Project the UL features, then show the share of variance captured by each of the two components.
pca_ul_salurbal = pca_pipeline.fit_transform( salurbal_df[ ul_features ] )
pca_pipeline.named_steps[ 'dimred' ].explained_variance_ratio_

In [None]:
# Re-fit the same pipeline on the TR features and show the variance share of its two components.
pca_tr_salurbal = pca_pipeline.fit_transform( salurbal_df[ tr_features ] )
pca_pipeline.named_steps[ 'dimred' ].explained_variance_ratio_

In [None]:
# Store the two principal-component coordinates of each model as columns of the main frame.
pca_ul_x, pca_ul_y = pca_ul_salurbal[ :, :2 ].T
salurbal_df[ 'PCA_UL_X' ] = pca_ul_x
salurbal_df[ 'PCA_UL_Y' ] = pca_ul_y

pca_tr_x, pca_tr_y = pca_tr_salurbal[ :, :2 ].T
salurbal_df[ 'PCA_TR_X' ] = pca_tr_x
salurbal_df[ 'PCA_TR_Y' ] = pca_tr_y

### UMAP

In [None]:
# Impute, standardize, then embed with UMAP.
umap_pipeline = Pipeline( [
    # StandardScaler passes NaN through and the embedding step cannot take it; the UL features
    # contain missing values (57 rows per the cleaning section), so they are median-filled.
    # NOTE(review): median imputation is a modelling choice — confirm.
    ( 'imputing', SimpleImputer( strategy = 'median' ) ),
    ( 'scaling', StandardScaler() ),
    # Seeded like the t-SNE and PCA pipelines so the layout is the same on every run
    ( 'dimred', umap.UMAP( random_state = 1 ) )
] )

In [None]:
# Fit the UMAP pipeline on the UL features and keep the embedded coordinates.
umap_ul_salurbal = umap_pipeline.fit_transform( salurbal_df[ ul_features ] )

In [None]:
# Re-fit the same pipeline object on the TR features (this replaces the UL fit inside it).
umap_tr_salurbal = umap_pipeline.fit_transform( salurbal_df[ tr_features ] )

In [None]:
# Store the first two UMAP coordinates of each model as columns of the main frame.
umap_ul_x, umap_ul_y = umap_ul_salurbal[ :, :2 ].T
salurbal_df[ 'UMAP_UL_X' ] = umap_ul_x
salurbal_df[ 'UMAP_UL_Y' ] = umap_ul_y

umap_tr_x, umap_tr_y = umap_tr_salurbal[ :, :2 ].T
salurbal_df[ 'UMAP_TR_X' ] = umap_tr_x
salurbal_df[ 'UMAP_TR_Y' ] = umap_tr_y

# Visualization

In [None]:
# One panel per embedding (PCA, t-SNE, UMAP); points are coloured by UL profile (1 to 6).
f, ( ax1, ax2, ax3 ) = plt.subplots( 1, 3,  figsize = ( 25, 7 ) )

model = 'UL'
clustering = ''
methods = { 'PCA' : ax1, 'TSNE' : ax2, 'UMAP' : ax3 }
label_column = model + '_PROFILE' + clustering
for method, panel in methods.items():
    x_column = method + '_' + model + '_X'
    y_column = method + '_' + model + '_Y'
    for c in range( 1, 7 ):
        # Rows belonging to the current profile
        members = salurbal_df.loc[ salurbal_df[ label_column ] == c ]
        panel.scatter( members[ x_column ], members[ y_column ], label = str( c ), edgecolor = 'black', alpha = 0.5 )
    panel.set_title( method )
    panel.legend()
plt.show()

# One panel per embedding; points are coloured by the MBE K-Means cluster (1 to 5).
# NOTE(review): the *_MBE_X / *_MBE_Y coordinate columns are not created under these names
# in the visible dimensionality-reduction cells (they write *_UL_* / *_TR_*) — confirm.
f, ( ax1, ax2, ax3 ) = plt.subplots( 1, 3,  figsize = ( 25, 7 ) )

model = 'MBE'
clustering = '_KMEANS'
methods = { 'PCA' : ax1, 'TSNE' : ax2, 'UMAP' : ax3 }
cluster_column = model + '_CLUSTER' + clustering
for method in methods:
    panel = methods[ method ]
    x_column, y_column = method + '_' + model + '_X', method + '_' + model + '_Y'
    for c in range( 1, 6 ):
        in_cluster = salurbal_df[ cluster_column ] == c
        panel.scatter( salurbal_df.loc[ in_cluster, x_column ], salurbal_df.loc[ in_cluster, y_column ], label = str( c ), edgecolor = 'black', alpha = 0.5 )
    panel.set_title( method )
    panel.legend()
plt.show()

# One panel per embedding; points are coloured by the MBE spectral cluster (1 to 5).
# NOTE(review): the *_MBE_X / *_MBE_Y coordinate columns are not created under these names
# in the visible dimensionality-reduction cells (they write *_UL_* / *_TR_*) — confirm.
f, ( ax1, ax2, ax3 ) = plt.subplots( 1, 3,  figsize = ( 25, 7 ) )

model = 'MBE'
clustering = '_SPECTRAL'
methods = { 'PCA' : ax1, 'TSNE' : ax2, 'UMAP' : ax3 }
cluster_column = model + '_CLUSTER' + clustering
for method, panel in methods.items():
    x_column = method + '_' + model + '_X'
    y_column = method + '_' + model + '_Y'
    for c in range( 1, 6 ):
        # Rows assigned to the current cluster
        members = salurbal_df.loc[ salurbal_df[ cluster_column ] == c ]
        panel.scatter( members[ x_column ], members[ y_column ], label = str( c ), edgecolor = 'black', alpha = 0.5 )
    panel.set_title( method )
    panel.legend()
plt.show()

In [None]:
# One panel per embedding (PCA, t-SNE, UMAP); points are coloured by TR profile (1 to 6).
f, ( ax1, ax2, ax3 ) = plt.subplots( 1, 3,  figsize = ( 25, 7 ) )

model = 'TR'
clustering = ''
methods = { 'PCA' : ax1, 'TSNE' : ax2, 'UMAP' : ax3 }
label_column = model + '_PROFILE' + clustering
for method in methods:
    panel = methods[ method ]
    x_column, y_column = method + '_' + model + '_X', method + '_' + model + '_Y'
    for c in range( 1, 7 ):
        in_profile = salurbal_df[ label_column ] == c
        panel.scatter( salurbal_df.loc[ in_profile, x_column ], salurbal_df.loc[ in_profile, y_column ], label = str( c ), edgecolor = 'black', alpha = 0.5 )
    panel.set_title( method )
    panel.legend()
plt.show()

# One panel per embedding; points are coloured by the MT K-Means cluster (1 to 5).
# NOTE(review): the *_MT_X / *_MT_Y coordinate columns are not created under these names
# in the visible dimensionality-reduction cells (they write *_UL_* / *_TR_*) — confirm.
f, ( ax1, ax2, ax3 ) = plt.subplots( 1, 3,  figsize = ( 25, 7 ) )

model = 'MT'
clustering = '_KMEANS'
methods = { 'PCA' : ax1, 'TSNE' : ax2, 'UMAP' : ax3 }
cluster_column = model + '_CLUSTER' + clustering
for method, panel in methods.items():
    x_column = method + '_' + model + '_X'
    y_column = method + '_' + model + '_Y'
    for c in range( 1, 6 ):
        # Rows assigned to the current cluster
        members = salurbal_df.loc[ salurbal_df[ cluster_column ] == c ]
        panel.scatter( members[ x_column ], members[ y_column ], label = str( c ), edgecolor = 'black', alpha = 0.5 )
    panel.set_title( method )
    panel.legend()
plt.show()

# One panel per embedding; points are coloured by the MT spectral cluster (1 to 5).
# NOTE(review): the *_MT_X / *_MT_Y coordinate columns are not created under these names
# in the visible dimensionality-reduction cells (they write *_UL_* / *_TR_*) — confirm.
f, ( ax1, ax2, ax3 ) = plt.subplots( 1, 3,  figsize = ( 25, 7 ) )

model = 'MT'
clustering = '_SPECTRAL'
methods = { 'PCA' : ax1, 'TSNE' : ax2, 'UMAP' : ax3 }
cluster_column = model + '_CLUSTER' + clustering
for method in methods:
    panel = methods[ method ]
    x_column, y_column = method + '_' + model + '_X', method + '_' + model + '_Y'
    for c in range( 1, 6 ):
        in_cluster = salurbal_df[ cluster_column ] == c
        panel.scatter( salurbal_df.loc[ in_cluster, x_column ], salurbal_df.loc[ in_cluster, y_column ], label = str( c ), edgecolor = 'black', alpha = 0.5 )
    panel.set_title( method )
    panel.legend()
plt.show()

In [None]:
# Last look at the frame, now including the embedding columns, before exporting it.
salurbal_df.head()

In [None]:
# Export the frame as CSV with ';' as field separator and ',' as decimal mark, without the index.
salurbal_df.to_csv( '../data/modelos_l2.csv', sep = ';', decimal = ',', index = False )

# Explanations

In [None]:
import lime
import lime.lime_tabular

In [None]:
import sklearn
import sklearn.datasets
import sklearn.ensemble
import numpy as np
import lime
import lime.lime_tabular
from __future__ import print_function
np.random.seed(1)

In [None]:
# Load scikit-learn's bundled iris dataset as a small example for the explainer.
iris = sklearn.datasets.load_iris()

In [None]:
# Hold out part of the data: 80% goes to training. No random_state is passed to the split itself.
train, test, labels_train, labels_test = sklearn.model_selection.train_test_split(iris.data, iris.target, train_size=0.80)

In [None]:
# Fit a 500-tree random forest on the training split.
rf = sklearn.ensemble.RandomForestClassifier(n_estimators=500)
rf.fit(train, labels_train)

In [None]:
# Accuracy of the forest on the held-out split.
sklearn.metrics.accuracy_score(labels_test, rf.predict(test))

In [None]:
# Class names of the iris dataset, used as labels by the explainer.
iris.target_names

In [None]:
# Build a LIME tabular explainer from the training matrix, with continuous features discretized.
explainer = lime.lime_tabular.LimeTabularExplainer(train, feature_names=iris.feature_names, class_names=iris.target_names, discretize_continuous=True)

In [None]:
# Pick a random test row and explain the forest's prediction for it,
# limited to 2 features and the single top label.
i = np.random.randint(0, test.shape[0])
exp = explainer.explain_instance(test[i], rf.predict_proba, num_features=2, top_labels=1)

In [None]:
# Render the explanation inline, including the feature-value table.
exp.show_in_notebook(show_table=True, show_all=False)

In [None]:
# Size of the held-out iris split.
test.shape

In [None]:
#salurbal_df[ [ 'MT_PROB1', 'MT_PROB2', 'MT_PROB3', 'MT_PROB4', 'MT_PROB5' ] ].sum( axis = 1 )

In [None]:
# NOTE(review): mt_features is not defined in the visible part of this notebook — confirm
# where it comes from (presumably an earlier name of the TR feature list).
salurbal_df[ mt_features ].dtypes

In [None]:
# Feature matrix (plain NumPy array) handed to the explainer below, followed by its shape.
salurbal_mx = salurbal_df[ mt_features ].values
salurbal_mx.shape

In [None]:
# Stand-in for a model's probability function, used as the prediction callable for LIME.
# NOTE(review): the callable ignores its argument and always returns the MT_PROB1..5 columns
# of the whole frame. explain_instance hands the perturbed samples to this function and
# presumably expects one probability row per sample — confirm, and replace this with the
# real model's predict_proba if so. The MT_PROB* columns are not created in the visible
# part of this notebook either.
salurbal_mx_probs = lambda x: salurbal_df[ [ 'MT_PROB1', 'MT_PROB2', 'MT_PROB3', 'MT_PROB4', 'MT_PROB5' ] ].values
salurbal_mx_probs

In [None]:
# Smoke call: the argument is irrelevant because the callable does not use it.
salurbal_mx_probs( 1 )

In [None]:
# LIME explainer over the feature matrix, with five classes labelled '1' to '5'.
# This rebinds the name `explainer` used earlier for the iris example.
explainer = lime.lime_tabular.LimeTabularExplainer( salurbal_mx, feature_names = mt_features, class_names = [ '1' , '2', '3', '4', '5' ] )

In [None]:
# Draw a random row index of the feature matrix and display it.
i = np.random.randint( 0, salurbal_mx.shape[ 0 ] )
i

In [None]:
# Explain the selected row using up to 7 features.
# NOTE(review): the prediction callable passed here does not depend on its input — verify
# that the resulting explanation is meaningful.
exp = explainer.explain_instance( salurbal_mx[ i ], salurbal_mx_probs, num_features = 7 )