# Kmeans Modelos Seleccionados

## 1. Importar librerías

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
from sklearn.cluster import KMeans
from random import sample
from sklearn.ensemble import IsolationForest
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
import shap

#Mostrar todas las columnas de un dataframe
pd.set_option('display.max_columns', None)

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

## 2. Importación de data

In [2]:
#Seleccionar ruta de la data
dataframe = pd.read_csv(r'C:\GIT_Cumplimiento\epic\EPIC003\data\modeling\03_train.csv', encoding='latin')
dataframe.shape

(37639, 13)

## 3. Modelo

In [3]:
#REviso columnas
dataframe.columns

Index(['Unnamed: 0', 'PERIODO', 'CODCLAVECIC', 'NBRCLIORDENANTE', 'SEGMENTO',
       'MTO_TRANSF', 'CTD_OPE', 'FLG_PEP', 'FLG_PROF', 'FLG_PAR', 'FLG_PERFIL',
       'CTDEVAL', 'FLG_PAIS'],
      dtype='object')

### Model 1: Variables modelo estándar (Todas las variables)

In [4]:
#Selecciono variables
ds1=dataframe[['MTO_TRANSF', 'CTD_OPE', 'FLG_PEP', 'FLG_PROF', 'FLG_PERFIL','CTDEVAL', 'FLG_PAIS']]
#Estandarizo variables
data1_cluster = MinMaxScaler().fit_transform(ds1)

#Calculo silueta final  k=13
ki=13
outputKMeans = KMeans(n_clusters = ki, init='k-means++', random_state = 7).fit(data1_cluster)
scoreSilhoutte = metrics.silhouette_score(data1_cluster, outputKMeans.labels_, metric='euclidean')
print(" k=", ki," SC=", scoreSilhoutte)

 k= 13  SC= 0.9763299901994965


In [6]:
#Entreno modelo
clustering1=KMeans(n_clusters=13,init='k-means++', random_state = 7)
clustering1.fit(data1_cluster)
#Asigno etiquetas de cluster como variable
dataframe1=dataframe.copy()
dataframe1['N_Cluster']=clustering1.labels_
#Valido creación de variable con clusters
dataframe1.head(5)

Unnamed: 0.1,Unnamed: 0,PERIODO,CODCLAVECIC,NBRCLIORDENANTE,SEGMENTO,MTO_TRANSF,CTD_OPE,FLG_PEP,FLG_PROF,FLG_PAR,FLG_PERFIL,CTDEVAL,FLG_PAIS,N_Cluster
0,0,202206,812,CORTES DE POLAR SANDRA MIRIAN,ENALTA,100000.0,1,0,1,2,0,0,1,3
1,1,202204,898,FORNO CASTRO-POZO XENNIA MARIA,ENALTA,5416.06,1,0,0,0,0,0,0,1
2,2,202207,961,INOPE MANTERO CARLOS ALBERTO,ENALTA,1218.52,1,0,0,0,0,1,0,1
3,3,202209,961,INOPE MANTERO CARLOS ALBERTO,ENALTA,3703.35,1,0,0,2,0,1,1,2
4,6,202209,1604,MARIN ALCALDE WILDER JUAN,EXCLUSIVO,2906.34,1,0,0,2,0,0,1,2


In [7]:
#Valido número de casos por cluster
dataframe1.groupby('N_Cluster').N_Cluster.count()

N_Cluster
0      4299
1     14255
2      8927
3      7681
4       872
5       454
6       405
7       282
8       187
9       156
10       41
11       44
12       36
Name: N_Cluster, dtype: int64

In [15]:
dataframe2.groupby('N_Cluster').N_Cluster.count().to_clipboard()

In [8]:
#Perfilamiento media por variable por cluster
dataframe1.groupby('N_Cluster').mean().round(3)

Unnamed: 0_level_0,Unnamed: 0,PERIODO,CODCLAVECIC,MTO_TRANSF,CTD_OPE,FLG_PEP,FLG_PROF,FLG_PAR,FLG_PERFIL,CTDEVAL,FLG_PAIS
N_Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,27729.258,202206.628,8255207.0,30342.686,1.383,0.0,1.0,0.0,0.0,0.512,0.0
1,24549.366,202206.562,6649207.0,24650.373,1.35,0.0,0.0,0.0,0.0,0.796,0.0
2,26186.814,202206.585,7144770.0,29643.758,1.309,0.0,0.0,1.916,0.0,0.349,1.0
3,32746.495,202206.613,10432680.0,24233.416,1.322,0.0,1.0,1.956,0.0,0.148,1.0
4,23673.383,202206.586,6361951.0,108206.659,2.073,0.0,0.0,0.0,1.0,1.272,0.0
5,25952.37,202206.542,7061603.0,70331.459,2.115,0.0,0.0,1.85,1.0,0.648,1.0
6,33022.262,202206.454,10456170.0,31428.259,2.047,0.0,1.0,1.936,1.0,0.18,1.0
7,12899.149,202206.762,1848288.0,19289.151,1.273,1.0,0.0,0.0,0.0,5.468,0.0
8,24393.385,202206.487,6880010.0,53095.626,2.15,0.0,1.0,0.0,1.0,0.711,0.0
9,17945.09,202206.41,3176164.0,27845.52,1.276,1.0,0.0,1.936,0.0,1.096,1.0


In [9]:
#Perfilamiento desviacióne estándar por variable por cluster
dataframe1.groupby('N_Cluster').std().round(3)

Unnamed: 0_level_0,Unnamed: 0,PERIODO,CODCLAVECIC,MTO_TRANSF,CTD_OPE,FLG_PEP,FLG_PROF,FLG_PAR,FLG_PERFIL,CTDEVAL,FLG_PAIS
N_Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,15932.179,1.721,8410698.41,494173.346,0.893,0.0,0.0,0.0,0.0,2.609,0.0
1,15693.392,1.722,7408430.564,190008.526,0.922,0.0,0.0,0.0,0.0,2.891,0.0
2,15193.957,1.717,7354833.419,92479.763,0.907,0.0,0.0,0.277,0.0,1.815,0.0
3,14704.753,1.718,8505252.579,94318.559,0.819,0.0,0.0,0.205,0.0,0.993,0.0
4,15812.143,1.685,7424540.302,597037.022,1.47,0.0,0.0,0.0,0.0,4.052,0.0
5,15189.822,1.746,7344182.052,232593.415,1.502,0.0,0.0,0.357,0.0,2.566,0.0
6,14444.039,1.705,8285163.436,61950.642,1.889,0.0,0.0,0.245,0.0,1.024,0.0
7,9788.752,1.711,2456990.336,54352.24,0.59,0.0,0.0,0.0,0.0,20.206,0.0
8,16313.55,1.711,8071709.977,205642.104,1.502,0.0,0.0,0.0,0.0,2.376,0.0
9,11365.453,1.741,3628067.349,67745.929,0.741,0.0,0.0,0.246,0.0,2.896,0.0


In [10]:
#Exporto tabla con variable del modelo a excel
dataframe1.to_excel(r'C:\GIT_Cumplimiento\epic\EPIC003\reports\others\KM1_13seg.xlsx',index=False,header=True)

### Model 2: Variables modelo estándar sin variable CTDEVAL

In [26]:
#Selecciono variables
ds2=dataframe[['MTO_TRANSF', 'CTD_OPE', 'FLG_PEP', 'FLG_PROF', 'FLG_PERFIL','FLG_PAIS']]
#Estandarizo variables
data2_cluster = MinMaxScaler().fit_transform(ds2)
#Selecciono cluster silueta final
ki = 14
#Calculo silueta
outputKMeans = KMeans(n_clusters = ki, init='k-means++', random_state = 7).fit(data2_cluster)
scoreSilhoutte = metrics.silhouette_score(data2_cluster, outputKMeans.labels_, metric='euclidean')
print(" k=", ki," SC=", scoreSilhoutte)

 k= 14  SC= 0.9812538774004375


In [27]:
#Entreno modelo
clustering2=KMeans(n_clusters=14,init='k-means++', random_state = 7)
clustering2.fit(data2_cluster)
#Asigno etiquetas de cluster como variable
dataframe2=dataframe.copy()
dataframe2['N_Cluster']=clustering2.labels_
#Valido creación de variable con clusters
dataframe2.head(5)

Unnamed: 0.1,Unnamed: 0,PERIODO,CODCLAVECIC,NBRCLIORDENANTE,SEGMENTO,MTO_TRANSF,CTD_OPE,FLG_PEP,FLG_PROF,FLG_PAR,FLG_PERFIL,CTDEVAL,FLG_PAIS,N_Cluster
0,0,202206,812,CORTES DE POLAR SANDRA MIRIAN,ENALTA,100000.0,1,0,1,2,0,0,1,1
1,1,202204,898,FORNO CASTRO-POZO XENNIA MARIA,ENALTA,5416.06,1,0,0,0,0,0,0,2
2,2,202207,961,INOPE MANTERO CARLOS ALBERTO,ENALTA,1218.52,1,0,0,0,0,1,0,2
3,3,202209,961,INOPE MANTERO CARLOS ALBERTO,ENALTA,3703.35,1,0,0,2,0,1,1,0
4,6,202209,1604,MARIN ALCALDE WILDER JUAN,EXCLUSIVO,2906.34,1,0,0,2,0,0,1,0


In [28]:
#Valido número de casos por cluster
dataframe2.groupby('N_Cluster').N_Cluster.count()

N_Cluster
0      8927
1      7681
2     14255
3      4299
4       282
5       872
6       454
7       405
8       187
9       156
10       41
11       40
12       27
13       13
Name: N_Cluster, dtype: int64

In [29]:
#Perfilamiento media por variable por cluster
dataframe2.groupby('N_Cluster').mean().round(3)

Unnamed: 0_level_0,Unnamed: 0,PERIODO,CODCLAVECIC,MTO_TRANSF,CTD_OPE,FLG_PEP,FLG_PROF,FLG_PAR,FLG_PERFIL,CTDEVAL,FLG_PAIS
N_Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,26186.814,202206.585,7144770.0,29643.758,1.309,0.0,0.0,1.916,0.0,0.349,1.0
1,32746.495,202206.613,10432680.0,24233.416,1.322,0.0,1.0,1.956,0.0,0.148,1.0
2,24549.366,202206.562,6649207.0,24650.373,1.35,0.0,0.0,0.0,0.0,0.796,0.0
3,27729.258,202206.628,8255207.0,30342.686,1.383,0.0,1.0,0.0,0.0,0.512,0.0
4,12899.149,202206.762,1848288.0,19289.151,1.273,1.0,0.0,0.0,0.0,5.468,0.0
5,23673.383,202206.586,6361951.0,108206.659,2.073,0.0,0.0,0.0,1.0,1.272,0.0
6,25952.37,202206.542,7061603.0,70331.459,2.115,0.0,0.0,1.85,1.0,0.648,1.0
7,33022.262,202206.454,10456170.0,31428.259,2.047,0.0,1.0,1.936,1.0,0.18,1.0
8,24393.385,202206.487,6880010.0,53095.626,2.15,0.0,1.0,0.0,1.0,0.711,0.0
9,17945.09,202206.41,3176164.0,27845.52,1.276,1.0,0.0,1.936,0.0,1.096,1.0


In [30]:
#Perfilamiento desviacióne estándar por variable por cluster
dataframe2.groupby('N_Cluster').std().round(3)

Unnamed: 0_level_0,Unnamed: 0,PERIODO,CODCLAVECIC,MTO_TRANSF,CTD_OPE,FLG_PEP,FLG_PROF,FLG_PAR,FLG_PERFIL,CTDEVAL,FLG_PAIS
N_Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,15193.957,1.717,7354833.419,92479.763,0.907,0.0,0.0,0.277,0.0,1.815,0.0
1,14704.753,1.718,8505252.579,94318.559,0.819,0.0,0.0,0.205,0.0,0.993,0.0
2,15693.392,1.722,7408430.564,190008.526,0.922,0.0,0.0,0.0,0.0,2.891,0.0
3,15932.179,1.721,8410698.41,494173.346,0.893,0.0,0.0,0.0,0.0,2.609,0.0
4,9788.752,1.711,2456990.336,54352.24,0.59,0.0,0.0,0.0,0.0,20.206,0.0
5,15812.143,1.685,7424540.302,597037.022,1.47,0.0,0.0,0.0,0.0,4.052,0.0
6,15189.822,1.746,7344182.052,232593.415,1.502,0.0,0.0,0.357,0.0,2.566,0.0
7,14444.039,1.705,8285163.436,61950.642,1.889,0.0,0.0,0.245,0.0,1.024,0.0
8,16313.55,1.711,8071709.977,205642.104,1.502,0.0,0.0,0.0,0.0,2.376,0.0
9,11365.453,1.741,3628067.349,67745.929,0.741,0.0,0.0,0.246,0.0,2.896,0.0


In [31]:
#Exporto tabla con variable del modelo a excel
dataframe2.to_excel(r'C:\GIT_Cumplimiento\epic\EPIC003\reports\others\KM2_13seg.xlsx',index=False,header=True)

### Model 3: Variables modelo estándar sin variables CTDOPE

In [19]:
#Selecciono variables
ds3=dataframe[['MTO_TRANSF', 'CTDEVAL', 'FLG_PEP', 'FLG_PROF', 'FLG_PERFIL','FLG_PAIS']]
#Estandarizo variables
data3_cluster = MinMaxScaler().fit_transform(ds3)
#Selecciono números de clusters silueta final
k = 14
#Calculo silueta
outputKMeans = KMeans(n_clusters = ki, init='k-means++', random_state = 7).fit(data3_cluster)
scoreSilhoutte = metrics.silhouette_score(data3_cluster, outputKMeans.labels_, metric='euclidean')
print(" k=", ki," SC=", scoreSilhoutte)

 k= 13  SC= 0.991795090764557


In [32]:
#Entreno modelo
clustering3=KMeans(n_clusters=14,init='k-means++', random_state = 7)
clustering3.fit(data3_cluster)
#Asigno etiquetas de cluster como variable
dataframe3=dataframe.copy()
dataframe3['N_Cluster']=clustering3.labels_
#Valido creación de variable con clusters
dataframe3.head(5)

Unnamed: 0.1,Unnamed: 0,PERIODO,CODCLAVECIC,NBRCLIORDENANTE,SEGMENTO,MTO_TRANSF,CTD_OPE,FLG_PEP,FLG_PROF,FLG_PAR,FLG_PERFIL,CTDEVAL,FLG_PAIS,N_Cluster
0,0,202206,812,CORTES DE POLAR SANDRA MIRIAN,ENALTA,100000.0,1,0,1,2,0,0,1,2
1,1,202204,898,FORNO CASTRO-POZO XENNIA MARIA,ENALTA,5416.06,1,0,0,0,0,0,0,1
2,2,202207,961,INOPE MANTERO CARLOS ALBERTO,ENALTA,1218.52,1,0,0,0,0,1,0,1
3,3,202209,961,INOPE MANTERO CARLOS ALBERTO,ENALTA,3703.35,1,0,0,2,0,1,1,0
4,6,202209,1604,MARIN ALCALDE WILDER JUAN,EXCLUSIVO,2906.34,1,0,0,2,0,0,1,0


In [33]:
#Valido número de casos por cluster
dataframe3.groupby('N_Cluster').N_Cluster.count()

N_Cluster
0      8927
1     14255
2      7681
3      4299
4       872
5       454
6       405
7       282
8       187
9       156
10       41
11       40
12       27
13       13
Name: N_Cluster, dtype: int64

In [34]:
#Perfilamiento media por variable por cluster
dataframe3.groupby('N_Cluster').mean().round(3)

Unnamed: 0_level_0,Unnamed: 0,PERIODO,CODCLAVECIC,MTO_TRANSF,CTD_OPE,FLG_PEP,FLG_PROF,FLG_PAR,FLG_PERFIL,CTDEVAL,FLG_PAIS
N_Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,26186.814,202206.585,7144770.0,29643.758,1.309,0.0,0.0,1.916,0.0,0.349,1.0
1,24549.366,202206.562,6649207.0,24650.373,1.35,0.0,0.0,0.0,0.0,0.796,0.0
2,32746.495,202206.613,10432680.0,24233.416,1.322,0.0,1.0,1.956,0.0,0.148,1.0
3,27729.258,202206.628,8255207.0,30342.686,1.383,0.0,1.0,0.0,0.0,0.512,0.0
4,23673.383,202206.586,6361951.0,108206.659,2.073,0.0,0.0,0.0,1.0,1.272,0.0
5,25952.37,202206.542,7061603.0,70331.459,2.115,0.0,0.0,1.85,1.0,0.648,1.0
6,33022.262,202206.454,10456170.0,31428.259,2.047,0.0,1.0,1.936,1.0,0.18,1.0
7,12899.149,202206.762,1848288.0,19289.151,1.273,1.0,0.0,0.0,0.0,5.468,0.0
8,24393.385,202206.487,6880010.0,53095.626,2.15,0.0,1.0,0.0,1.0,0.711,0.0
9,17945.09,202206.41,3176164.0,27845.52,1.276,1.0,0.0,1.936,0.0,1.096,1.0


In [35]:
#Perfilamiento desviacióne estándar por variable por cluster
dataframe3.groupby('N_Cluster').std().round(3)

Unnamed: 0_level_0,Unnamed: 0,PERIODO,CODCLAVECIC,MTO_TRANSF,CTD_OPE,FLG_PEP,FLG_PROF,FLG_PAR,FLG_PERFIL,CTDEVAL,FLG_PAIS
N_Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,15193.957,1.717,7354833.419,92479.763,0.907,0.0,0.0,0.277,0.0,1.815,0.0
1,15693.392,1.722,7408430.564,190008.526,0.922,0.0,0.0,0.0,0.0,2.891,0.0
2,14704.753,1.718,8505252.579,94318.559,0.819,0.0,0.0,0.205,0.0,0.993,0.0
3,15932.179,1.721,8410698.41,494173.346,0.893,0.0,0.0,0.0,0.0,2.609,0.0
4,15812.143,1.685,7424540.302,597037.022,1.47,0.0,0.0,0.0,0.0,4.052,0.0
5,15189.822,1.746,7344182.052,232593.415,1.502,0.0,0.0,0.357,0.0,2.566,0.0
6,14444.039,1.705,8285163.436,61950.642,1.889,0.0,0.0,0.245,0.0,1.024,0.0
7,9788.752,1.711,2456990.336,54352.24,0.59,0.0,0.0,0.0,0.0,20.206,0.0
8,16313.55,1.711,8071709.977,205642.104,1.502,0.0,0.0,0.0,0.0,2.376,0.0
9,11365.453,1.741,3628067.349,67745.929,0.741,0.0,0.0,0.246,0.0,2.896,0.0


In [36]:
#Exporto tabla con variable del modelo a excel
dataframe3.to_excel(r'C:\GIT_Cumplimiento\epic\EPIC003\reports\others\KM3_13seg.xlsx',index=False,header=True)