## Modelos IForest Seleccionado

## 1. Importación de librerías

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
from sklearn.cluster import KMeans
from random import sample
from sklearn.ensemble import IsolationForest
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
import shap

#Mostrar todas las columnas de un dataframe
pd.set_option('display.max_columns', None)

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

## 2. Importación data

In [2]:
#Seleccionar ruta de la data
dataframe = pd.read_csv(r'C:\GIT_Cumplimiento\epic\EPIC003\data\modeling\03_train_v4.csv', encoding='latin')
dataframe.shape

(13516, 13)

## 3. Universo de riesgo

In [3]:
#Importo modelo kmeans final
import pickle 
loaded_KM = pickle.load(open(r'KMeans1_9seg_stairs.model', "rb"))

In [4]:
#Selecciono solo las variables que necesita el modelo
dataset=dataframe[['MTO_TRANSF', 'CTD_OPE', 'FLG_PEP', 'FLG_PROF', 'FLG_PERFIL','CTDEVAL', 'FLG_PAIS']]

#Estandarizo variables
dataset_cluster = MinMaxScaler().fit_transform(dataset)

In [5]:
#Aplico el modelo cargado sobre la data. Columna Outlier es la que tiene el resultado del modelo
dataframe['N_CLUSTER']=loaded_KM.predict(dataset_cluster)
dataframe.head()


Unnamed: 0.1,Unnamed: 0,PERIODO,CODCLAVECIC,NBRCLIORDENANTE,SEGMENTO,MTO_TRANSF,CTD_OPE,FLG_PEP,FLG_PROF,FLG_PAR,FLG_PERFIL,CTDEVAL,FLG_PAIS,N_CLUSTER
0,0,202206,812,CORTES DE POLAR SANDRA MIRIAN,ENALTA,100000.0,1,0,1,2,0,0,1,1
1,8,202209,1750,CONROY DE MARSANO MARIA TERESA,PRIVADA,17357.0,1,0,0,0,0,15,0,2
2,9,202204,1973,APARCANA HERRERA MARIELA VILMA,CONSUMO,13931.65,1,0,1,0,0,0,0,3
3,12,202204,2249,DANERI PREIS GUSTAVO,ENALTA,13275.0,1,0,1,0,0,5,0,3
4,14,202208,2374,FIEDLER VASQUEZ-MEJIA PAUL,ENALTA,10670.96,2,0,1,0,0,0,0,3


In [6]:
#Defino Universo basado en clusters de riesgo seleccionados
dataseg=dataframe[dataframe['N_CLUSTER'].isin([4,5,6,7,8])]
dataseg.shape

(1217, 14)

In [7]:
#Visualizo dataset
dataseg.head()

Unnamed: 0.1,Unnamed: 0,PERIODO,CODCLAVECIC,NBRCLIORDENANTE,SEGMENTO,MTO_TRANSF,CTD_OPE,FLG_PEP,FLG_PROF,FLG_PAR,FLG_PERFIL,CTDEVAL,FLG_PAIS,N_CLUSTER
8,39,202206,4319,SORIA SUAREZ ALCIDES WILLIAM,ENALTA,100000.0,1,0,0,0,1,0,0,4
14,62,202209,6083,CALMET PAPANICOLAU GONZALO,ENALTA,102653.0,3,0,1,0,1,1,0,6
15,64,202205,6420,ARAUJO ZAPATA MILAGRITOS FRANCISCA,EXCLUSIVO,20000.0,1,1,0,0,0,0,0,8
30,115,202208,9940,LEGUIA OREZZOLI JOAQUIN FELIPE,ENALTA,42096.52,3,0,0,0,1,1,0,4
51,184,202204,12810,RUBIN DE-COL JOSE ANTONIO,ENALTA,30000.0,1,0,1,0,1,0,0,6


## 3. Modelamiento

In [8]:
#Reviso columnas
dataseg.columns

Index(['Unnamed: 0', 'PERIODO', 'CODCLAVECIC', 'NBRCLIORDENANTE', 'SEGMENTO',
       'MTO_TRANSF', 'CTD_OPE', 'FLG_PEP', 'FLG_PROF', 'FLG_PAR', 'FLG_PERFIL',
       'CTDEVAL', 'FLG_PAIS', 'N_CLUSTER'],
      dtype='object')

In [9]:
#Variables
ds1=dataseg[['MTO_TRANSF', 'CTD_OPE', 'FLG_PEP', 'FLG_PROF', 'FLG_PERFIL','CTDEVAL', 'FLG_PAIS']]

#Modelo base IF
iforest1 = IsolationForest(n_estimators = 500, contamination=0.1,random_state=7)
iforest1.fit(ds1)

#score de silueta
datasetv1 = dataseg.copy()
datasetv1['IF_LABEL']=iforest1.predict(ds1)
scoreSilhoutte = metrics.silhouette_score(ds1, datasetv1['IF_LABEL'], metric='euclidean') 
print('Silueta:',scoreSilhoutte)

#Casos estimados
datasetv1.groupby(['PERIODO','IF_LABEL']).size()

Silueta: 0.5258252251435743


PERIODO  IF_LABEL
202204   -1           25
          1          166
202205   -1           21
          1          198
202206   -1           16
          1          182
202207   -1           16
          1          155
202208   -1           21
          1          208
202209   -1           23
          1          186
dtype: int64

In [10]:
datasetv1.groupby(['PERIODO','IF_LABEL']).size().to_clipboard()

In [11]:
#Reviso el total de outliers
datasetv1.groupby(['IF_LABEL']).size()

IF_LABEL
-1     122
 1    1095
dtype: int64

In [12]:
#Copio a excel tabla de medias por grupo
datasetv1.groupby('IF_LABEL').mean().round(3).to_clipboard(excel=True)
#Tabla de medias por grupo
datasetv1.groupby('IF_LABEL').mean().round(3)

Unnamed: 0_level_0,Unnamed: 0,PERIODO,CODCLAVECIC,MTO_TRANSF,CTD_OPE,FLG_PEP,FLG_PROF,FLG_PAR,FLG_PERFIL,CTDEVAL,FLG_PAIS,N_CLUSTER
IF_LABEL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
-1,13780.975,202206.459,2547391.713,142254.452,3.115,0.582,0.189,0.844,0.492,13.09,0.492,6.631
1,25686.273,202206.547,7070712.661,48591.456,2.232,0.067,0.278,0.893,0.939,0.7,0.471,5.252


In [13]:
#Exporto a excel tabla scoreada
datasetv1.to_excel(r'C:\GIT_Cumplimiento\epic\EPIC003\reports\others\IForest_resultados_vf.xlsx',index=False,header=True)

In [15]:
#Guardar el modelo

import pickle
pickle.dump(iforest1, open('IF_EPIC003_alerta.model', 'wb'))

## 4. Test sobre mes de prueba

In [15]:
#Seleccionar ruta de la data
df_test = pd.read_csv(r'C:\GIT_Cumplimiento\epic\EPIC003\data\modeling\04_test.csv', encoding='latin')
df_test.shape

(6148, 13)

In [16]:
df_test.describe()

Unnamed: 0.1,Unnamed: 0,PERIODO,CODCLAVECIC,MTO_TRANSF,CTD_OPE,FLG_PEP,FLG_PROF,FLG_PAR,FLG_PERFIL,CTDEVAL,FLG_PAIS
count,6148.0,6148.0,6148.0,6148.0,6148.0,6148.0,6148.0,6148.0,6148.0,6148.0,6148.0
mean,27242.907775,202210.0,7863548.0,22947.91,1.371991,0.015777,0.354099,0.91298,0.040501,0.521958,0.471698
std,15721.590187,0.0,7992482.0,147845.2,0.904794,0.124624,0.478278,0.980901,0.197147,2.58388,0.499239
min,4.0,202210.0,1006.0,1000.36,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,13720.0,202210.0,1651027.0,2863.932,1.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,27261.5,202210.0,4882740.0,6379.36,1.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,40878.75,202210.0,12066540.0,15000.0,1.0,0.0,1.0,2.0,0.0,0.0,1.0
max,54557.0,202210.0,28062310.0,9300000.0,21.0,1.0,1.0,2.0,1.0,94.0,1.0


In [17]:
datatest=df_test.loc[(df_test['PERIODO']==202210) & (df_test['MTO_TRANSF']>10000) & (df_test['MTO_TRANSF']<500000)]
datatest.shape

(2021, 13)

In [18]:
#Selecciono solo las variables que necesita el modelo
dstest=datatest[['MTO_TRANSF', 'CTD_OPE', 'FLG_PEP', 'FLG_PROF', 'FLG_PERFIL','CTDEVAL', 'FLG_PAIS']]

#Estandarizo variables
datatest_cluster = MinMaxScaler().fit_transform(dstest)

#Aplico el modelo cargado sobre la data. Columna Outlier es la que tiene el resultado del modelo
datatest['N_CLUSTER']=loaded_KM.predict(datatest_cluster)
datatest.head()


Unnamed: 0.1,Unnamed: 0,PERIODO,CODCLAVECIC,NBRCLIORDENANTE,SEGMENTO,MTO_TRANSF,CTD_OPE,FLG_PEP,FLG_PROF,FLG_PAR,FLG_PERFIL,CTDEVAL,FLG_PAIS,N_CLUSTER
0,4,202210,1006,STOCKHOLM BARRIOS CHRISTIAN,ENALTA,100000.0,1,0,0,0,0,2,0,2
5,55,202210,5879,MARSANO CONROY CLAUDIA,PRIVADA,17948.0,1,0,0,0,0,8,0,2
9,108,202210,9133,VIVAS INGA SONIA ROSARIO,ENALTA,12000.0,1,0,0,0,0,3,0,2
14,132,202210,10636,DE-LA-PUENTE PARDO CARLOS ANTONIO,ENALTA,20000.0,1,0,1,2,0,0,1,1
19,171,202210,12500,ARENAS MEZA MARIA DEL ROSARIO,PRIVADA,150000.0,1,0,1,0,0,0,0,3


In [19]:
#Validoexistencia de clusters de riesgo
datatest['N_CLUSTER'].value_counts()

2    616
0    543
1    507
3    190
4     56
5     40
7     31
8     21
6     17
Name: N_CLUSTER, dtype: int64

In [20]:
#Defino Universo basado en clusters de riesgo seleccionados
datarisk=datatest[datatest['N_CLUSTER'].isin([4,5,6,7,8])]
datarisk.shape

(165, 14)

In [21]:
#Aplico IF sobre datarisk
loaded_IF = pickle.load(open(r'IF_EPIC003_alerta.model', "rb"))

dsrisk=datarisk[['MTO_TRANSF', 'CTD_OPE', 'FLG_PEP', 'FLG_PROF', 'FLG_PERFIL','CTDEVAL', 'FLG_PAIS']]

In [22]:
#Aplico el modelo cargado sobre la data. Columna Outlier es la que tiene el resultado del modelo

datarisk['OUTLIER']=loaded_IF.predict(dsrisk)

datarisk.head()

Unnamed: 0.1,Unnamed: 0,PERIODO,CODCLAVECIC,NBRCLIORDENANTE,SEGMENTO,MTO_TRANSF,CTD_OPE,FLG_PEP,FLG_PROF,FLG_PAR,FLG_PERFIL,CTDEVAL,FLG_PAIS,N_CLUSTER,OUTLIER
33,277,202210,21350,FLORES DIAZ AYDA LUZ,PEQ. EMPRESA,13308.94,1,0,1,2,1,0,1,7,1
54,438,202210,33547,VASQUEZ VELA ANDERSON,ENALTA,17434.23,3,0,1,1,1,0,1,7,1
75,637,202210,44432,WATSON BARBER CHARLES PHILLIP,PEQ. EMPRESA,10725.3,1,0,1,2,1,0,1,7,1
82,700,202210,45857,MERINO CABALLERO ALFREDO ABEL,ENALTA,32000.0,1,0,0,0,1,6,0,4,1
102,894,202210,51733,BANDA PRADO VIDAL RAYMUNDO,PEQ. EMPRESA,19651.35,1,1,0,2,0,0,1,8,1


In [23]:
datarisk['OUTLIER'].value_counts()

 1    151
-1     14
Name: OUTLIER, dtype: int64

In [24]:
datarisk.to_excel("ALERTAS_IF.xlsx")