# 1. Importación de Librerías

In [None]:
# Make the parent directory and its utils/ folder importable from this notebook.
import sys
import os

directory_path = os.path.abspath(os.path.join('..'))
utils_path = os.path.abspath(os.path.join('../utils'))

# Each path is checked on its own, so utils_path is still registered when
# directory_path is already on sys.path, and neither entry is duplicated.
for extra_path in (directory_path, utils_path):
    if extra_path not in sys.path:
        sys.path.append(extra_path)

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import math
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import silhouette_score
from sklearn.ensemble import IsolationForest
from scipy import stats
#from utils.Validator import *
import warnings

# Silence every warning for the rest of the session.
warnings.simplefilter("ignore")

# Show all columns when displaying frames and render floats with 4 decimals.
pd.options.display.max_columns = None
pd.options.display.float_format = lambda value: '%.4f' % value

# 2. Importación de data

In [None]:
# Load the raw dataset (pipe-delimited CSV) and preview its first rows.
df = pd.read_csv(
    filepath_or_buffer=r"../data/modeling/01_raw.csv",
    sep="|",
)
df.head()

In [None]:
# Display the column labels of the loaded dataset.
df.columns

In [None]:
# Keep a copy of the loaded data in df0.
df0 = df.copy(deep=True)
# Column selection over the full column list (currently keeps every column).
df = df[df.columns.tolist()]

#  3. Descriptivo de la información

In [None]:
# Number of records per value of "periodo".
df.groupby(by="periodo").size()

In [None]:
# Keep only the rows whose "periodo" lies in [202210, 202304], both ends included.
df1 = df[df["periodo"].between(202210, 202304)].copy()
df1.shape

In [None]:
# Create a copy of df1 to work on.
df2=df1.copy()

In [None]:
# Count the missing values in each column (nothing is imputed or dropped here).
df2.isna().sum()

In [None]:
# Variables that take part in the correlation analysis.
corr_columns = [
    'monto_total',
    'ctd_trx',
    'ctd_ordenantes',
    'flg_extranjero',
    'FLG_PERFIL',
    'flg_cliente',
    'monto_total_semanal',
    'porcentaje_monto',
    'ctd_trx_semanal',
    'porcentaje_ctd',
]
dfcorr = df2[corr_columns]

In [None]:
# Correlation matrix of the selected variables, drawn as an annotated heatmap
# with the colour scale fixed to [-1, 1].
corr = dfcorr.corr()
axis_labels = corr.columns

plt.figure(figsize=[8, 8])
sns.heatmap(
    data=corr,
    annot=True,
    vmin=-1,
    vmax=1,
    xticklabels=axis_labels,
    yticklabels=axis_labels,
)
plt.show()

In [None]:
# Save the correlation matrix to a CSV file (path is relative to the working directory).
corr.to_csv('Matriz_correlaciones.csv')

In [None]:
# Box plot of monto_total for each periodo; the mean is marked with a white circle.
mean_marker_style = dict(
    marker="o",
    markerfacecolor="white",
    markeredgecolor="black",
    markersize="10",
)
fig = plt.figure(figsize=(8, 8))
sns.boxplot(
    x="periodo",
    y="monto_total",
    data=df2,
    showmeans=True,
    meanprops=mean_marker_style,
)
plt.show()

In [None]:
# Box plot of ctd_trx for each periodo; the mean is marked with a white circle.
mean_marker_style = dict(
    marker="o",
    markerfacecolor="white",
    markeredgecolor="black",
    markersize="10",
)
fig = plt.figure(figsize=(8, 8))
sns.boxplot(
    x="periodo",
    y="ctd_trx",
    data=df2,
    showmeans=True,
    meanprops=mean_marker_style,
)
plt.show()

In [None]:
# Inspect the rows with more than 30 transactions (ctd_trx > 30).
df2.loc[df2['ctd_trx'].gt(30)]

In [None]:
# Inspect the rows whose monto_total exceeds 20000.
df2.loc[df2['monto_total'].gt(20000)]

In [None]:
# Histogram of the monto_total column.
df2.hist(column='monto_total')

In [None]:
# Histogram of the ctd_trx column.
df2.hist(column='ctd_trx')

In [None]:
# Summary statistics of df2 as produced by DataFrame.describe().
df2.describe()

# 4. Estabilidad de Variables

# Variables Numéricas

In [None]:
# Numeric variables whose distribution is tracked across periods.
cols_num = [
    'monto_total',
    'ctd_trx',
    'ctd_ordenantes',
    'monto_total_semanal',
    'porcentaje_monto',
    'ctd_trx_semanal',
    'porcentaje_ctd',
]

In [None]:
# For every numeric variable, bin it into up to 3 quantile ranges (duplicate
# bin edges are dropped) and compute the share of each range within each periodo.
df_list = []

for col in cols_num:
    quantile_bins = pd.qcut(df2[col], q=3, duplicates='drop')
    df_tmp = pd.crosstab(
        index=quantile_bins,
        columns=df2['periodo'],
        normalize='columns',
    )
    # Tag the table with the variable name and expose the bin labels as a column.
    df_tmp = df_tmp.assign(VARIABLE=col, RANGO=df_tmp.index)
    df_list.append(df_tmp)

In [None]:
# Stack the per-variable tables row-wise into a single frame and preview it.
df_stability = pd.concat(objs=df_list, axis=0)
df_stability.head(n=20)

In [None]:
# Reshape to long format: one row per (VARIABLE, RANGO, periodo) with its share.
df_stability_2 = df_stability.melt(
    id_vars=['VARIABLE', 'RANGO'],
    var_name='periodo',
    value_name='PERCENT',
)
df_stability_2.head(n=20)

# Variables Categóricas

In [None]:
# Categorical variables whose distribution is tracked across periods.
cols_cat = [
    'FLG_PERFIL',
    'flg_cliente',
]

In [None]:
# For every categorical variable, compute the share of each category within
# each periodo (columns are normalized).
df_list = []

for col in cols_cat:
    df_tmp = pd.crosstab(
        index=df2[col],
        columns=df2['periodo'],
        normalize='columns',
    )
    # Tag the table with the variable name and expose the category as a column.
    df_tmp = df_tmp.assign(VARIABLE=col, RANGO=df_tmp.index)
    df_list.append(df_tmp)

In [None]:
# Stack the per-variable tables row-wise into a single frame and preview it.
df_stability = pd.concat(objs=df_list, axis=0)
df_stability.head(n=100)

In [None]:
# Reshape to long format: one row per (VARIABLE, RANGO, periodo) with its share.
df_stability_3 = df_stability.melt(
    id_vars=['VARIABLE', 'RANGO'],
    var_name='periodo',
    value_name='PERCENT',
)
df_stability_3.head(n=100)

In [None]:
# Append the categorical long table below the numeric one and preview the result.
stability_parts = [df_stability_2, df_stability_3]
df_stability_4 = pd.concat(objs=stability_parts, axis=0)
df_stability_4.head(n=200)

In [None]:
# Save the combined stability table to an Excel file, without writing the index.
df_stability_4.to_excel(r"../data/modeling/stability.xlsx",index=False)