## A. Configuraciones generales.

In [1]:
#1. Librerías.
%run "./librerias.ipynb"

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
#2. Constantes.
%run "./constantes.ipynb"

In [None]:
#3. Funciones.
%run "./funciones.ipynb"

In [None]:
#4. Lectura de datos.
data = pd.read_csv(dataset_file_clase_ternaria)

In [None]:
data.shape

In [None]:
data.head()

## B. Data Quality.

#### 1. Valores nulos.

In [12]:
#a. Evaluamos.
train_data = data[data['foto_mes'].isin(mes_train_efma)]
score_data = data[data['foto_mes'] == mes_test]

train_null_percentage = train_data.isnull().mean() * 100
score_null_percentage = score_data.isnull().mean() * 100

comparison_df = pd.DataFrame({'Train Null Percentage': train_null_percentage, 'Score Null Percentage': score_null_percentage})
comparison_df['diff'] = (comparison_df['Score Null Percentage'] - comparison_df['Train Null Percentage']).abs()

comparison_df_sorted = comparison_df.sort_values('diff', ascending=False)

comparison_df_sorted

Unnamed: 0,Train Null Percentage,Score Null Percentage,diff
clase_ternaria,0.000000,100.000000,100.000000
Master_mpagospesos,60.126416,58.459691,1.666725
Master_mconsumototal,60.126416,58.459691,1.666725
Master_cconsumos,60.126416,58.459691,1.666725
Master_cadelantosefectivo,60.126416,58.459691,1.666725
...,...,...,...
mpayroll,0.000000,0.000000,0.000000
mpayroll2,0.000000,0.000000,0.000000
cpayroll2_trx,0.000000,0.000000,0.000000
ccuenta_debitos_automaticos,0.000000,0.000000,0.000000


In [13]:
#b. Decidimos.
print("No parece haber una diferencia significativa en la cantidad de valores nulos.")

No parece haber una diferencia significativa en la cantidad de valores nulos.


#### 2. Valores 0.

In [14]:
#a. Evaluamos.
train_zero_percentage = (train_data == 0).mean() * 100
score_zero_percentage = (score_data == 0).mean() * 100

comparison_df_zero = pd.DataFrame({'Train Zero Percentage': train_zero_percentage, 'Score Zero Percentage': score_zero_percentage})

comparison_df_zero['diff_zero_percentage'] = (comparison_df_zero['Score Zero Percentage'] - comparison_df_zero['Train Zero Percentage']).abs()
diff_zero_percentage_sorted = comparison_df_zero.sort_values('diff_zero_percentage',ascending=False)

diff_zero_percentage_sorted

Unnamed: 0,Train Zero Percentage,Score Zero Percentage,diff_zero_percentage
Master_fultimo_cierre,0.000000,68.891773,68.891773
Visa_fultimo_cierre,0.000000,68.829909,68.829909
cmobile_app_trx,29.815295,23.683859,6.131436
ctransferencias_recibidas,27.867398,24.303719,3.563678
mtransferencias_recibidas,27.867398,24.303719,3.563678
...,...,...,...
tcuentas,0.000000,0.000000,0.000000
cproductos,0.000000,0.000000,0.000000
cliente_antiguedad,0.000000,0.000000,0.000000
cliente_edad,0.000000,0.000000,0.000000


In [15]:
#b. Decidimos eliminar aquellas columnas con un gran aumento en la cantidad de 0 en el mes test.
data.drop(["Master_fultimo_cierre","Visa_fultimo_cierre"],axis=1,inplace=True)

train_data.drop(["Master_fultimo_cierre","Visa_fultimo_cierre"],axis=1,inplace=True)
score_data.drop(["Master_fultimo_cierre","Visa_fultimo_cierre"],axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data.drop(["Master_fultimo_cierre","Visa_fultimo_cierre"],axis=1,inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  score_data.drop(["Master_fultimo_cierre","Visa_fultimo_cierre"],axis=1,inplace=True)


#### 3. Valores <0.

In [16]:
#a. Evaluamos.
train_negative_percentage = (train_data == -1).mean() * 100
score_negative_percentage = (score_data == -1).mean() * 100

comparison_df_zero = pd.DataFrame({'Train Negative Percentage': train_negative_percentage, 'Score Negative Percentage': score_negative_percentage})

comparison_df_zero['diff_negative_percentage'] = (comparison_df_zero['Score Negative Percentage'] - comparison_df_zero['Train Negative Percentage']).abs()
diff_negative_percentage_sorted = comparison_df_zero.sort_values('diff_negative_percentage',ascending=False)

diff_negative_percentage_sorted

Unnamed: 0,Train Negative Percentage,Score Negative Percentage,diff_negative_percentage
Master_Finiciomora,0.003678,0.000000,0.003678
mactivos_margen,0.001533,0.003033,0.001500
Visa_mpagospesos,0.000000,0.001213,0.001213
Master_msaldototal,0.004598,0.003639,0.000959
Master_msaldopesos,0.004445,0.003639,0.000806
...,...,...,...
cpayroll2_trx,0.000000,0.000000,0.000000
ccuenta_debitos_automaticos,0.000000,0.000000,0.000000
mcuenta_debitos_automaticos,0.000000,0.000000,0.000000
ctarjeta_visa_debitos_automaticos,0.000000,0.000000,0.000000


In [17]:
#b. Decidimos.
print("No parece haber una diferencia significativa en la cantidad de valores con -1.")

No parece haber una diferencia significativa en la cantidad de valores con -1.


#### 4. Valores duplicados.

In [18]:
#a. Verifico.
print("Train tiene {} valores duplicados".format(train_data.duplicated().sum()))
print("Test tiene {} valores duplicados".format(score_data.duplicated().sum()))


Train tiene 0 valores duplicados
Test tiene 0 valores duplicados


## C. Data Drifting.

In [19]:
#1. Aplicamos PSI a casi todas las variables, excepto las de no interés.
psi_results = []
for column in train_data.columns:
  if column not in ['foto_mes','numero_de_cliente','clase_ternaria']:
    train_variable = train_data[column]
    score_variable = score_data[column]
    psi_value = psi(train_variable, score_variable)
    psi_results.append({'feature': column, 'psi': psi_value})

psi_df = pd.DataFrame(psi_results)
psi_df = psi_df.sort_values('psi', ascending=False)

  result = (actual_prop - expected_prop) * np.log(actual_prop / expected_prop)
  result = (actual_prop - expected_prop) * np.log(actual_prop / expected_prop)


In [20]:
#2. Observamos variables con PSI >= 0.1 (casos a atender).
psi_df[psi_df["psi"] > 0.1]

Unnamed: 0,feature,psi
112,Master_Finiciomora,inf
133,Visa_Finiciomora,inf
50,mpayroll,0.257969
49,cpayroll_trx,0.197012


In [21]:
#3. Analizo las variables a tener en cuenta.
#i. Master_Finiciomora y Visa_Finiciomora.
variable_name = 'Master_Finiciomora'
expected = train_data[variable_name]
actual = score_data[variable_name]

expected_not_null = expected.dropna()
actual_not_null = actual.dropna()

bin_edges = pd.qcut(expected_not_null, q=10, duplicates='drop').unique()
bin_edges2 = [edge.left for edge in bin_edges] + [edge.right for edge in bin_edges]
breakpoints = sorted(list(set(bin_edges2)))

print(f'Cortes en {variable_name}: {breakpoints}')
expected_counts, _ = np.histogram(expected_not_null, bins=breakpoints)
actual_counts, _ = np.histogram(actual_not_null, bins=breakpoints)

print(f'Frecuencia Esperada: {expected_counts}')
print(f'Frecuencia Actual: {actual_counts}')


Cortes en Master_Finiciomora: [-1.001, 9.0, 16.0, 18.0, 20.0, 23.0, 46.0, 51.0, 207.0]
Frecuencia Esperada: [250 253 614 546 712 294 351 378]
Frecuencia Actual: [ 73   0   0   0  25 320  11 234]


In [22]:
#ii. cpayroll_trx.
variable_name = 'cpayroll_trx'
expected = train_data[variable_name]
actual = score_data[variable_name]

expected_not_null = expected.dropna()
actual_not_null = actual.dropna()

bin_edges = pd.qcut(expected_not_null, q=20, duplicates='drop').unique()
bin_edges2 = [edge.left for edge in bin_edges] + [edge.right for edge in bin_edges]
breakpoints = sorted(list(set(bin_edges2)))

print(f'Cortes en {variable_name}: {breakpoints}')
expected_counts, _ = np.histogram(expected_not_null, bins=breakpoints)
actual_counts, _ = np.histogram(actual_not_null, bins=breakpoints)

print(f'Frecuencia Esperada: {expected_counts}')
print(f'Frecuencia Actual: {actual_counts}')


print("\n\n No voy a tomar ninguna acción dado que el árbol suele cortar entre 1 sueldo ó menos de 1 sueldo. No me cambia la distribución.")

Cortes en cpayroll_trx: [-0.001, 1.0, 2.0, 3.0, 251.0]
Frecuencia Esperada: [303288 204451  95166  49542]
Frecuencia Actual: [73677 26686 39300 25212]


 No voy a tomar ninguna acción dado que el árbol suele cortar entre 1 sueldo ó menos de 1 sueldo. No me cambia la distribución.


## D. Ajustes por inflación.

In [23]:
#1. Valores financieros y meses
#i. Listas.
vfoto_mes = [202101, 202102, 202103, 202104, 202105, 202106]
vIPC = [0.9680542110, 0.9344152616, 0.8882274350, 0.8532444140, 0.8251880213, 0.8003763543]
vdolar_blue = [157.900000, 149.380952, 143.615385, 146.250000, 153.550000, 162.000000]
vdolar_oficial = [91.474000, 93.997778, 96.635909, 98.526000, 99.613158, 100.619048]
vUVA = [0.9669867858358365, 0.9323750098728378, 0.8958202912590305, 0.8631993702994263, 0.8253893405524657, 0.7928918905364516]
#ii. Lo junto en un dataframe.
tb_indices = pd.DataFrame({
    'IPC': vIPC,
    'dolar_blue': vdolar_blue,
    'dolar_oficial': vdolar_oficial,
    'UVA': vUVA,
    'foto_mes': vfoto_mes
})

In [31]:
#2. Defino los campos que voy a ajustar por inflación.
campos_monetarios = [col for col in data.columns if col.startswith(('m', 'Visa_m', 'Master_m', 'vm_m'))]


In [34]:
#3. Diferentes funciones y métodos para corregir el efecto de la inflación.
def drift_uva(dataset, campos_monetarios, tb_indices):
    print("inicio drift_UVA()")
    dataset = dataset.merge(tb_indices[['foto_mes', 'UVA']], on='foto_mes', how='left')
    for campo in campos_monetarios:
        dataset[campo] *= dataset['UVA']
    dataset.drop(columns=['UVA'], inplace=True)
    print("fin drift_UVA()")

def drift_deflacion(dataset, campos_monetarios, tb_indices):
    print("inicio drift_deflacion()")
    dataset = dataset.merge(tb_indices[['foto_mes', 'IPC']], on='foto_mes', how='left')
    for campo in campos_monetarios:
        dataset[campo] *= dataset['IPC']
    dataset.drop(columns=['IPC'], inplace=True)
    print("fin drift_deflacion()")

# Función para estandarizar datos
def drift_estandarizar(dataset, campos_drift):
    print("inicio drift_estandarizar()")
    for campo in campos_drift:
        dataset[campo + "_normal"] = dataset.groupby('foto_mes')[campo].transform(lambda x: (x - x.mean()) / x.std())
        dataset.drop(columns=[campo], inplace=True)
    print("fin drift_estandarizar()")

In [35]:
#4. Le aplico la inflación a todas ellas.
data = drift_deflacion(data, campos_monetarios, tb_indices)

inicio drift_deflacion()
fin drift_deflacion()


## E. Exportación.

In [39]:
#i. Exportación propiamente dicha.
data.to_csv(dataset_file_preprocesado,index=False)