In [1]:
# Make the project's `src` directory (a sibling of the current working
# directory) importable so the local modules can be imported in later cells.
import sys
from pathlib import Path

src_path = str(Path.cwd().parent / 'src')
# Guard the append so re-running this cell does not add duplicate entries.
if src_path not in sys.path:
    sys.path.append(src_path)

In [2]:
# Import utils
from utilities import save_metrics, save_model_artifact, save_training_dataset_metadata
from utilities import read_datasets, split_xy, indices_in_list
from utilities.ml_pipeline import build_ml_pipeline, fit_ml_pipeline, evaluate_model, calculate_feature_importance

In [3]:
# Import constants
from feature_constants import NUM_COLS, OHE_COLS
from feature_constants import AGE_COL, NSE_COL, RCC_COL, DES_CONO_COL, COMBO_COL
from feature_constants import NSE_CATEGORY_ORDER, RCC_CATEGORY_ORDER, LIMA_PROV_CATEGORY_ORDER

In [4]:
# Import transformers
from transformers import create_model, build_preprocessing_transformer

In [5]:
# Run configuration: dataset locations, model hyperparameters and label column.
# The three dataset paths share one bucket prefix, kept in a single place.
_DATASETS_ROOT = "gs://ue4_ndlk_nonprod_stg_gcs_iadev_artfsto/tmp/score_venta_ami/propension/datasets"
TRAIN_PATH = f"{_DATASETS_ROOT}/train"
VALID_PATH = f"{_DATASETS_ROOT}/valid"
TEST_PATH = f"{_DATASETS_ROOT}/test"

# Hyperparameters handed to create_model
HPARAMS = {
    "n_estimators": 1500,
    "objective": "binary",
    "learning_rate": 0.01,
    "max_depth": 3,
}

# Name of the label column used when splitting the dataframes
LABEL = "per_propensionvn_ami_cls"

In [6]:
# Load the train, validation and test dataframes from the configured paths.
# NOTE(review): the saved output of this cell shows a warning snippet pointing at
# `pd.read_csv(train_path)`; the warning text itself is not preserved — re-run and confirm.
df_train, df_valid, df_test = read_datasets(TRAIN_PATH, VALID_PATH, TEST_PATH)

  df_train = pd.read_csv(train_path)


In [7]:
# Split every dataframe into its X / y pair using the LABEL column,
# in train, valid, test order.
(X_train, y_train), (X_valid, y_valid), (X_test, y_test) = (
    split_xy(partition, LABEL) for partition in (df_train, df_valid, df_test)
)

In [8]:
# Column names of the training features, in dataframe order
col_list = list(X_train.columns)

In [9]:
# Column constants for each feature group, keyed by the group name that the
# preprocessing transformer builder receives.
_group_columns = {
    "numerical": NUM_COLS,
    "age": AGE_COL,
    "nse": NSE_COL,
    "rcc": RCC_COL,
    "cono_agrup": DES_CONO_COL,
    "lima_prov": OHE_COLS,
    "products": COMBO_COL,
}

# Positions of each group's columns inside `col_list`
columns_indices = {
    group: indices_in_list(columns, col_list)
    for group, columns in _group_columns.items()
}

# Custom category orderings; each ordering is wrapped in a one-element list
# (presumably one ordering per encoded column — confirm in build_preprocessing_transformer).
categories_order = dict(
    nse=[NSE_CATEGORY_ORDER],
    rcc=[RCC_CATEGORY_ORDER],
    lima_prov=[LIMA_PROV_CATEGORY_ORDER],
)

In [10]:
# Show the column positions resolved for each feature group
columns_indices

{'numerical': [2, 3, 4, 5, 6, 7, 8, 10, 11, 14, 16, 17, 18, 19],
 'age': [13],
 'nse': [12],
 'rcc': [15],
 'cono_agrup': [21],
 'lima_prov': [20],
 'products': [9]}

In [11]:
# Show the category orderings that are passed to the preprocessing transformer builder
categories_order

{'nse': [['OTRO', 'E', 'D', 'C2', 'C1', 'B2', 'B1', 'A2', 'A1']],
 'rcc': [['OTRO',
   'PERDIDA',
   'DUDOSO',
   'DEFICIENTE',
   'CON PROBLEMAS POTENCIALES',
   'NORMAL']],
 'lima_prov': [['OTRO', 'LIMA-CALLAO', 'PROVINCIA']]}

In [12]:
# Build the preprocessing transformer from the column positions and category orderings
preprocesser = build_preprocessing_transformer(columns_indices, categories_order)

In [13]:
# Display the preprocessing transformer
preprocesser

In [14]:
# Create the model from the hyperparameters in HPARAMS.
# NOTE(review): presumably a LightGBM estimator, since it is later passed to
# lgb.plot_importance / lgb.plot_metric — confirm in transformers.create_model.
lgb_model = create_model(HPARAMS)

In [15]:
# Display the model
lgb_model

In [16]:
# Combine the preprocessing transformer and the model into one pipeline; later cells
# access its steps by the keys "feature_engineering" and "train_model".
pipeline = build_ml_pipeline(preprocesser, lgb_model)

In [17]:
# Display the pipeline
pipeline

In [18]:
# Fit only the "feature_engineering" step on the training features and keep the
# transformed output so it can be inspected in the following cells.
# NOTE(review): this fits the step before fit_ml_pipeline runs; whether that call
# refits it is not visible here — confirm.
X_train_transformed = pipeline["feature_engineering"].fit_transform(X_train)

In [19]:
# Count how often each value of `des_combo_productos` occurs in the training features
X_train['des_combo_productos'].value_counts()

des_combo_productos
DESGRAVAMEN TC BBVA CREDITO                                                                                                                                                                                                                           7209
EPS                                                                                                                                                                                                                                                   6155
SOAT                                                                                                                                                                                                                                                  4246
ROBO TARJETA BBVA                                                                                                                                                                                                                  

In [20]:
# Inspect the transformed training matrix.
# NOTE(review): the saved output shows NaN in the last column for some rows —
# confirm this is the intended result of the preprocessing step.
X_train_transformed

array([[ 0.,  0.,  0., ...,  0.,  0., nan],
       [ 0.,  0.,  0., ...,  0.,  0., nan],
       [ 0.,  0.,  0., ...,  0.,  0., nan],
       ...,
       [ 1., 10.,  1., ...,  0.,  1.,  1.],
       [ 1., 11.,  1., ...,  1.,  0.,  1.],
       [ 1.,  8.,  0., ...,  0.,  1.,  0.]])

In [21]:
import numpy as np

# Distinct values of the last transformed column together with their counts.
# NOTE(review): in the saved output NaN is the most frequent value (126415 rows);
# presumably this column is the DESGRAVAMEN product flag — confirm the NaNs are expected.
last_column = X_train_transformed[:, -1]
np.unique(last_column, return_counts=True)

(array([ 0.,  1., nan]), array([ 38086,  29904, 126415]))

In [22]:
# Train the pipeline on the training split, also passing the validation split and "auc"
# (presumably the metric monitored for early stopping — confirm in fit_ml_pipeline).
fit_ml_pipeline(pipeline, X_train, y_train, X_valid, y_valid, "auc")

Training until validation scores don't improve for 50 rounds
[100]	train's auc: 0.716285	train's binary_logloss: 0.179551	valid's auc: 0.703099	valid's binary_logloss: 0.179304
[200]	train's auc: 0.721391	train's binary_logloss: 0.176695	valid's auc: 0.713379	valid's binary_logloss: 0.176508
[300]	train's auc: 0.724592	train's binary_logloss: 0.175516	valid's auc: 0.719081	valid's binary_logloss: 0.175143
[400]	train's auc: 0.726919	train's binary_logloss: 0.174903	valid's auc: 0.723412	valid's binary_logloss: 0.174411
[500]	train's auc: 0.728871	train's binary_logloss: 0.174496	valid's auc: 0.725538	valid's binary_logloss: 0.174036
[600]	train's auc: 0.730491	train's binary_logloss: 0.17418	valid's auc: 0.727995	valid's binary_logloss: 0.173677
[700]	train's auc: 0.73186	train's binary_logloss: 0.173922	valid's auc: 0.729504	valid's binary_logloss: 0.173462
[800]	train's auc: 0.733049	train's binary_logloss: 0.173702	valid's auc: 0.730638	valid's binary_logloss: 0.173321
[900]	train's

In [23]:
# Names of the output features produced by the "feature_engineering" step
pipeline["feature_engineering"].get_feature_names_out()

array(['impute_numerical__ind_cliente', 'impute_numerical__ctd_productos',
       'impute_numerical__flg_eps', 'impute_numerical__flg_rrgg',
       'impute_numerical__flg_vida', 'impute_numerical__flg_vehi',
       'impute_numerical__mto_prima_contable_usd',
       'impute_numerical__flg_fue_cliente_rimac',
       'impute_numerical__val_scoring_ingreso',
       'impute_numerical__mto_max_linea_tc',
       'impute_numerical__mto_saldo_tc_sbs',
       'impute_numerical__mto_saldo_sbs',
       'impute_numerical__flg_tiene_vehiculo',
       'impute_numerical__flg_escliente', 'impute_age__num_edad',
       'nse_encoding__nse', 'rcc_encoding__cal_gral',
       'cono_agrup_encoding__des_cono_agrup_nuevo',
       'lima_prov_encoding__des_lima_prov_LIMA-CALLAO',
       'lima_prov_encoding__des_lima_prov_PROVINCIA',
       'flag_desgravamen__des_combo_productos_DESGRAVAMEN'], dtype=object)

In [24]:
# Evaluate the trained pipeline on the test split
metrics = evaluate_model(pipeline, X_test, y_test)

In [25]:
# Display the evaluation metrics
metrics

{'problemType': 'classification',
 'auPrc': 0.14596800697155515,
 'auRoc': 0.7786845331819002,
 'logLoss': 0.1411081113213628}

In [26]:
# Compute the feature importances from the trained pipeline
feature_importance = calculate_feature_importance(pipeline)

In [27]:
# Display the feature importances (the saved output lists them from highest to lowest)
feature_importance

{'num_edad': 2317,
 'val_scoring_ingreso': 1569,
 'mto_prima_contable_usd': 829,
 'des_cono_agrup_nuevo': 740,
 'mto_saldo_sbs': 687,
 'mto_max_linea_tc': 639,
 'mto_saldo_tc_sbs': 617,
 'nse': 597,
 'ctd_productos': 162,
 'flg_fue_cliente_rimac': 130,
 'flg_tiene_vehiculo': 122,
 'ind_cliente': 97,
 'flg_rrgg': 77,
 'flg_eps': 70,
 'des_combo_productos_DESGRAVAMEN': 66,
 'flg_vehi': 65,
 'cal_gral': 54,
 'flg_vida': 51,
 'des_lima_prov_LIMA-CALLAO': 34,
 'flg_escliente': 0,
 'des_lima_prov_PROVINCIA': 0}

In [None]:
# Plot the feature importances of the pipeline's "train_model" step with LightGBM's helper
import lightgbm as lgb

lgb.plot_importance(pipeline['train_model'])

In [None]:
# Plot the "auc" metric for the pipeline's "train_model" step
lgb.plot_metric(pipeline['train_model'], metric='auc')

In [None]:
# Output locations: the artifacts directory is `model_artifacts`, a sibling of the
# current working directory, and the metrics file lives inside it.
model_artifact_path = Path.cwd().parent.joinpath("model_artifacts")
metrics_filename = model_artifact_path.joinpath("metrics")

In [None]:
# Persist the trained pipeline, passing the artifact directory as a string path
save_model_artifact(model_artifact=pipeline, model_path=str(model_artifact_path))

In [None]:
# Persist the evaluation metrics, passing the metrics file location as a string path
save_metrics(metrics, str(metrics_filename))