In [1]:
import os
from pysiens.mlflow import get_token, log_new_run
from pydataml.gcp import files_in_bucket

import numpy as np
import pandas as pd

from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn import set_config

In [2]:
os.environ['MLFLOW_TARGET_AUDIENCE'] = '420108713496-qhfjpmgtljrfn5k2pitnb2bnc44mqk5r.apps.googleusercontent.com'
os.environ['MLFLOW_SA'] = 'sa-mlflow@uala-dataml.iam.gserviceaccount.com'
os.environ['MLFLOW_TRACKING_URI'] = 'https://mlflow.datascience.ar.ua.la/'


In [3]:
set_config(transform_output = 'pandas')

BUCKET = 'uala-dataml-templates'
SEED = 42

In [4]:
df_raw = pd.read_parquet(
    'gs://uala-arg-dataml-ualascore-cuotis-lab/enfoque_no_paccs/data/06_test_research/training_dataset.parquet', 
    engine='fastparquet')

df = (
    df_raw
    .sort_values(['id_account', 'loan_approved_date'])
    .set_index('id_account')
    .drop(columns='loan_approved_date')
    .replace({'nan' : 'None'})
)

In [5]:
cat_cols = list(df.select_dtypes(['category', 'object']).columns)

X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    df.drop(columns='mora'), 
    df['mora'], 
    test_size=.2, 
    stratify=df['mora'], 
    random_state=SEED
)

In [6]:
def read_best_params(
    csv_path: str, #Path del csv, típicamente en GCS
    ret_hpo_df: bool = False, #Si devolver el DF final de Ray
    metric: str = 'neg_log_loss', #Métrica utilizada en la optimización 
) -> dict:
    """
    Devuelve un diccionario con los HP con mejor métrica.
    Opcionalmente también el dataframe guardado.
    """
    hpo_df = pd.read_csv(csv_path)
    best_params = hpo_df.loc[hpo_df[metric].idxmax()].to_dict()
    del best_params[metric]
    del best_params['trial_id']
    if ret_hpo_df:
        return best_params, hpo_df
    return best_params

In [7]:
fib = files_in_bucket(
    project='uala-dataml',
    bucket_name=BUCKET,
    extension='csv',
    return_type='list'
    )

In [8]:
best_params, hpo_df = read_best_params(
    csv_path=f'gs://{BUCKET}/{fib[-1]}', 
    ret_hpo_df=True,
    )

In [9]:
cb = CatBoostClassifier(
        cat_features=cat_cols,
        random_seed=SEED,
        verbose=0,
        thread_count=-1,
    )

cb.set_params(**best_params)

<catboost.core.CatBoostClassifier at 0x1100ae970>

In [10]:
pipe = make_pipeline(
    SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='None'),
    SimpleImputer(missing_values=pd.NA, strategy='constant', fill_value='None'),
)

prepro = make_column_transformer(
    (pipe, cat_cols),
    remainder='passthrough',
    verbose_feature_names_out=False,
)

full_pipeline = Pipeline([
    ('preprocessor', prepro), 
    ('classifier', cb)
    ])
full_pipeline

In [11]:
full_pipeline.fit(X=X_train_raw, y=y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [12]:
y_pred_proba = full_pipeline.predict_proba(X_test_raw)
loss = log_loss(y_test, y_pred_proba)
print(f'Log Loss: {loss:.3f}')

Log Loss: 0.285


Save mlflow

In [13]:
get_token()

Auth with impersonate sa: sa-mlflow@uala-dataml.iam.gserviceaccount.com


'eyJhbGciOiJSUzI1NiIsImtpZCI6ImQyZDQ0NGNmOGM1ZTNhZTgzODZkNjZhMTNhMzE2OTc2YWEzNjk5OTEiLCJ0eXAiOiJKV1QifQ.eyJhdWQiOiI0MjAxMDg3MTM0OTYtcWhmanBtZ3RsanJmbjVrMnBpdG5iMmJuYzQ0bXFrNXIuYXBwcy5nb29nbGV1c2VyY29udGVudC5jb20iLCJhenAiOiIxMDUzODYzMjY2MDQ4NDI3MjY5MDIiLCJlbWFpbCI6InNhLW1sZmxvd0B1YWxhLWRhdGFtbC5pYW0uZ3NlcnZpY2VhY2NvdW50LmNvbSIsImVtYWlsX3ZlcmlmaWVkIjp0cnVlLCJleHAiOjE3MjQzMzc5OTgsImlhdCI6MTcyNDMzNDM5OCwiaXNzIjoiaHR0cHM6Ly9hY2NvdW50cy5nb29nbGUuY29tIiwic3ViIjoiMTA1Mzg2MzI2NjA0ODQyNzI2OTAyIn0.V00N0YvOwUSkWOLCrBgdGL-3fBzdrprlKlkRcdNeu1gF3iARu0dZ0QsC6fmiyE4p57GALycROD5sN7Am_tLMZSJJZaStKlAV_linbc4or466FB0x1YI2HgaUwF88h6NCHVP5QolTQKzCuWzT2cykFBK1sEIT5dwknj_suHgZsbY6JinhAElMjBFAuw40XT_Ag4OKAaZIsQSAG2DeqXp2QGyOGj61HYTXtFRlsifRkvf2_bfFMe03NiMsYOKV97tbG9txI_jvwl74_PN1XpVHm5xEsS7gF9KSjDQA3OHZ8x56OST7bnSkb4e8mgSnVxPbN1VLnwu-pv3IoEKEY1oYZg'

In [14]:
run_id = log_new_run(
    experiment_id=0,
    run_name='Creditos test',
    run_description='Creditos test',
    model_artifact=full_pipeline,
    metrics={'log_loss': round(loss, 3)},
    tag_tipo_modelo='clasificacion',
    tag_tipo_corrida='test',
    tag_con_modelo=True,
    tag_tuning=True,
    tag_training_dataset=False,
    tag_poblacion='v1',
    tag_calibrated=False,
    tag_features='',
    tag_prob_threshold=.5,
    tag_target='mora',
    partition='test',
    # hyperparameters=full_pipeline[-1].get_params()
)



INVALID_PARAMETER_VALUE: Param value '['ar_user_demographics__occupation', 'ar_user_demographics__marital_status', 'ar_user_demographics__province_address', 'ar_user_demographics__provider', 'ar_user_demographics__province_delivery', 'ar_user_demographics__region_address', 'ar_amplitude_' had length 390, which exceeded length limit of 250
