In [1]:
from pathlib import Path
import numpy as np
import pandas as pd

In [2]:
DATA_DIR = Path.cwd().resolve().parent / "datos"

datos_titanic = pd.read_parquet(DATA_DIR / "02_datos_con_tipo_de_dato_ajustado_titanic.parquet")   

# Selección de columnas y estructura del dataframe

In [3]:
datos_titanic.columns

Index(['pclass', 'survived', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked'],
      dtype='object')

In [4]:
datos_titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   pclass    1309 non-null   int64   
 1   survived  1309 non-null   bool    
 2   sex       1309 non-null   category
 3   age       1046 non-null   float64 
 4   sibsp     1309 non-null   int64   
 5   parch     1309 non-null   int64   
 6   fare      1308 non-null   float64 
 7   embarked  1307 non-null   category
dtypes: bool(1), category(2), float64(2), int64(3)
memory usage: 55.3 KB


In [5]:
from pycaret.classification import setup, compare_models

In [6]:
target_col = "survived"

In [9]:
clf_setup = setup(
    data = datos_titanic,
    target = target_col,
    session_id =  42,

    # Parámetros para controlar el preprocesamiento:
    numeric_features = ['age', 'fare', 'sibsp', 'parch'],
    categorical_features = ['sex', 'embarked'],
    ordinal_features = {'pclass': [1,2,3]},

    # Estrategias de imputacion
    numeric_imputation = 'mean',
    categorical_imputation = 'mode',
)

Unnamed: 0,Description,Value
0,Session id,42
1,Target,survived
2,Target type,Binary
3,Original data shape,"(1309, 8)"
4,Transformed data shape,"(1309, 10)"
5,Transformed train set shape,"(916, 10)"
6,Transformed test set shape,"(393, 10)"
7,Ordinal features,1
8,Numeric features,4
9,Categorical features,2


In [10]:
datos_titanic['survived'].value_counts()

False    809
True     500
Name: survived, dtype: int64

In [11]:
best_model = compare_models(
    include=['lr', 'rf', 'gbc', 'lightgbm'],
    sort='Accuracy',
    n_select=4
)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.7957,0.8295,0.7143,0.7437,0.7267,0.5641,0.5663,0.496
rf,Random Forest Classifier,0.7935,0.8327,0.7057,0.7447,0.7221,0.5584,0.5613,0.154
gbc,Gradient Boosting Classifier,0.7924,0.8467,0.6571,0.7645,0.7057,0.5473,0.5517,0.01
lr,Logistic Regression,0.7739,0.8243,0.6686,0.7209,0.6906,0.5137,0.5171,0.842


In [12]:
from pycaret.classification import create_model, tune_model, predict_model

print("Creando el modelo lightgbm...")
gbc_model = create_model('lightgbm')

print("\n")
print("------")
print("\n")

print("Realizando fine-tuning del modelo....")
gbc_tuned = tune_model(gbc_model)

print("\n")
print("------")
print("\n")

print("Resumen del modelo")
print(gbc_tuned)

print("\n")
print("------")
print("\n")

print("Predicciones en el conjunto/set de test")
predicciones_internas = predict_model(gbc_tuned)

print("\n")
print("------")
print("\n")


print(predicciones_internas.head())

Creando el modelo lightgbm...


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8043,0.8155,0.7143,0.7576,0.7353,0.5803,0.581
1,0.7609,0.8591,0.7714,0.6585,0.7105,0.509,0.5136
2,0.7935,0.8667,0.6571,0.7667,0.7077,0.5495,0.5534
3,0.8696,0.9118,0.8286,0.8286,0.8286,0.7233,0.7233
4,0.8261,0.8774,0.8286,0.7436,0.7838,0.639,0.6417
5,0.8261,0.8837,0.7143,0.8065,0.7576,0.6228,0.6255
6,0.8132,0.85,0.7429,0.7647,0.7536,0.6032,0.6034
7,0.7363,0.7505,0.5714,0.6897,0.625,0.4244,0.4288
8,0.7253,0.7439,0.6286,0.6471,0.6377,0.4165,0.4166
9,0.8022,0.736,0.6857,0.7742,0.7273,0.573,0.5756




------


Realizando fine-tuning del modelo....


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7826,0.8188,0.6286,0.7586,0.6875,0.5231,0.5285
1,0.8152,0.8932,0.7429,0.7647,0.7536,0.6058,0.606
2,0.8261,0.8702,0.7429,0.7879,0.7647,0.627,0.6276
3,0.8587,0.9381,0.8,0.8235,0.8116,0.6986,0.6988
4,0.8478,0.9108,0.8286,0.7838,0.8056,0.6807,0.6814
5,0.8261,0.8732,0.6857,0.8276,0.75,0.6185,0.6249
6,0.8022,0.8561,0.6857,0.7742,0.7273,0.573,0.5756
7,0.7143,0.8092,0.4857,0.68,0.5667,0.3623,0.3737
8,0.7253,0.7837,0.6,0.6562,0.6269,0.4102,0.4112
9,0.7582,0.7543,0.5429,0.76,0.6333,0.4604,0.4749


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


------


Resumen del modelo
LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=42, reg_alpha=0.0, reg_lambda=0.0, subsample=1.0,
               subsample_for_bin=200000, subsample_freq=0)


------


Predicciones en el conjunto/set de test


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Light Gradient Boosting Machine,0.799,0.8578,0.74,0.7351,0.7375,0.5747,0.5747




------


      pclass     sex   age  sibsp  parch       fare embarked  survived  \
1289       3    male  21.0      1      0   6.495800        S     False   
352        2  female  40.0      1      1  39.000000        S      True   
1189       3  female   4.0      1      1  16.700001        S      True   
78         1  female  64.0      0      2  83.158302        C      True   
974        3    male  30.0      1      0  16.100000        S     False   

      prediction_label  prediction_score  
1289                 0            0.8640  
352                  1            0.9968  
1189                 1            0.7978  
78                   1            0.9938  
974                  0            0.9308  


In [13]:
print(gbc_tuned.get_params())

{'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 1.0, 'importance_type': 'split', 'learning_rate': 0.1, 'max_depth': -1, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 100, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': 42, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'subsample': 1.0, 'subsample_for_bin': 200000, 'subsample_freq': 0}


In [14]:
np.random.seed(57)


n_samples = 50

df_synthtic_test = pd.DataFrame({
    'pclass': np.random.choice([1, 2, 3], size=n_samples),
    'sex': np.random.choice(['male', 'female'], size=n_samples),
    'age': np.random.uniform(0, 80, size=n_samples),
    'sibsp': np.random.randint(0, 10, size=n_samples),
    'parch': np.random.randint(0, 4, size=n_samples),
    'fare': np.random.uniform(10, 2000, size=n_samples),
    'embarked': np.random.choice(['C', 'Q', 'S'], size=n_samples)
})

df_synthtic_test.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked
0,3,male,29.549676,6,0,1535.239303,Q
1,2,male,22.782733,6,2,913.722493,C
2,3,male,74.169372,3,3,277.143499,C
3,1,male,62.041734,4,0,527.476471,Q
4,3,male,51.305236,2,1,717.351105,S


In [16]:
prediccines_sinteticas = predict_model(gbc_tuned, data=df_synthtic_test)

prediccines_sinteticas.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked,prediction_label,prediction_score
0,3,male,29.549675,6,0,1535.239258,Q,0,0.9823
1,2,male,22.782732,6,2,913.722473,C,0,0.9756
2,3,male,74.169373,3,3,277.143494,C,0,0.9869
3,1,male,62.041733,4,0,527.476501,Q,0,0.9802
4,3,male,51.305237,2,1,717.351074,S,0,0.9744


# Guardar el modelo

In [17]:
from pycaret.classification import save_model

In [18]:
MODELS_DIR = Path.cwd().resolve().parent / "modelos"
MODELS_DIR.mkdir(exist_ok=True)

model_path = MODELS_DIR / "lightgbm_tuned_model"

save_model(gbc_tuned, model_path)

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['age', 'fare', 'sibsp', 'parch'],
                                     transformer=SimpleImputer(add_indicator=False,
                                                               copy=True,
                                                               fill_value=None,
                                                               keep_empty_features=False,
                                                               missing_values=nan,
                                                               strategy='mean'))),
                 ('categorical_imputer',
                  TransformerWrapper(exclude=None, include=['sex', 'embark...
                  LGBMClassifier(boosting_type='gbdt', class_weight=None,
                                 colsample_bytree=1.0, importance_type='split',
                       