In [1]:
from pathlib import Path
import numpy as np
import pandas as pd

In [2]:
DATA_DIR = Path.cwd().resolve().parent / "datos"

datos_titanic = pd.read_parquet(DATA_DIR / "02_datos_con_tipo_de_dato_ajustado_titanic.parquet")   

In [3]:
cols_categorics = ['sex', 'embarked']

cols_numerics = ['age', 'fare', 'sibsp', 'parch']

cols_categorics_ord= ['pclass']

# Importar herramientas de preprocesamiento

In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

Vamos a crear dos pipelines de preprocesamiento para evaluar el impacto de dos enfoques diferentes. Estos son:

1- Enfoque 1: imputar con la mediana y NO escalar

2- Enfoque 2: imputar con la media y USAR escalado

In [6]:
numeric_pipeline_median = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    #('scaler', StandardScaler())  # No escalamos en este enfoque
])

numeric_pipeline_mean_scaled = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())  # Escalamos en este enfoque
])

categorical_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

categorical_ord_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder())
])

In [7]:
preprocessor_median = ColumnTransformer(transformers=[
    ('num', numeric_pipeline_median, cols_numerics),
    ('cat', categorical_pipe, cols_categorics),
    ('cat_ord', categorical_ord_pipe, cols_categorics_ord)
])

preprocessor_median

In [8]:
preprocessor_mean_scale = ColumnTransformer(transformers=[
    ('num', numeric_pipeline_mean_scaled, cols_numerics),
    ('cat', categorical_pipe, cols_categorics),
    ('cat_ord', categorical_ord_pipe, cols_categorics_ord)
])

preprocessor_mean_scale

# Dividir train/test

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
X_features = datos_titanic.drop(columns=['survived'])
y_target = datos_titanic['survived']

X_train, X_test, y_train, y_test = train_test_split(
    X_features, 
    y_target, 
    test_size=0.2, 
    random_state=42
)   

In [12]:
preprocessor_mean_scale.fit(X_test)

In [13]:
feature_names = preprocessor_mean_scale.get_feature_names_out()

x_test_transformed = preprocessor_mean_scale.transform(X_test)

In [15]:
feature_names

array(['num__age', 'num__fare', 'num__sibsp', 'num__parch',
       'cat__sex_female', 'cat__sex_male', 'cat__embarked_C',
       'cat__embarked_Q', 'cat__embarked_S', 'cat_ord__pclass'],
      dtype=object)

In [14]:
x_test_transformed

array([[ 2.79613739e-01, -4.88249555e-01, -4.11053668e-01, ...,
         0.00000000e+00,  1.00000000e+00,  2.00000000e+00],
       [-8.54501103e-01, -3.21072410e-01,  5.86132082e-01, ...,
         0.00000000e+00,  0.00000000e+00,  2.00000000e+00],
       [ 2.68612354e-16, -4.73294857e-01, -4.11053668e-01, ...,
         0.00000000e+00,  1.00000000e+00,  2.00000000e+00],
       ...,
       [-7.78893447e-01, -4.78063753e-01, -4.11053668e-01, ...,
         0.00000000e+00,  1.00000000e+00,  2.00000000e+00],
       [ 1.41372858e+00,  3.47693769e+00,  5.86132082e-01, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [-5.52070478e-01,  3.33971671e-01, -4.11053668e-01, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00]])

In [16]:
x_test_tranformed = pd.DataFrame(x_test_transformed, columns=feature_names)

x_test_tranformed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 262 entries, 0 to 261
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   num__age         262 non-null    float64
 1   num__fare        262 non-null    float64
 2   num__sibsp       262 non-null    float64
 3   num__parch       262 non-null    float64
 4   cat__sex_female  262 non-null    float64
 5   cat__sex_male    262 non-null    float64
 6   cat__embarked_C  262 non-null    float64
 7   cat__embarked_Q  262 non-null    float64
 8   cat__embarked_S  262 non-null    float64
 9   cat_ord__pclass  262 non-null    float64
dtypes: float64(10)
memory usage: 20.6 KB


In [18]:
x_test_tranformed

Unnamed: 0,num__age,num__fare,num__sibsp,num__parch,cat__sex_female,cat__sex_male,cat__embarked_C,cat__embarked_Q,cat__embarked_S,cat_ord__pclass
0,2.796137e-01,-0.488250,-0.411054,-0.464832,0.0,1.0,0.0,0.0,1.0,2.0
1,-8.545011e-01,-0.321072,0.586132,0.888346,0.0,1.0,1.0,0.0,0.0,2.0
2,2.686124e-16,-0.473295,-0.411054,-0.464832,0.0,1.0,0.0,0.0,1.0,2.0
3,2.686124e-16,-0.470303,-0.411054,-0.464832,0.0,1.0,0.0,0.0,1.0,2.0
4,2.686124e-16,-0.476124,-0.411054,-0.464832,0.0,1.0,0.0,1.0,0.0,2.0
...,...,...,...,...,...,...,...,...,...,...
257,-2.496399e-01,-0.381056,-0.411054,-0.464832,1.0,0.0,0.0,0.0,1.0,1.0
258,-6.276781e-01,0.602762,-0.411054,0.888346,0.0,1.0,1.0,0.0,0.0,0.0
259,-7.788934e-01,-0.478064,-0.411054,-0.464832,1.0,0.0,0.0,0.0,1.0,2.0
260,1.413729e+00,3.476938,0.586132,0.888346,0.0,1.0,1.0,0.0,0.0,0.0


# Modelos

* Regresión logística
* Random forest - Bosques aleatorios

In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [26]:
def resumen_clasificacion(y_test, y_pred):
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    return {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
        }

In [41]:
modelos = {
    'Regresión Logística': LogisticRegression(random_state=42, max_iter=1000),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42)
}

In [42]:
pipelines = {}

for modelo_nombre, modelo in modelos.items():
    # Pipeline con imputación de la median - sin escalado
    pipelines[f"{modelo_nombre}_median"] = Pipeline(steps=[
        ('preprocessor', preprocessor_median),
        ('classifier', modelo)
    ])

    # Pipeline con imputación de la media y escalado
    pipelines[f"{modelo_nombre}_mean_scaled"] = Pipeline(steps=[
        ('preprocessor', preprocessor_mean_scale),
        ('classifier', modelo)
    ])

In [43]:
pipelines

{'Regresión Logística_median': Pipeline(steps=[('preprocessor',
                  ColumnTransformer(transformers=[('num',
                                                   Pipeline(steps=[('imputer',
                                                                    SimpleImputer(strategy='median'))]),
                                                   ['age', 'fare', 'sibsp',
                                                    'parch']),
                                                  ('cat',
                                                   Pipeline(steps=[('imputer',
                                                                    SimpleImputer(strategy='most_frequent')),
                                                                   ('onehot',
                                                                    OneHotEncoder(handle_unknown='ignore'))]),
                                                   ['sex', 'embarked']),
                                              

# Entrenamiento y evaluación de cada pipeline

In [44]:
resultados = {}

for nombre_pipeline, pipeline in pipelines.items():
    # Entrenamiento
    pipeline.fit(X_train, y_train)
    
    # Predicción
    y_pred = pipeline.predict(X_test)
    
    # Guardar resultados
    resultados[nombre_pipeline] = resumen_clasificacion(y_test, y_pred)

In [45]:
import pprint

print("Resumen de las métricas para casa Pipeline:")

pprint.pprint(resultados)

Resumen de las métricas para casa Pipeline:
{'Random Forest_mean_scaled': {'Accuracy': 0.7938931297709924,
                               'F1 Score': 0.75,
                               'Precision': 0.826530612244898,
                               'Recall': 0.6864406779661016},
 'Random Forest_median': {'Accuracy': 0.7824427480916031,
                          'F1 Score': 0.7348837209302326,
                          'Precision': 0.8144329896907216,
                          'Recall': 0.6694915254237288},
 'Regresión Logística_mean_scaled': {'Accuracy': 0.7709923664122137,
                                     'F1 Score': 0.7169811320754716,
                                     'Precision': 0.8085106382978723,
                                     'Recall': 0.6440677966101694},
 'Regresión Logística_median': {'Accuracy': 0.7748091603053435,
                                'F1 Score': 0.7203791469194313,
                                'Precision': 0.8172043010752689,
                  

In [46]:
df_resultados = pd.DataFrame(resultados).T.sort_values(by='F1 Score', ascending=False)


df_resultados

Unnamed: 0,Accuracy,Precision,Recall,F1 Score
Random Forest_mean_scaled,0.793893,0.826531,0.686441,0.75
Random Forest_median,0.782443,0.814433,0.669492,0.734884
Regresión Logística_median,0.774809,0.817204,0.644068,0.720379
Regresión Logística_mean_scaled,0.770992,0.808511,0.644068,0.716981


# Validación cruzada con 5 folds y visualización de resultados usando plotly

In [47]:
from sklearn.model_selection import cross_val_score

In [52]:
df_cv_results = pd.DataFrame(columns = ['pipeline', 'fold', 'accuracy'])

cv_fold = 10

In [53]:
for pipeline_name, pipeline_obj in pipelines.items():
    cv_scores = cross_val_score(pipeline_obj, X_train, y_train, cv=cv_fold, scoring='accuracy')
    
    temp_df = pd.DataFrame({
        'pipeline': [pipeline_name]*cv_fold,
        'fold': list(range(1, cv_fold+1)),
        'accuracy': cv_scores
    })

    df_cv_results = pd.concat([df_cv_results, temp_df], ignore_index=True)

In [54]:
df_cv_results.head(40)

Unnamed: 0,pipeline,fold,accuracy
0,Regresión Logística_median,1,0.847619
1,Regresión Logística_median,2,0.8
2,Regresión Logística_median,3,0.742857
3,Regresión Logística_median,4,0.838095
4,Regresión Logística_median,5,0.771429
5,Regresión Logística_median,6,0.847619
6,Regresión Logística_median,7,0.8
7,Regresión Logística_median,8,0.759615
8,Regresión Logística_median,9,0.807692
9,Regresión Logística_median,10,0.721154


In [55]:
import plotly.express as px

fig = px.box(
    df_cv_results, 
    x='pipeline', 
    y='accuracy', 
    points = 'all',
    title='Distribución de Accuracy por Pipeline en Validación Cruzada',
    labels={'accuracy': 'Accuracy', 'pipeline': 'Pipeline'}
)

fig.show()

In [57]:
from sklearn.model_selection import GridSearchCV

In [58]:
pipeline_rf_mean_scaled = Pipeline(steps=[
    ('preprocessor', preprocessor_mean_scale),
    ('classifier', RandomForestClassifier(random_state=42))
])

In [60]:
param_grid = {
    'classifier__n_estimators': [50, 100, 200], # número de árboles
    'classifier__max_depth': [None, 10, 20], # profundidad máxima de los árboles
    'classifier__min_samples_split': [2, 5, 10] # número mínimo de muestras para dividir un nodo
}

In [61]:
grid_search = GridSearchCV(
    estimator=pipeline_rf_mean_scaled,
    param_grid=param_grid,
    cv=10,
    scoring='accuracy',
    n_jobs=-1, # Usar todos los núcleos disponibles
)

In [62]:
grid_search.fit(X_train, y_train)

In [63]:
grid_search.best_params_

{'classifier__max_depth': 10,
 'classifier__min_samples_split': 10,
 'classifier__n_estimators': 50}

In [64]:
grid_search.best_score_

0.8232234432234431

In [67]:
DATA_DIR

PosixPath('/Users/david.palacio/Documents/academia/titanic-prediccion/modelos')

In [66]:
import joblib

best_model = grid_search.best_estimator_

DATA_DIR = Path.cwd().resolve().parent / "modelos"

joblib.dump(best_model, DATA_DIR / "randomforest_best_model.joblib")

['/Users/david.palacio/Documents/academia/titanic-prediccion/modelos/randomforest_best_model.joblib']

# CARGAR EL MODELO Y USO EN DATOS NO VISTOS

In [68]:
loaded_model = joblib.load(DATA_DIR / "randomforest_best_model.joblib")

In [69]:
type(loaded_model)

sklearn.pipeline.Pipeline

In [70]:
np.random.seed(57)


n_samples = 50

df_synthetic_test = pd.DataFrame({
    'pclass': np.random.choice([1, 2, 3], size=n_samples),
    'sex': np.random.choice(['male', 'female'], size=n_samples),
    'age': np.random.uniform(0, 80, size=n_samples),
    'sibsp': np.random.randint(0, 10, size=n_samples),
    'parch': np.random.randint(0, 4, size=n_samples),
    'fare': np.random.uniform(10, 2000, size=n_samples),
    'embarked': np.random.choice(['C', 'Q', 'S'], size=n_samples)
})

df_synthetic_test.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked
0,3,male,29.549676,6,0,1535.239303,Q
1,2,male,22.782733,6,2,913.722493,C
2,3,male,74.169372,3,3,277.143499,C
3,1,male,62.041734,4,0,527.476471,Q
4,3,male,51.305236,2,1,717.351105,S


In [71]:
y_pred_sintetic = loaded_model.predict(df_synthetic_test)

In [72]:
y_pred_sintetic

array([False, False, False, False, False, False, False, False,  True,
       False,  True, False, False, False, False,  True, False,  True,
        True, False,  True, False,  True, False, False, False,  True,
       False, False, False, False, False,  True, False,  True,  True,
       False,  True,  True, False,  True, False, False,  True,  True,
       False,  True,  True, False, False])

In [74]:
df_predictions = df_synthetic_test.copy()

df_predictions['¿Sobrevivió?'] = y_pred_sintetic

df_predictions.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked,¿Sobrevivió?
0,3,male,29.549676,6,0,1535.239303,Q,False
1,2,male,22.782733,6,2,913.722493,C,False
2,3,male,74.169372,3,3,277.143499,C,False
3,1,male,62.041734,4,0,527.476471,Q,False
4,3,male,51.305236,2,1,717.351105,S,False
