# **TITANIC - TensorFlow con KERAS**

---

### **01 - Importar librerías, configurar seeds**

In [1]:
# Carga de librerías generales
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Para splits y métricas
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Para imputación con árboles
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.preprocessing import LabelEncoder

# Keras
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense

# Fijar semillas para reproducibilidad
import random
tf.random.set_seed(1)
np.random.seed(1)
random.seed(1)

import warnings
warnings.filterwarnings("ignore")  # Para ocultar warnings

print("Librerías importadas y semillas configuradas.")


2025-01-15 09:38:27.505789: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Librerías importadas y semillas configuradas.


---

### **02 - Carga de datos y unificación**

In [2]:
# Carga de train y test
train = pd.read_csv("data/titanic/train.csv", sep=",")
test  = pd.read_csv("data/titanic/test.csv", sep=",")

print("Dimensiones de train:", train.shape)
print("Dimensiones de test: ", test.shape)

# Añadir la columna 'Survived' a test para unificar datos
test['Survived'] = np.nan

# Combinar datasets
full_data = pd.concat([train, test], ignore_index=True, axis=0)
print("Dimensiones de full_data (train + test unificados):", full_data.shape)

display(train.head(3))
display(test.head(3))
full_data.head(3)


Dimensiones de train: (891, 12)
Dimensiones de test:  (418, 11)
Dimensiones de full_data (train + test unificados): (1309, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


---

### **03 - Feature Engineering**

In [None]:
# =====================================================================
# 1) Título de 'Name'
# =====================================================================
full_data['Title'] = full_data['Name'].apply(lambda x: x.split(',')[1].split('.')[0].strip())

# Agrupar títulos poco comunes
rare_titles = ['Lady','Countess','Capt','Col','Don','Major','Rev','Sir','Jonkheer','Dona']
full_data['Title'] = full_data['Title'].replace(rare_titles, 'Rare')
full_data['Title'] = full_data['Title'].replace({'Mlle':'Miss', 'Ms':'Miss', 'Mme':'Mrs'})

full_data['Title'] = full_data['Title'].astype('category')


# =====================================================================
# 2) FamilySize y FamilyType
# =====================================================================
full_data['FamilySize'] = full_data['SibSp'] + full_data['Parch'] + 1
full_data['IsAlone'] = (full_data['FamilySize'] == 1).astype(int)

def family_type(x):
    if x == 1:
        return 'Solo'
    elif x <= 4:
        return 'Small'
    else:
        return 'Large'
full_data['FamilyType'] = full_data['FamilySize'].apply(family_type).astype('category')


# =====================================================================
# 3) Deck (opcional) a partir de 'Cabin'
# =====================================================================
full_data['Cabin'] = full_data['Cabin'].fillna("U")  # temporal
full_data['Deck'] = full_data['Cabin'].map(lambda x: str(x)[0])
# full_data.loc[full_data['Deck'] == 'U', 'Deck'] = np.nan
full_data['Deck'] = full_data['Deck'].astype('category')


# =====================================================================
# 4) Extraer otras features
# =====================================================================
full_data['NameLength'] = full_data['Name'].apply(len)


# =====================================================================
# 5) Imputar Embarked (2 nulos) con DecisionTreeClassifier
# =====================================================================

embarked_null = full_data[full_data['Embarked'].isnull()].copy()
embarked_not_null = full_data[full_data['Embarked'].notnull()].copy()

# Variables para predecir 'Embarked'
features_embarked = ['Pclass','Sex','Fare','Age','Title','FamilySize']

# Codificamos 'Sex' y 'Title' en embarked_not_null
label_encoders = {}

for col in ['Sex','Title']:
    le = LabelEncoder()
    embarked_not_null[col] = embarked_not_null[col].astype(str)
    le.fit(embarked_not_null[col])
    embarked_not_null[col] = le.transform(embarked_not_null[col])
    label_encoders[col] = le

tree_embarked = DecisionTreeClassifier(random_state=123)
tree_embarked.fit(embarked_not_null[features_embarked], embarked_not_null['Embarked'])

# Ahora transformamos embarked_null
for col in ['Sex','Title']:
    embarked_null[col] = embarked_null[col].astype(str)
    embarked_null[col] = label_encoders[col].transform(embarked_null[col])

pred_embarked = tree_embarked.predict(embarked_null[features_embarked])
full_data.loc[full_data['Embarked'].isnull(), 'Embarked'] = pred_embarked

full_data['Embarked'] = full_data['Embarked'].astype('category')


# =====================================================================
# 6) Imputar Fare 
# =====================================================================
full_data['Fare'] = full_data['Fare'].fillna(full_data['Fare'].median())


# =====================================================================
# 7) Imputar Age con un árbol de regresión (más complejo que la mediana)
#    -> Para mayor exactitud, usamos variables: Pclass, Sex, SibSp, Parch,
#       Fare, Embarked, Title, FamilySize, ...
# =====================================================================

age_null = full_data[full_data['Age'].isnull()].copy()
age_not_null = full_data[full_data['Age'].notnull()].copy()

# Variables para predecir Age
features_age = [
    'Pclass','Sex','SibSp','Parch','Fare','Embarked',
    'Title','FamilySize','IsAlone','FamilyType','Deck','NameLength'
]
# Convertimos todo a strings o dummies
def preprocess_for_age(df):
    temp = df.copy()
    # Convertir col categóricas en label numérico rápido (o pd.get_dummies)
    # A efectos de simplicidad:
    cols_cat = ['Sex','Embarked','Title','FamilyType','Deck']
    for c in cols_cat:
        temp[c] = temp[c].astype(str)
        le2 = LabelEncoder()
        temp[c] = le2.fit_transform(temp[c])
    return temp

age_not_null_proc = preprocess_for_age(age_not_null)
age_null_proc     = preprocess_for_age(age_null)

X_age_train = age_not_null_proc[features_age]
y_age_train = age_not_null_proc['Age']

tree_age = DecisionTreeRegressor(random_state=123, max_depth=8)
tree_age.fit(X_age_train, y_age_train)

X_age_test = age_null_proc[features_age]
pred_age = tree_age.predict(X_age_test)

full_data.loc[full_data['Age'].isnull(), 'Age'] = pred_age


# =====================================================================
# 8) Crear AgeClass y FarePerPerson
# =====================================================================
full_data['AgeClass'] = full_data['Age'] * full_data['Pclass']
full_data['FarePerPerson'] = full_data['Fare'] / full_data['FamilySize']


# =====================================================================
# 9) Revisar nulos finales
# =====================================================================
print("\nValores faltantes al final:\n", full_data.isnull().sum())

# =====================================================================
# 10) Eliminar columnas no necesarias
# =====================================================================
full_data.drop(columns=['Name','Ticket','Cabin','PassengerId'], inplace=True, errors='ignore')

print("Feature engineering e imputaciones completadas.")


Valores faltantes al final:
 PassengerId        0
Survived         418
Pclass             0
Name               0
Sex                0
Age                0
SibSp              0
Parch              0
Ticket             0
Fare               0
Cabin              0
Embarked           0
Title              0
FamilySize         0
IsAlone            0
FamilyType         0
Deck               0
NameLength         0
AgeClass           0
FarePerPerson      0
dtype: int64
Feature engineering (avanzado) e imputaciones completadas.


---

In [25]:
full_data.columns

Index(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
       'Embarked', 'Title', 'FamilySize', 'IsAlone', 'FamilyType', 'Deck',
       'NameLength', 'AgeClass', 'FarePerPerson'],
      dtype='object')

In [28]:
full_data["AgeClass"]

0        66.000000
1        38.000000
2        78.000000
3        35.000000
4       105.000000
           ...    
1304     89.690722
1305     39.000000
1306    115.500000
1307     89.690722
1308     17.942308
Name: AgeClass, Length: 1309, dtype: float64

### **04 - Separar nuevamente en `train_data` y `test_data`**

In [5]:
n_train = train.shape[0]
train_data = full_data.iloc[:n_train, :].copy()
test_data  = full_data.iloc[n_train:, :].copy()

print("Dimensiones train_data:", train_data.shape)
print("Dimensiones test_data :", test_data.shape)

# Convertir Survived a entero en train_data
train_data['Survived'] = train_data['Survived'].astype(int)

train_data.head(3)

Dimensiones train_data: (891, 16)
Dimensiones test_data : (418, 16)


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,FamilySize,IsAlone,FamilyType,Deck,NameLength,AgeClass,FarePerPerson
0,0,3,male,22.0,1,0,7.25,S,Mr,2,0,Small,U,23,66.0,3.625
1,1,1,female,38.0,1,0,71.2833,C,Mrs,2,0,Small,C,51,38.0,35.64165
2,1,3,female,26.0,0,0,7.925,S,Miss,1,1,Solo,U,22,78.0,7.925


---

### **05 - Crear `X_dummies` y `X_test_dummies`**

In [6]:
X = train_data.drop(columns=['Survived'])
y = train_data['Survived']

print("Tamaño X:", X.shape)
print("Tamaño y:", y.shape)

# Crear dummies (One-Hot)
X_dummies = pd.get_dummies(X, drop_first=False)
print("Tamaño X_dummies:", X_dummies.shape)

# Lo mismo para test
X_test_real = test_data.drop(columns=['Survived'])
X_test_dummies = pd.get_dummies(X_test_real, drop_first=False)

# Alinear columnas
X_test_dummies = X_test_dummies.reindex(columns=X_dummies.columns, fill_value=0)
print("Tamaño X_test_dummies:", X_test_dummies.shape)

# Dividir un pequeño hold-out de validación
X_train, X_val, y_train, y_val = train_test_split(
    X_dummies, y,
    test_size=0.2,
    random_state=123,
    shuffle=True
)
print("Tamaño X_train:", X_train.shape, "Tamaño X_val:", X_val.shape)

Tamaño X: (891, 15)
Tamaño y: (891,)
Tamaño X_dummies: (891, 34)
Tamaño X_test_dummies: (418, 34)
Tamaño X_train: (712, 34) Tamaño X_val: (179, 34)


---

### **06 - Definir función de construcción y evaluación del modelo (Keras)**

In [7]:
def build_and_evaluate_model(X_tr, y_tr, X_v, y_v,
                             hidden_layers, activation,
                             learning_rate, batch_size, epochs):
    """
    Construye y entrena un modelo Keras secuencial según los hiperparámetros.
    Retorna la métrica accuracy en validación.
    """
    model = Sequential()
    
    # Primera capa con input
    model.add(Dense(hidden_layers[0], activation=activation, input_dim=X_tr.shape[1]))
    
    # Resto de capas ocultas
    for units in hidden_layers[1:]:
        model.add(Dense(units, activation=activation))
    
    # Capa de salida
    model.add(Dense(1, activation='sigmoid'))
    
    # Compilar
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(
        optimizer=optimizer,
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    
    # Entrenar
    model.fit(X_tr, y_tr, epochs=epochs, batch_size=batch_size, verbose=0)
    
    # Evaluar en validación
    loss_val, acc_val = model.evaluate(X_v, y_v, verbose=0)
    return acc_val

---

### **07 - Búsqueda manual de hiperparámetros**

In [None]:
######### Time computation: 5min 5.3s

hidden_layer_options = [
    [12, 8],
    [16, 8, 4],
    [32, 16],
    [64, 32, 16]
]
activations = ['relu', 'tanh']
learning_rates = [0.01, 0.001]
batch_sizes = [16, 32]
epochs_list = [50, 100]

best_acc = 0
best_config = None

for hl in hidden_layer_options:
    for act in activations:
        for lr in learning_rates:
            for bs in batch_sizes:
                for ep in epochs_list:
                    acc_val = build_and_evaluate_model(
                        X_train, y_train,
                        X_val, y_val,
                        hidden_layers=hl,
                        activation=act,
                        learning_rate=lr,
                        batch_size=bs,
                        epochs=ep
                    )
                    
                    if acc_val > best_acc:
                        best_acc = acc_val
                        best_config = (hl, act, lr, bs, ep)

print("Mejor configuración encontrada:", best_config)
print("Mejor accuracy (validación):", round(best_acc, 4))


Mejor configuración encontrada: ([16, 8, 4], 'relu', 0.01, 16, 50)
Mejor accuracy (validación): 0.8715


---

### **Entrenar modelo final con la mejor configuración con TODOS los datos**

In [9]:
best_hidden_layers, best_act, best_lr, best_bs, best_ep = best_config

print("Entrenando modelo final con:")
print(f" - Capas ocultas: {best_hidden_layers}")
print(f" - Activación: {best_act}")
print(f" - LearningRate: {best_lr}")
print(f" - BatchSize: {best_bs}")
print(f" - Epochs: {best_ep}")

model_final = Sequential()

# Primera capa
model_final.add(Dense(best_hidden_layers[0], activation=best_act, input_dim=X_dummies.shape[1]))
# Resto
for units in best_hidden_layers[1:]:
    model_final.add(Dense(units, activation=best_act))
# Salida
model_final.add(Dense(1, activation='sigmoid'))

# Compilar
model_final.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=best_lr),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

# Entrenar con TODOS los datos
model_final.fit(X_dummies, y, epochs=best_ep, batch_size=best_bs, verbose=1)

# Evaluar en todo el train
loss_train, acc_train = model_final.evaluate(X_dummies, y, verbose=0)
print(f"Accuracy final en train_data: {acc_train:.4f}")


Entrenando modelo final con:
 - Capas ocultas: [16, 8, 4]
 - Activación: relu
 - LearningRate: 0.01
 - BatchSize: 16
 - Epochs: 50
Epoch 1/50
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.5051 - loss: 1.4256
Epoch 2/50
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7335 - loss: 0.6031
Epoch 3/50
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7748 - loss: 0.5461
Epoch 4/50
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7984 - loss: 0.5118
Epoch 5/50
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8168 - loss: 0.4757  
Epoch 6/50
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8339 - loss: 0.4481
Epoch 7/50
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8336 - loss: 0.4323  
Epoch 8/50
[1m56/56[0m [32m━━━━━━━━━

### **09 - Predicciones y Submission**

In [10]:
# Predicciones finales en test
pred_prob_test = model_final.predict(X_test_dummies)
pred_class_test = (pred_prob_test > 0.5).astype(int).ravel()

# Crear submission
original_test = pd.read_csv("data/titanic/test.csv", sep=",")
submission = pd.DataFrame({
    'PassengerId': original_test['PassengerId'],
    'Survived': pred_class_test
})

submission.to_csv("submission_TF_Keras_NEW_01.csv", index=False)
print("Archivo submission_TF_Keras_NEW_01.csv guardado.")
submission.head(5)


[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
Archivo submission_TF_Keras_NEW_01.csv guardado.


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [15]:
submission["Survived"].value_counts()

Survived
0    302
1    116
Name: count, dtype: int64

In [22]:
submission.sample(30)


Unnamed: 0,PassengerId,Survived
115,1007,0
39,931,0
293,1185,0
159,1051,0
224,1116,1
324,1216,1
60,952,0
131,1023,0
65,957,1
270,1162,0


---

## **10 - Permutation Importance**

In [16]:
from sklearn.inspection import permutation_importance
from sklearn.base import BaseEstimator

# Definir un wrapper para el modelo Keras
class KerasWrapper(BaseEstimator):
    def __init__(self, model):
        self.model = model
        
    def fit(self, X, y):
        # No hacemos nada aquí porque el modelo ya está entrenado
        return self
    
    def predict(self, X):
        # Asegurarnos de devolver las predicciones en formato adecuado
        prob = self.model.predict(X)  # Predicciones en probabilidad
        return (prob > 0.5).astype(int).ravel()  # Convertir a etiquetas binarias

# Crear el wrapper para el modelo final
wrapper = KerasWrapper(model_final)

# Calcular permutation importance
perm_result = permutation_importance(
    estimator=wrapper,
    X=X_dummies,
    y=y,
    scoring='f1',
    n_repeats=5,
    random_state=123
)

# Crear un DataFrame con los resultados
feat_imp = pd.DataFrame({
    'feature': X_dummies.columns,
    'importance_mean': perm_result.importances_mean,
    'importance_std': perm_result.importances_std
}).sort_values('importance_mean', ascending=False)

feat_imp.reset_index(drop=True, inplace=True)
print(feat_imp.head(15))


[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 927us/step
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 974us/step
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 916us/step
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 817us/step
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━

In [None]:
# Verificar el wrapper
test_predictions = wrapper.predict(X_dummies.iloc[:10])
print(test_predictions)  


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[0 1 0 1 0 0 0 0 0 1]


In [18]:
print("Columnas de X_dummies:", X_dummies.columns)


Columnas de X_dummies: Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'FamilySize', 'IsAlone',
       'NameLength', 'AgeClass', 'FarePerPerson', 'Sex_female', 'Sex_male',
       'Embarked_C', 'Embarked_Q', 'Embarked_S', 'Title_Dr', 'Title_Master',
       'Title_Miss', 'Title_Mr', 'Title_Mrs', 'Title_Rare',
       'Title_the Countess', 'FamilyType_Large', 'FamilyType_Small',
       'FamilyType_Solo', 'Deck_A', 'Deck_B', 'Deck_C', 'Deck_D', 'Deck_E',
       'Deck_F', 'Deck_G', 'Deck_T', 'Deck_U'],
      dtype='object')


In [19]:
print(perm_result.importances)  # Matriz con las importancias en cada repetición


[[-0.00722533 -0.00296848  0.00083479 -0.00854885 -0.00774942]
 [ 0.10495448  0.05959022  0.07076149  0.06771718  0.05433559]
 [ 0.00780343  0.0021558   0.00563406  0.0108545  -0.00264954]
 [ 0.00953947  0.00735187  0.00953947  0.00953947  0.0021558 ]
 [ 0.02136238  0.03475295  0.03121313  0.0289679   0.0289679 ]
 [-0.00347222 -0.00347222  0.00299956 -0.00082901 -0.00132246]
 [ 0.00347222  0.00997419 -0.00082901  0.00384924 -0.00561453]
 [ 0.0121408   0.01825639 -0.0019765   0.03004124  0.01041667]
 [ 0.20906532  0.19622137  0.21593268  0.18349359  0.207832  ]
 [ 0.04588076  0.03790358  0.03219367  0.04637854  0.04440978]
 [ 0.08394608  0.07177198  0.07767084  0.09375     0.09867914]
 [ 0.0108545   0.00048575  0.          0.00478408  0.00694444]
 [ 0.0021558   0.01566142  0.01614859  0.01216492  0.00347222]
 [ 0.01745892  0.02282801  0.02450117  0.01173477  0.01262125]
 [ 0.00866841  0.          0.00347222  0.0021558   0.00431911]
 [ 0.00131788  0.00347222  0.00131788  0.          0.00

In [26]:
import numpy as np
import pandas as pd
import plotly.express as px

# ================================
# 1. Conversión de importances a DataFrame
# ================================
# Caso típico:
# - Filas = 34 features
# - Columnas = 5 repeticiones
# => Asignamos las variables como 'index' y las repeticiones como 'columns'.
importances_df = pd.DataFrame(
    perm_result.importances,
    index=X_dummies.columns,               # Nombres de las 34 características
    columns=[f"Rep_{i+1}" for i in range(5)]  # 5 repeticiones
)

# ================================
# 2. Calcular la importancia media de cada variable
# ================================
# Hacemos la media por fila (axis=1), ya que cada fila es una feature.
importances_mean = importances_df.mean(axis=1)

# ================================
# 3. Crear un DataFrame con (Feature, Importance)
# ================================
importances_summary = pd.DataFrame({
    'Feature': importances_mean.index,
    'Importance': importances_mean.values
})

# ================================
# 4. Ordenar y seleccionar top 10
# ================================
top_10_features = importances_summary.sort_values(
    by='Importance',
    ascending=False
).head(10)

# ================================
# 5. Graficar con Plotly (barras horizontales)
# ================================
fig = px.bar(
    top_10_features,
    x='Importance',
    y='Feature',
    orientation='h',  # Barras horizontales
    title='Top 10 Características Más Importantes',
    labels={
        'Feature': 'Características',
        'Importance': 'Importancia'
    },
    text='Importance'
)

# ================================
# 6. Ajustar el diseño para que la barra más importante aparezca arriba
# ================================
fig.update_layout(
    yaxis=dict(autorange="reversed"),
    xaxis_title="Importancia",
    yaxis_title="Características"
)

# ================================
# 7. Mostrar la gráfica
# ================================
fig.show()


In [31]:
importances_summary_sorted = importances_summary.sort_values(
    by='Importance', 
    ascending=False
)

print(importances_summary_sorted.head(10))


             Feature  Importance
8           AgeClass    0.202509
10        Sex_female    0.085164
1                Age    0.071472
18          Title_Mr    0.049429
9      FarePerPerson    0.041353
4               Fare    0.029053
19         Title_Mrs    0.028045
16      Title_Master    0.025618
23  FamilyType_Small    0.025105
17        Title_Miss    0.022283


In [20]:
from sklearn.metrics import accuracy_score

# Métrica original
baseline_acc = accuracy_score(y, wrapper.predict(X_dummies))

# Permutamos una característica
X_permuted = X_dummies.copy()
X_permuted['Pclass'] = np.random.permutation(X_dummies['Pclass'])

permuted_acc = accuracy_score(y, wrapper.predict(X_permuted))
print(f"Accuracy original: {baseline_acc}, Accuracy permutado: {permuted_acc}")


[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 783us/step
Accuracy original: 0.8451178451178452, Accuracy permutado: 0.8484848484848485
