# **TITANIC - TensorFlow con KERAS**

---

### **01 - Importar librerías, configurar seeds**

In [40]:
# Carga de librerías generales
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Para splits y métricas
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Para imputación con árboles
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.preprocessing import LabelEncoder

# Keras
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense

# Fijar semillas para reproducibilidad
import random
tf.random.set_seed(1)
np.random.seed(1)
random.seed(1)

import warnings
warnings.filterwarnings("ignore")  # Para ocultar warnings

print("Librerías importadas y semillas configuradas.")


Librerías importadas y configuración lista.


---

### **02 - Carga de datos y unificación**

In [None]:
# Carga de train y test
train = pd.read_csv("data/titanic/train.csv", sep=",")
test  = pd.read_csv("data/titanic/test.csv", sep=",")

print("Dimensiones de train:", train.shape)
print("Dimensiones de test: ", test.shape)

# Añadir la columna 'Survived' a test para unificar datos
test['Survived'] = np.nan

# Combinar datasets
full_data = pd.concat([train, test], ignore_index=True, axis=0)
print("Dimensiones de full_data (train + test unificados):", full_data.shape)

display(train.head(3))
display(test.head(3))
full_data.head(3)


Dimensiones de train: (891, 12)
Dimensiones de test: (418, 12)
Dimensiones de full_data: (1309, 12)


---

### **03 - Feature Engineering**

In [42]:
# =====================================================================
# 1) Título de 'Name'
# =====================================================================
full_data['Title'] = full_data['Name'].apply(lambda x: x.split(',')[1].split('.')[0].strip())

# Agrupar títulos poco comunes
rare_titles = ['Lady','Countess','Capt','Col','Don','Major','Rev','Sir','Jonkheer','Dona']
full_data['Title'] = full_data['Title'].replace(rare_titles, 'Rare')
full_data['Title'] = full_data['Title'].replace({'Mlle':'Miss', 'Ms':'Miss', 'Mme':'Mrs'})

full_data['Title'] = full_data['Title'].astype('category')


# =====================================================================
# 2) FamilySize y FamilyType
# =====================================================================
full_data['FamilySize'] = full_data['SibSp'] + full_data['Parch'] + 1
full_data['IsAlone'] = (full_data['FamilySize'] == 1).astype(int)

def family_type(x):
    if x == 1:
        return 'Solo'
    elif x <= 4:
        return 'Small'
    else:
        return 'Large'
full_data['FamilyType'] = full_data['FamilySize'].apply(family_type).astype('category')


# =====================================================================
# 3) Deck (opcional) a partir de 'Cabin'
# =====================================================================
# A veces aporta algo de señal
# Si la has droppeado, elimínalo.
full_data['Cabin'] = full_data['Cabin'].fillna("U")  # temporal
full_data['Deck'] = full_data['Cabin'].map(lambda x: str(x)[0])
# Convertir 'U' en np.nan si deseas
# full_data.loc[full_data['Deck'] == 'U', 'Deck'] = np.nan
full_data['Deck'] = full_data['Deck'].astype('category')


# =====================================================================
# 4) Extraer otras features
# =====================================================================
full_data['NameLength'] = full_data['Name'].apply(len)


# =====================================================================
# 5) Imputar Embarked (2 nulos) con DecisionTreeClassifier
# =====================================================================

embarked_null = full_data[full_data['Embarked'].isnull()].copy()
embarked_not_null = full_data[full_data['Embarked'].notnull()].copy()

# Variables para predecir 'Embarked'
features_embarked = ['Pclass','Sex','Fare','Age','Title','FamilySize']

# Codificamos 'Sex' y 'Title' en embarked_not_null
label_encoders = {}

for col in ['Sex','Title']:
    le = LabelEncoder()
    embarked_not_null[col] = embarked_not_null[col].astype(str)
    le.fit(embarked_not_null[col])
    embarked_not_null[col] = le.transform(embarked_not_null[col])
    label_encoders[col] = le

tree_embarked = DecisionTreeClassifier(random_state=123)
tree_embarked.fit(embarked_not_null[features_embarked], embarked_not_null['Embarked'])

# Ahora transformamos embarked_null
for col in ['Sex','Title']:
    embarked_null[col] = embarked_null[col].astype(str)
    embarked_null[col] = label_encoders[col].transform(embarked_null[col])

pred_embarked = tree_embarked.predict(embarked_null[features_embarked])
full_data.loc[full_data['Embarked'].isnull(), 'Embarked'] = pred_embarked

full_data['Embarked'] = full_data['Embarked'].astype('category')


# =====================================================================
# 6) Imputar Fare (si hay missing)
# =====================================================================
full_data['Fare'] = full_data['Fare'].fillna(full_data['Fare'].median())


# =====================================================================
# 7) Imputar Age con un árbol de regresión (más complejo que la mediana)
#    -> Para mayor exactitud, usamos variables: Pclass, Sex, SibSp, Parch,
#       Fare, Embarked, Title, FamilySize, ...
# =====================================================================

age_null = full_data[full_data['Age'].isnull()].copy()
age_not_null = full_data[full_data['Age'].notnull()].copy()

# Variables para predecir Age
features_age = [
    'Pclass','Sex','SibSp','Parch','Fare','Embarked',
    'Title','FamilySize','IsAlone','FamilyType','Deck','NameLength'
]
# Convertirlo todo a strings o dummies
def preprocess_for_age(df):
    temp = df.copy()
    # Convertir col categóricas en label numérico rápido (o pd.get_dummies)
    # A efectos de simplicidad:
    cols_cat = ['Sex','Embarked','Title','FamilyType','Deck']
    for c in cols_cat:
        temp[c] = temp[c].astype(str)
        le2 = LabelEncoder()
        temp[c] = le2.fit_transform(temp[c])
    return temp

age_not_null_proc = preprocess_for_age(age_not_null)
age_null_proc     = preprocess_for_age(age_null)

X_age_train = age_not_null_proc[features_age]
y_age_train = age_not_null_proc['Age']

tree_age = DecisionTreeRegressor(random_state=123, max_depth=8)
tree_age.fit(X_age_train, y_age_train)

X_age_test = age_null_proc[features_age]
pred_age = tree_age.predict(X_age_test)

full_data.loc[full_data['Age'].isnull(), 'Age'] = pred_age


# =====================================================================
# 8) Crear AgeClass y FarePerPerson
# =====================================================================
full_data['AgeClass'] = full_data['Age'] * full_data['Pclass']
full_data['FarePerPerson'] = full_data['Fare'] / full_data['FamilySize']


# =====================================================================
# 9) Revisar nulos finales
# =====================================================================
print("\nValores faltantes al final:\n", full_data.isnull().sum())

# =====================================================================
# 10) Eliminar columnas que ya no necesites
# =====================================================================
# Decides si dejar 'Cabin' o no
full_data.drop(columns=['Name','Ticket','Cabin','PassengerId'], inplace=True, errors='ignore')

print("Feature engineering (avanzado) e imputaciones completadas.")

ValueError: Unknown label type: continuous. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.

---

In [18]:
full_data["AgeClass"]

0        66.000000
1        38.000000
2        78.000000
3        35.000000
4       105.000000
           ...    
1304     89.690722
1305     39.000000
1306    115.500000
1307     89.690722
1308     17.942308
Name: AgeClass, Length: 1309, dtype: float64

### **04 - Separar nuevamente en `train_data` y `test_data`**

In [None]:
n_train = train.shape[0]
train_data = full_data.iloc[:n_train, :].copy()
test_data  = full_data.iloc[n_train:, :].copy()

print("Dimensiones train_data:", train_data.shape)
print("Dimensiones test_data :", test_data.shape)

# Convertir Survived a entero en train_data
train_data['Survived'] = train_data['Survived'].astype(int)

train_data.head(3)

Dimensiones train_data: (891, 15)
Dimensiones test_data : (418, 15)


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,FamilySize,IsAlone,FamilyType,NameLength,AgeClass,FarePerPerson
0,0,3,male,22.0,1,0,7.25,S,Mr,2,0,Small,23,66.0,3.625
1,1,1,female,38.0,1,0,71.2833,C,Mrs,2,0,Small,51,38.0,35.64165
2,1,3,female,26.0,0,0,7.925,S,Miss,1,1,Solo,22,78.0,7.925
3,1,1,female,35.0,1,0,53.1,S,Mrs,2,0,Small,44,35.0,26.55
4,0,3,male,35.0,0,0,8.05,S,Mr,1,1,Solo,24,105.0,8.05


---

### **05 - Crear `X_dummies` y `X_test_dummies`**

In [19]:
X = train_data.drop(columns=['Survived'])
y = train_data['Survived']

print("Tamaño X:", X.shape)
print("Tamaño y:", y.shape)

# Crear dummies (One-Hot)
X_dummies = pd.get_dummies(X, drop_first=False)
print("Tamaño X_dummies:", X_dummies.shape)

# Lo mismo para test
X_test_real = test_data.drop(columns=['Survived'])
X_test_dummies = pd.get_dummies(X_test_real, drop_first=False)

# Alinear columnas
X_test_dummies = X_test_dummies.reindex(columns=X_dummies.columns, fill_value=0)
print("Tamaño X_test_dummies:", X_test_dummies.shape)

# Dividir un pequeño hold-out de validación
X_train, X_val, y_train, y_val = train_test_split(
    X_dummies, y,
    test_size=0.2,
    random_state=123,
    shuffle=True
)
print("Tamaño X_train:", X_train.shape, "Tamaño X_val:", X_val.shape)

Tamaño X: (891, 14)
Tamaño y: (891,)
Tamaño X_dummies: (891, 25)
Tamaño X_test_dummies: (418, 25)
Tamaño X_train: (712, 25) Tamaño X_val: (179, 25)


---

### **06 - Definir función de construcción y evaluación del modelo (Keras)**

In [20]:
def build_and_evaluate_model(X_tr, y_tr, X_v, y_v,
                             hidden_layers, activation,
                             learning_rate, batch_size, epochs):
    """
    Construye y entrena un modelo Keras secuencial según los hiperparámetros.
    Retorna la métrica accuracy en validación.
    """
    model = Sequential()
    
    # Primera capa con input
    model.add(Dense(hidden_layers[0], activation=activation, input_dim=X_tr.shape[1]))
    
    # Resto de capas ocultas
    for units in hidden_layers[1:]:
        model.add(Dense(units, activation=activation))
    
    # Capa de salida
    model.add(Dense(1, activation='sigmoid'))
    
    # Compilar
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(
        optimizer=optimizer,
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    
    # Entrenar
    model.fit(X_tr, y_tr, epochs=epochs, batch_size=batch_size, verbose=0)
    
    # Evaluar en validación
    loss_val, acc_val = model.evaluate(X_v, y_v, verbose=0)
    return acc_val

---

### **07 - Búsqueda manual de hiperparámetros**

In [21]:
hidden_layer_options = [
    [12, 8],
    [16, 8, 4],
    [32, 16],
    [64, 32, 16]
]
activations = ['relu', 'tanh']
learning_rates = [0.01, 0.001]
batch_sizes = [16, 32]
epochs_list = [50, 100]

best_acc = 0
best_config = None

for hl in hidden_layer_options:
    for act in activations:
        for lr in learning_rates:
            for bs in batch_sizes:
                for ep in epochs_list:
                    acc_val = build_and_evaluate_model(
                        X_train, y_train,
                        X_val, y_val,
                        hidden_layers=hl,
                        activation=act,
                        learning_rate=lr,
                        batch_size=bs,
                        epochs=ep
                    )
                    
                    if acc_val > best_acc:
                        best_acc = acc_val
                        best_config = (hl, act, lr, bs, ep)

print("Mejor configuración encontrada:", best_config)
print("Mejor accuracy (validación):", round(best_acc, 4))


Mejor configuración encontrada: ([12, 8], 'relu', 0.01, 16, 50)
Mejor accuracy (validación): 0.6369


---

### **Entrenar modelo final con la mejor configuración con TODOS los datos**

In [22]:
best_hidden_layers, best_act, best_lr, best_bs, best_ep = best_config

print("Entrenando modelo final con:")
print(f" - Capas ocultas: {best_hidden_layers}")
print(f" - Activación: {best_act}")
print(f" - LearningRate: {best_lr}")
print(f" - BatchSize: {best_bs}")
print(f" - Epochs: {best_ep}")

model_final = Sequential()

# Primera capa
model_final.add(Dense(best_hidden_layers[0], activation=best_act, input_dim=X_dummies.shape[1]))
# Resto
for units in best_hidden_layers[1:]:
    model_final.add(Dense(units, activation=best_act))
# Salida
model_final.add(Dense(1, activation='sigmoid'))

# Compilar
model_final.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=best_lr),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

# Entrenar con TODOS los datos
model_final.fit(X_dummies, y, epochs=best_ep, batch_size=best_bs, verbose=1)

# Evaluar en todo el train
loss_train, acc_train = model_final.evaluate(X_dummies, y, verbose=0)
print(f"Accuracy final en train_data: {acc_train:.4f}")


Entrenando modelo final con:
 - Capas ocultas: [12, 8]
 - Activación: relu
 - LearningRate: 0.01
 - BatchSize: 16
 - Epochs: 50
Epoch 1/50
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.6238 - loss: nan
Epoch 2/50
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6238 - loss: nan
Epoch 3/50
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6267 - loss: nan
Epoch 4/50
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6238 - loss: nan
Epoch 5/50
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6238 - loss: nan
Epoch 6/50
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6238 - loss: nan
Epoch 7/50
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 969us/step - accuracy: 0.6238 - loss: nan
Epoch 8/50
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

### **09 - Predicciones y Submission**

In [25]:
# Predicciones finales en test
pred_prob_test = model_final.predict(X_test_dummies)
pred_class_test = (pred_prob_test > 0.5).astype(int).ravel()

# Crear submission
original_test = pd.read_csv("data/titanic/test.csv", sep=",")
submission = pd.DataFrame({
    'PassengerId': original_test['PassengerId'],
    'Survived': pred_class_test
})

submission.to_csv("submission_TF_Keras_NEW_01.csv", index=False)
print("Archivo submission_TF_Keras_NEW_01.csv guardado.")
submission.head(5)


[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
Archivo submission_TF_Keras_NEW_01.csv guardado.


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [44]:
submission.tail(30)


Unnamed: 0,PassengerId,Survived
388,1280,0
389,1281,0
390,1282,0
391,1283,0
392,1284,0
393,1285,0
394,1286,0
395,1287,0
396,1288,0
397,1289,0


---

## **10 - Permutation Importance**

In [32]:
from sklearn.inspection import permutation_importance
from sklearn.base import BaseEstimator

# Definir un wrapper para el modelo Keras
class KerasWrapper(BaseEstimator):
    def __init__(self, model):
        self.model = model
        
    def fit(self, X, y):
        # No hacemos nada aquí porque el modelo ya está entrenado
        return self
    
    def predict(self, X):
        # Asegurarnos de devolver las predicciones en formato adecuado
        prob = self.model.predict(X)  # Predicciones en probabilidad
        return (prob > 0.5).astype(int).ravel()  # Convertir a etiquetas binarias

# Crear el wrapper para el modelo final
wrapper = KerasWrapper(model_final)

# Calcular permutation importance
perm_result = permutation_importance(
    estimator=wrapper,
    X=X_dummies,
    y=y,
    scoring='f1',
    n_repeats=5,
    random_state=123
)

# Crear un DataFrame con los resultados
feat_imp = pd.DataFrame({
    'feature': X_dummies.columns,
    'importance_mean': perm_result.importances_mean,
    'importance_std': perm_result.importances_std
}).sort_values('importance_mean', ascending=False)

feat_imp.reset_index(drop=True, inplace=True)
print(feat_imp.head(15))


[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 838us/step
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 763us/step
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 841us/step
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 771us/step
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 790us/step
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 775us/step
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 764us/step
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 765us/step
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 776us/step
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 895us/step
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m28/28[0m [32m━━━━━━━━━━

In [33]:
# Verificar el wrapper
test_predictions = wrapper.predict(X_dummies.iloc[:10])
print(test_predictions)  # Asegúrate de que las predicciones sean 0 o 1


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[0 0 0 0 0 0 0 0 0 0]


In [34]:
print("Columnas de X_dummies:", X_dummies.columns)


Columnas de X_dummies: Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'FamilySize', 'IsAlone',
       'NameLength', 'AgeClass', 'FarePerPerson', 'Sex_female', 'Sex_male',
       'Embarked_C', 'Embarked_Q', 'Embarked_S', 'Title_Dr', 'Title_Master',
       'Title_Miss', 'Title_Mr', 'Title_Mrs', 'Title_Rare',
       'Title_the Countess', 'FamilyType_Large', 'FamilyType_Small',
       'FamilyType_Solo'],
      dtype='object')


In [35]:
print(perm_result.importances)  # Matriz con las importancias en cada repetición


[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]


In [36]:
from sklearn.metrics import accuracy_score

# Métrica original
baseline_acc = accuracy_score(y, wrapper.predict(X_dummies))

# Permutamos una característica
X_permuted = X_dummies.copy()
X_permuted['Pclass'] = np.random.permutation(X_dummies['Pclass'])

permuted_acc = accuracy_score(y, wrapper.predict(X_permuted))
print(f"Accuracy original: {baseline_acc}, Accuracy permutado: {permuted_acc}")


[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 870us/step
Accuracy original: 0.6161616161616161, Accuracy permutado: 0.6161616161616161
