### Ensemble: 
- Compare the original ensemble approach with an ensemble built using the stacking technique. The original approach is similar to a bagging ensemble, but it creates multiple GBT models by changing the random seed instead of boosting the same models iteratively.

In [None]:
# Read the labelled training file and the unlabelled serving (test) file.
train_df, serving_df = (
    pd.read_csv(csv_path)
    for csv_path in ("/content/train.csv", "/content/test.csv")
)

train_df.head(10)

In [None]:
def preprocess(df):
    """Return a copy of ``df`` with a cleaned "Name" and two ticket-derived columns.

    The input frame is left untouched. The result has:
      * "Name"          - each space-separated token stripped of leading and
                          trailing punctuation (``, ( ) [ ] . " '``).
      * "Ticket_number" - the last space-separated piece of "Ticket".
      * "Ticket_item"   - the pieces before the last one joined with "_",
                          or "NONE" when the ticket has a single piece.
    """

    def normalize_name(x):
        # Strip punctuation token by token; the spacing between tokens is kept.
        stripped_tokens = (token.strip(",()[].\"'") for token in x.split(" "))
        return " ".join(stripped_tokens)

    def ticket_number(x):
        # Everything after the last space (the whole string when there is none).
        return x.rpartition(" ")[2]

    def ticket_item(x):
        # rpartition yields an empty separator when the ticket has no space at all.
        prefix, separator, _ = x.rpartition(" ")
        if not separator:
            return "NONE"
        return prefix.replace(" ", "_")

    # assign() works on a copy, overwrites "Name" in place and appends the two
    # new columns in the order given.
    return df.assign(
        Name=df["Name"].apply(normalize_name),
        Ticket_number=df["Ticket"].apply(ticket_number),
        Ticket_item=df["Ticket"].apply(ticket_item),
    )

# Apply the same cleaning to the training frame and the serving frame.
preprocessed_train_df, preprocessed_serving_df = map(
    preprocess, (train_df, serving_df)
)

preprocessed_train_df.head(5)

In [None]:
# Start from every preprocessed column, then drop the raw ticket string,
# the passenger identifier and the label column.
input_features = list(preprocessed_train_df.columns)
for excluded in ("Ticket", "PassengerId", "Survived"):
    input_features.remove(excluded)
# "Ticket_number" is intentionally kept; removing it was tried and left disabled.

print(f"Input features: {input_features}")

In [None]:
# split train set to train and test
from sklearn.model_selection import train_test_split

# Hold out 20% of the preprocessed training rows for validation (fixed seed).
# NOTE(review): this rebinds `train_df`, which earlier held the raw CSV frame,
# so re-running earlier cells after this one would operate on the 80% split.
train_df, valid_df = train_test_split(
    preprocessed_train_df,
    test_size=0.2,
    random_state=42,
)

def tokenize_names(features, labels=None):
    """Divide the names into tokens. TF-DF can consume text tokens natively.

    Replaces ``features["Name"]`` with the result of ``tf.strings.split`` and
    returns the ``(features, labels)`` pair so it can be used with ``Dataset.map``.
    """
    name_tokens = tf.strings.split(features["Name"])
    features["Name"] = name_tokens
    return features, labels

# Create TensorFlow datasets
def _as_tokenized_dataset(frame, **dataset_kwargs):
    """Convert a DataFrame to a TF dataset and tokenize its "Name" feature."""
    dataset = tfdf.keras.pd_dataframe_to_tf_dataset(frame, **dataset_kwargs)
    return dataset.map(tokenize_names)

train_ds = _as_tokenized_dataset(train_df, label="Survived")
valid_ds = _as_tokenized_dataset(valid_df, label="Survived")
serving_ds = _as_tokenized_dataset(preprocessed_serving_df)  # no label passed for serving


In [None]:
from sklearn.metrics import accuracy_score, log_loss

# Train 100 GBT models that differ only in their random seed, collect each
# model's validation probabilities, and average them into `predictions`.
# NOTE(review): valid_ds is passed to fit() and is also the set scored here.
# If the learner uses validation_data for early stopping, the metrics computed
# from `predictions` are optimistic - confirm against the TF-DF documentation.
per_seed_predictions = []

for i in range(100):
    print(f"i:{i}")
    # Possible models: GradientBoostedTreesModel or RandomForestModel
    model = tfdf.keras.GradientBoostedTreesModel(
        random_seed=i,
        honest=True,
        verbose=0,  # Very few logs
        features=[
            tfdf.keras.FeatureUsage(name=feature_name)
            for feature_name in input_features
        ],
        exclude_non_specified_features=True,  # Only use the features in "features"
        # Hyperparameters that were tried and are currently disabled:
        #   min_examples=1, categorical_algorithm="RANDOM", max_depth=4,
        #   shrinkage=0.05, num_candidate_attributes_ratio=0.2,
        #   split_axis="SPARSE_OBLIQUE", sparse_oblique_normalization="MIN_MAX",
        #   sparse_oblique_num_projections_exponent=2.0, num_trees=2000,
        #   validation_ratio=0.0
    )
    model.fit(train_ds, validation_data=valid_ds)

    # Column 0 of the prediction matrix holds the per-example score.
    sub_predictions = model.predict(valid_ds, verbose=0)[:, 0]
    per_seed_predictions.append(sub_predictions)

# Mean over models, accumulated in training order.
num_predictions = len(per_seed_predictions)
predictions = sum(per_seed_predictions) / num_predictions
print(predictions)


In [None]:
# True labels, gathered batch by batch from valid_ds
y_valid = np.concatenate([batch_labels.numpy() for _, batch_labels in valid_ds])

# Hard class labels from the averaged probabilities (binary, 0.5 threshold)
final_predictions = (predictions > 0.5).astype(int)

# Accuracy uses the hard labels; log loss needs the probabilities themselves
accuracy = accuracy_score(y_valid, final_predictions)
loss = log_loss(y_valid, predictions)

print(f"Ensemble Accuracy: {accuracy:.4f}")
print(f"Ensemble Log Loss: {loss:.4f}")

In [None]:
# Recorded output of the evaluation cell above (kept as comments so this cell stays runnable):
# Ensemble Accuracy: 0.8101
# Ensemble Log Loss: 0.4051

In [None]:
import numpy as np
import pandas as pd

# Ground-truth labels from valid_ds, the averaged ensemble probabilities,
# and the hard 0/1 predictions obtained with a 0.5 threshold.
y_true = np.concatenate([batch_labels.numpy() for _, batch_labels in valid_ds])
y_prob = predictions
y_pred = np.greater(y_prob, 0.5).astype(int)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Confusion matrix of the thresholded ensemble predictions
cm = confusion_matrix(y_true, y_pred)

# Render it with readable class names on a blue colour map
class_names = ["Not Survived", "Survived"]
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
disp.plot(cmap="Blues")
plt.title("Confusion Matrix - Ensemble Model")
plt.show()

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Score the thresholded predictions with each metric in turn
accuracy, precision, recall, f1 = (
    score_fn(y_true, y_pred)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)

# Plot Classification Metrics
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
values = [accuracy, precision, recall, f1]

fig, ax = plt.subplots(figsize=(6, 4))
ax.bar(metrics, values, color=['blue', 'green', 'orange', 'red'])
ax.set(title="Classification Metrics", ylabel="Score", ylim=(0, 1))
plt.show()

# Print metrics for reference
for label, score in zip(("Accuracy", "Precision", "Recall", "F1 Score"), values):
    print(f"{label}: {score:.4f}")

In [None]:
from sklearn.metrics import roc_curve, auc

# ROC points and the area under the curve, from the ensemble probabilities
fpr, tpr, _ = roc_curve(y_true, y_prob)
roc_auc = auc(fpr, tpr)

# Draw the curve together with the chance diagonal
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color="blue", lw=2, label=f"ROC curve (AUC = {roc_auc:.2f})")
ax.plot([0, 1], [0, 1], color="gray", linestyle="--")  # Diagonal line (random classifier)
ax.set(
    xlim=(0.0, 1.0),
    ylim=(0.0, 1.05),
    xlabel="False Positive Rate",
    ylabel="True Positive Rate",
    title="Receiver Operating Characteristic (ROC) Curve",
)
ax.legend(loc="lower right")
plt.show()

### Bagging

import numpy as np
import tensorflow_decision_forests as tfdf
import tensorflow as tf
from sklearn.metrics import accuracy_score, log_loss

# Number of models in the ensemble
num_models = 10

# Store predictions from each model
bagging_predictions = []

# Draw the per-model seeds from a seeded generator so that a fresh
# "Restart & Run All" rebuilds exactly the same ensemble. Sampling without
# replacement keeps the seeds distinct, so no model is trained twice with the
# same seed.
seed_rng = np.random.default_rng(42)
random_seeds = seed_rng.choice(10000, size=num_models, replace=False).tolist()

# NOTE(review): the models below use the default feature set and
# hyperparameters, whereas the 100-seed ensemble earlier in this notebook
# restricts the inputs to `input_features` and sets honest=True. Confirm the
# difference is intended before comparing the reported scores.
for i, seed in enumerate(random_seeds):
    print(f"Training model {i+1}/{num_models} with seed {seed}")

    # Every model is fit on the same train_ds; only random_seed differs.
    # No bootstrap resampling of the training rows happens here.
    model = tfdf.keras.GradientBoostedTreesModel(
        random_seed=seed,
        verbose=0
    )

    # Train the model on the training dataset
    model.fit(train_ds, validation_data=valid_ds)

    # Predict on the validation dataset
    sub_predictions = model.predict(valid_ds, verbose=0)[:, 0]  # Get probability scores
    bagging_predictions.append(sub_predictions)

# Stack to shape (num_models, num_samples) and average over the model axis
bagging_predictions = np.array(bagging_predictions)
y_prob = np.mean(bagging_predictions, axis=0)  # Average over models

# Convert probabilities to class predictions (threshold at 0.5 for binary classification)
y_pred = (y_prob > 0.5).astype(int)

# Extract true labels from valid_ds
y_true = np.concatenate([y.numpy() for x, y in valid_ds])

# Compute accuracy and log loss
accuracy = accuracy_score(y_true, y_pred)
logloss = log_loss(y_true, y_prob)

print(f"Bagging Ensemble Accuracy: {accuracy:.4f}")
print(f"Bagging Ensemble Log Loss: {logloss:.4f}")

In [None]:
# Recorded output of the bagging cell above (kept as comments so this cell stays runnable):
# Bagging Ensemble Accuracy: 0.8156
# Bagging Ensemble Log Loss: 0.4183

### Stacking with logistic regression as meta model


In [None]:
import numpy as np
import tensorflow_decision_forests as tfdf
import tensorflow as tf
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import cross_val_predict

# Number of base models in the stacking ensemble
num_models = 5

# Draw the base-model seeds from a seeded generator so the run is reproducible;
# sampling without replacement keeps the seeds distinct.
seed_rng = np.random.default_rng(42)
random_seeds = seed_rng.choice(10000, size=num_models, replace=False).tolist()

# Store base model predictions
base_model_predictions = []

for i, seed in enumerate(random_seeds):
    print(f"Training base model {i+1}/{num_models} with seed {seed}")

    # Train a base model with different random seeds
    model = tfdf.keras.GradientBoostedTreesModel(
        random_seed=seed,
        verbose=0
    )

    # Train the model on training dataset
    model.fit(train_ds, validation_data=valid_ds)

    # Predict on the validation dataset (probabilities)
    sub_predictions = model.predict(valid_ds, verbose=0)[:, 0]  # Get probability scores
    base_model_predictions.append(sub_predictions)

# Stack to (num_models, num_samples), then transpose to (num_samples, num_models)
# so each row holds one validation example's base-model scores.
base_model_predictions = np.array(base_model_predictions).T

# Extract true labels from valid_ds
y_true = np.concatenate([y.numpy() for x, y in valid_ds])

# Meta-model (Logistic Regression)
meta_model = LogisticRegression()

# Score the meta-model with out-of-fold predictions: each validation row is
# predicted by a clone of the meta-model that was fitted on the other folds.
# Fitting and predicting on the same rows would report in-sample metrics and
# overstate the benefit of stacking.
y_prob = cross_val_predict(
    meta_model,
    base_model_predictions,
    y_true,
    cv=5,
    method="predict_proba",
)[:, 1]  # Probability of class 1
y_pred = (y_prob > 0.5).astype(int)  # Convert to binary predictions

# cross_val_predict leaves `meta_model` itself unfitted; fit it on all rows so
# it can be reused after this cell.
meta_model.fit(base_model_predictions, y_true)

# Compute accuracy and log loss
accuracy = accuracy_score(y_true, y_pred)
logloss = log_loss(y_true, y_prob)

print(f"Stacking Ensemble Accuracy: {accuracy:.4f}")
print(f"Stacking Ensemble Log Loss: {logloss:.4f}")

In [None]:
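# Recorded output of the stacking cell above from an earlier run (kept as comments so this cell stays runnable):
# Stacking Ensemble Accuracy: 0.8268
# Stacking Ensemble Log Loss: 0.4156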