In [None]:
import pandas as _hex_pandas
import datetime as _hex_datetime
import json as _hex_json

In [None]:
# Presumably flags a scheduled run; the JSON literal decodes to Python False.
hex_scheduled = _hex_json.loads("false")

In [None]:
# User e-mail string decoded from a JSON literal (a placeholder address in this export).
hex_user_email = _hex_json.loads("\"example-user@example.com\"")

In [None]:
# User attributes; the JSON literal decodes to an empty dict.
hex_user_attributes = _hex_json.loads("{}")

In [None]:
# Run-context label; decodes to the string "logic".
hex_run_context = _hex_json.loads("\"logic\"")

In [None]:
# Timezone name; decodes to the string "UTC".
hex_timezone = _hex_json.loads("\"UTC\"")

In [None]:
# Project identifier, decoded as a UUID-formatted string.
hex_project_id = _hex_json.loads("\"019b140d-ba31-7557-bd26-114cce2d44b3\"")

In [None]:
# Project display name.
# NOTE(review): the decoded string ends with a newline character; strip it
# before using the name in titles or file names.
hex_project_name = _hex_json.loads("\"Explainable Survival: From Prediction to Trust in Hex\\n\"")

In [None]:
# Project status; decodes to an empty string.
hex_status = _hex_json.loads("\"\"")

In [None]:
# Project categories; decodes to an empty list.
hex_categories = _hex_json.loads("[]")

In [None]:
# Ten hex colour codes decoded into a list of strings. Not referenced by any
# other cell visible in this notebook.
hex_color_palette = _hex_json.loads("[\"#4C78A8\",\"#F58518\",\"#E45756\",\"#72B7B2\",\"#54A24B\",\"#EECA3B\",\"#B279A2\",\"#FF9DA6\",\"#9D755D\",\"#BAB0AC\"]")

In [None]:
# Installs SHAP into the kernel environment at run time.
# NOTE(review): the version is not pinned. The layout returned by
# TreeExplainer.shap_values differs between SHAP releases (a list with one
# array per class vs. a single 3-D array), so consider pinning a version.
!uv pip install shap

[2mUsing Python 3.11.14 environment at: /home/hexuser/.cache/pypoetry/virtualenvs/python-kernel-OtKFaj5M-py3.11[0m
[2K[2mResolved [1m18 packages[0m [2min 224ms[0m[0m                                        [0m
[2K[37m⠙[0m [2mPreparing packages...[0m (0/4)                                                   
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/4)----[0m[0m     0 B/14.89 KiB                     [1A
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/4)[2m[0m[0m 14.89 KiB/14.89 KiB                   [1A
[2mslicer    [0m [32m------------------------------[2m[0m[0m 14.89 KiB/14.89 KiB
[2K[2A[37m⠙[0m [2mPreparing packages...[0m (0/4)----[0m[0m     0 B/3.57 MiB                      [2A
[2mslicer    [0m [32m------------------------------[2m[0m[0m 14.89 KiB/14.89 KiB
[2K[2A[37m⠙[0m [2mPreparing packages...[0m (0/4)----[0m[0m 14.91 KiB/3.57 MiB                    [2A
[2mslicer    [0m [32m------------------------------[2m[0m[0

## From Raw Data to Trustworthy Predictions

This project reframes the Titanic survival prediction task as an
**explainable machine learning and model trust problem**, not a
leaderboard optimization exercise.


In [None]:
import pandas as pd

# Hex uploaded files are accessible by filename
# (the path is relative, so "train.csv" must be in the working directory)
train_raw = pd.read_csv("train.csv")

# Quick sanity checks: print the (rows, columns) shape, then show the first rows
print("Rows, cols:", train_raw.shape)
train_raw.head()

Rows, cols: (891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
# Separate the target column from the remaining raw columns.
# Both names are reassigned by a later cell, which rebuilds X through
# transform_data; this untransformed X is not used in between.
y = train_raw["Survived"]
X = train_raw.drop(columns=["Survived"])

In [None]:
# Define a function to perform the data transformations
def transform_data(data):
    """Turn raw Titanic passenger columns into the numeric model features.

    The input frame is never modified; every step works on a new frame, so
    the function can be called repeatedly on the same raw data.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
        'Ticket', 'Cabin', 'Fare', 'PassengerId' and 'Embarked'. Any other
        columns are passed through unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame in which:
        * 'Name', 'Ticket', 'Cabin', 'Fare' and 'PassengerId' are removed;
        * 'Sex' is 0 for 'male' and 1 for 'female' (other values become NaN);
        * 'Age' is replaced by 'AgeMissing', which is 1 when a positive age
          is present and 0 when the age was missing. Despite its name, 1
          therefore means "age recorded";
        * 'FamSz' is 'SibSp' + 'Parch';
        * 'Embarked' has gaps filled with the most frequent port and is
          coded S=1, C=2, Q=3 (other values become NaN).

    Raises
    ------
    KeyError
        If one of the required columns is absent.
    """
    # Drop 'Name' column (drop returns a new frame, leaving the caller's intact)
    data = data.drop(columns=['Name'])

    # Convert 'Sex' to 0 for 'male' and 1 for 'female'
    sex_mapping = {'male': 0, 'female': 1}
    data['Sex'] = data['Sex'].map(sex_mapping)

    # Encode Age as a binary "present vs missing" indicator:
    # missing -> 0, any positive age -> 1, non-positive values kept as they are
    age = data['Age'].fillna(0)
    data['Age'] = age.where(~(age > 0), 1)

    # The column keeps its historical name because downstream cells refer to it
    data = data.rename(columns={'Age': 'AgeMissing'})

    # Create a new feature 'FamSz' by summing 'SibSp' and 'Parch'
    data['FamSz'] = data['SibSp'] + data['Parch']

    # Drop 'Ticket', 'Cabin', 'Fare', 'PassengerId' columns
    data = data.drop(columns=['Ticket', 'Cabin', 'Fare', 'PassengerId'])

    # Fill missing values in 'Embarked' with the mode; mode() is empty when the
    # column holds no values at all, in which case there is nothing to fill with
    embarked_modes = data['Embarked'].mode()
    if not embarked_modes.empty:
        data['Embarked'] = data['Embarked'].fillna(embarked_modes.iloc[0])

    # Convert 'Embarked' to numeric values (1, 2, 3)
    embarked_mapping = {'S': 1, 'C': 2, 'Q': 3}
    data['Embarked'] = data['Embarked'].map(embarked_mapping)

    return data

In [None]:
# Target: the Survived column of the raw training data.
y = train_raw["Survived"]
# Features: every other raw column, passed through transform_data. The drop()
# call hands transform_data a fresh frame, so train_raw itself is not altered.
X = transform_data(train_raw.drop(columns=["Survived"]))

In [None]:
# Column names of the transformed feature frame, in frame order.
feature_names = X.columns.tolist()
print(feature_names)

['Pclass', 'Sex', 'AgeMissing', 'SibSp', 'Parch', 'Embarked', 'FamSz']


In [None]:
# Sanity checks on the transformed features: the shape, the total number of
# missing cells across all columns, then the first rows.
print("X shape:", X.shape)
print("Missing values:", X.isnull().sum().sum())
X.head()

X shape: (891, 7)
Missing values: 0


Unnamed: 0,Pclass,Sex,AgeMissing,SibSp,Parch,Embarked,FamSz
0,3,0,1.0,1,0,1,1
1,1,1,1.0,1,0,2,1
2,3,1,1.0,0,0,1,0
3,1,1,1.0,1,0,1,1
4,3,0,1.0,0,0,1,0


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. The split is stratified on y and
# uses a fixed seed, so it is repeatable.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training rows only, then apply the same fitted
# transform to the holdout rows.
hex_scaler = StandardScaler()
X_train_scaled = hex_scaler.fit_transform(X_train)
X_holdout_scaled = hex_scaler.transform(X_holdout)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid: 7 * 6 * 3 * 4 * 2 = 1008 candidate combinations.
param_grid = {
    'n_estimators': [2, 4, 8, 10, 25, 50, 100],
    'max_depth': [None, 2, 3, 4, 8, 16],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4, 6],
    'bootstrap': [True, False]
}

# Fixed seed so each candidate forest is reproducible.
rfc = RandomForestClassifier(random_state=42)

# Exhaustive search with 10-fold cross-validation (10 * 1008 = 10080 fits).
# No `scoring` is given, so the estimator's default score is used.
# n_jobs=-1 requests all available cores; verbose=2 logs progress.
grid_search = GridSearchCV(
    estimator=rfc,
    param_grid=param_grid,
    cv=10,
    n_jobs=-1,
    verbose=2
)

grid_search.fit(X_train_scaled, y_train)

# The best-scoring configuration, as refitted by GridSearchCV; this is the
# model used for predictions in the rest of the notebook.
hex_model = grid_search.best_estimator_

Fitting 10 folds for each of 1008 candidates, totalling 10080 fits


In [None]:
# Show the winning hyper-parameter combination from the grid search.
grid_search.best_params_

{'bootstrap': False,
 'max_depth': 3,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 8}

In [None]:
# Feature columns in the order of X, which is the order the scaler and model
# were fitted with. predict_survival_proba uses this list to select and order
# the columns of incoming frames.
hex_feature_names = X.columns.tolist()

In [None]:
def predict_survival_proba(input_df):
    """Return the predicted survival probability for each row of input_df.

    The frame is first reduced and reordered to the columns listed in
    hex_feature_names, scaled with hex_scaler, and scored by hex_model.
    The result is the second column of predict_proba, one value per row.
    """
    ordered_features = input_df[hex_feature_names]
    class_probabilities = hex_model.predict_proba(
        hex_scaler.transform(ordered_features)
    )
    return class_probabilities[:, 1]

In [None]:
import json as _hex_json

# Passenger-class selection; a later cell translates it to 1-3 via pclass_map.
input_1 = _hex_json.loads("\"3rd\"")

In [None]:
import json as _hex_json

# Sex selection; a later cell translates it to 0/1 via sex_map.
input_2 = _hex_json.loads("\"Female\"")

In [None]:
import json as _hex_json

# Numeric input used as the "SibSp" feature (and as part of "FamSz") later on.
input_4 = _hex_json.loads("0")

In [None]:
import json as _hex_json

# Numeric input used as the "Parch" feature (and as part of "FamSz") later on.
input_5 = _hex_json.loads("0")

In [None]:
import json as _hex_json

# Embarkation-port selection; a later cell translates it to 1-3 via embarked_map.
input_6 = _hex_json.loads("\"Southampton\"")

In [None]:
import json as _hex_json

# Yes/No selection; a later cell translates it via age_missing_map
# ("Yes" -> 0, "No" -> 1) into the "AgeMissing" feature.
input_9 = _hex_json.loads("\"No\"")

In [None]:
# Lookup tables from the friendly UI labels to the numeric codes used in training
pclass_map = {"1st": 1, "2nd": 2, "3rd": 3}
sex_map = {"Male": 0, "Female": 1}
embarked_map = {"Southampton": 1, "Cherbourg": 2, "Queenstown": 3}
# NOTE(review): "Yes" yields 0 here, while transform_data encodes a recorded
# age as 1. That is consistent only if the control asks "Is the age
# missing?"; confirm against the input's label.
age_missing_map = {"Yes": 0, "No": 1}

# dict.get(value, value) passes a value through unchanged when it has no entry
pclass_value = pclass_map.get(input_1, input_1)
sex_value = sex_map.get(input_2, input_2)
embarked_value = embarked_map.get(input_6, input_6)
age_missing_value = age_missing_map.get(input_9, input_9)

# Single-row frame describing the hypothetical passenger
passenger_record = dict(
    Pclass=pclass_value,
    Sex=sex_value,
    AgeMissing=age_missing_value,
    SibSp=input_4,
    Parch=input_5,
    FamSz=input_4 + input_5,
    Embarked=embarked_value,
)
input_df = pd.DataFrame([passenger_record])

# First (and only) element of the per-row probability array
survival_proba = predict_survival_proba(input_df)[0]

In [None]:
# Display the predicted survival probability for the current inputs.
survival_proba

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Recreate the analysis split
# (same X, y, test_size, seed and stratification as the earlier
# train/holdout split, so the partitions are the same)
# X_explain and y_explain are not used by any cell visible in this notebook.
X_train_a, X_explain, y_train_a, y_explain = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale using the SAME scaler
# (transform only; the scaler is not refitted here)
X_train_a_scaled = hex_scaler.transform(X_train_a)

# Train analysis model with best params
# NOTE(review): same data, hyper-parameters and seed as the estimator that
# GridSearchCV refits, so this likely duplicates hex_model. Confirm whether
# a separate model is needed for the SHAP analysis.
analysis_model = RandomForestClassifier(
    **grid_search.best_params_,
    random_state=42
)

analysis_model.fit(X_train_a_scaled, y_train_a)

In [None]:
import shap

# Build the explainer ONCE per kernel (fast path for tree models)
# No background dataset is passed; the explainer is configured with the
# "tree_path_dependent" perturbation mode and explains analysis_model.
shap_explainer = shap.TreeExplainer(
    analysis_model,
    feature_perturbation="tree_path_dependent"
)

In [None]:
# Scale the current app input (columns selected and ordered as at fit time)
input_scaled = hex_scaler.transform(input_df[hex_feature_names])

# Compute SHAP values for this single observation
shap_values = shap_explainer.shap_values(input_scaled)

# Keep the class-1 (survival) contributions of the single input row.
# The return layout depends on the installed SHAP version:
#   * list with one (n_samples, n_features) array per class
#   * one (n_samples, n_features, n_classes) array
#   * one (n_samples, n_features) array for single-output models
# The 3-D layout must be sliced explicitly. Passing it on whole would let a
# later flatten() interleave class-0 and class-1 values and attach them to
# the wrong feature names.
if isinstance(shap_values, list):
    shap_row = shap_values[1][0]
elif shap_values.ndim == 3:
    shap_row = shap_values[0, :, 1]
elif shap_values.ndim == 2:
    shap_row = shap_values[0]
else:
    # Already one value per feature
    shap_row = shap_values

In [None]:
import pandas as pd
import numpy as np

# Reduce shap_row to exactly one class-1 (survival) value per feature.
# Multi-dimensional layouts are sliced rather than flattened: flattening a
# (..., n_features, n_classes) array interleaves the two classes, and
# truncating the result pairs values with the wrong feature names.
shap_array = np.asarray(shap_row)
if shap_array.ndim == 3:
    # (n_samples, n_features, n_classes): first row, survival class
    shap_array = shap_array[0, :, 1]
elif shap_array.ndim == 2 and shap_array.shape == (len(hex_feature_names), 2):
    # (n_features, n_classes): survival class
    shap_array = shap_array[:, 1]
shap_values_1d = np.atleast_1d(shap_array).flatten()

# Fail loudly on a mismatch instead of silently truncating
n_features = len(hex_feature_names)
if len(shap_values_1d) != n_features:
    raise ValueError(
        f"Expected {n_features} SHAP values (one per feature), "
        f"got {len(shap_values_1d)}"
    )

explanation_df = pd.DataFrame({
    "Feature": list(hex_feature_names),
    "Contribution": list(shap_values_1d)
})

# Sort by absolute impact and keep the five strongest contributions
explanation_df["AbsContribution"] = explanation_df["Contribution"].abs()
explanation_df = explanation_df.sort_values("AbsContribution", ascending=False).head(5)

# Drop helper column and reset index
explanation_df = explanation_df[["Feature", "Contribution"]].reset_index(drop=True)

explanation_df

Unnamed: 0,Feature,Contribution
0,SibSp,0.278868
1,AgeMissing,-0.278868
2,Pclass,0.085868
3,Sex,-0.085868
4,Embarked,-0.009093


In [None]:
import pandas as pd

# Human-readable names for the model's feature columns
feature_labels = dict(
    Pclass="Passenger class",
    Sex="Sex",
    AgeMissing="Age recorded?",
    SibSp="Siblings / spouses",
    Parch="Parents / children",
    FamSz="Family size",
    Embarked="Port",
)

# Build a new frame whose Feature column carries the friendly label where one
# exists; names without an entry are kept as they are
explanation_df = explanation_df.assign(
    Feature=explanation_df["Feature"].map(
        lambda name: feature_labels.get(name, name)
    )
)

explanation_df

Unnamed: 0,Feature,Contribution
0,Siblings / spouses,0.278868
1,Age recorded?,-0.278868
2,Passenger class,0.085868
3,Sex,-0.085868
4,Port,-0.009093


This table shows the factors that most influenced the model’s prediction
for the selected passenger. Positive values increase the predicted chance
of survival; negative values decrease it.

In [None]:
# Display the labelled contribution table again (same frame as the previous cell).
explanation_df

This section explores how the model behaves across different passenger groups and highlights known limitations. The goal is transparency, not optimization.

In [None]:
from sklearn.metrics import accuracy_score

# Generate predictions on holdout set
# (uses the holdout features scaled with the scaler fitted on the training rows)
y_holdout_pred = hex_model.predict(X_holdout_scaled)

# Calculate accuracy
# (fraction of holdout rows whose predicted label equals the true label)
overall_accuracy = accuracy_score(y_holdout, y_holdout_pred)
overall_accuracy

0.7932960893854749

In [None]:
# Display the holdout accuracy again (same value as the previous cell).
overall_accuracy

Holdout accuracy is the share of correct predictions on the 20% of passengers that were held out from model training.

In [None]:
import pandas as pd

# Per-passenger evaluation table: holdout features plus the true outcome, the
# prediction, readable sex/class labels, and a misclassification flag
eval_df = X_holdout.assign(
    y_true=y_holdout,
    y_pred=y_holdout_pred,
    Sex=lambda frame: frame["Sex"].map({0: "Male", 1: "Female"}),
    **{
        "Passenger class": lambda frame: frame["Pclass"].map(
            {1: "1st class", 2: "2nd class", 3: "3rd class"}
        ),
        "Error": lambda frame: frame["y_true"] != frame["y_pred"],
    },
)

# Number of passengers and share of wrong predictions per sex/class cohort
cohort_errors = (
    eval_df
    .groupby(["Sex", "Passenger class"])["Error"]
    .agg(Samples="count", ErrorRate="mean")
    .reset_index()
)

cohort_errors

Unnamed: 0,Sex,Passenger class,Samples,ErrorRate
0,Female,1st class,15,0.2
1,Female,2nd class,18,0.055556
2,Female,3rd class,28,0.321429
3,Male,1st class,30,0.366667
4,Male,2nd class,16,0.1875
5,Male,3rd class,72,0.138889


In [None]:
# Display the cohort error table again (same frame as the previous cell).
cohort_errors

**Observed limitations**

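In the holdout set, error rates are highest for 1st-class male passengers (about 37%) and 3rd-class female passengers (about 32%), and lowest for 2nd-class female passengers (about 6%). Predictions rely on historical patterns from the Titanic dataset and may reflect historical bias.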


**What this shows**

Error rates are not uniform across passenger groups. Some combinations of
sex and ticket class are more difficult for the model to predict accurately,
often due to smaller sample sizes or overlapping feature patterns.

All error rates are computed by comparing model predictions to known survival outcomes in the Titanic dataset.


In [None]:
import json as _hex_json

# Boolean toggle; a later cell computes the ground-truth composition table
# only when this is true, and builds an empty table otherwise.
input_3 = _hex_json.loads("true")

**Ground truth and evaluation**

Ground truth labels are used here only to evaluate model behavior across groups—not to influence individual predictions.

In [None]:
import pandas as pd

# --- Ground truth composition by cohort ---
# When the toggle is off, an empty table with the same columns is produced.

if not input_3:
    gt_composition = pd.DataFrame(columns=["Sex", "Passenger class", "TotalPassengers", "Survivors", "Non-survivors", "% Survived"])
else:
    gt_composition = (
        eval_df
        .groupby(["Sex", "Passenger class"])["y_true"]
        .agg(TotalPassengers="count", Survivors="sum")
        .reset_index()
        .assign(**{
            # Passengers in the cohort who did not survive
            "Non-survivors": lambda table: table["TotalPassengers"] - table["Survivors"],
            # Survival share as a fraction between 0 and 1
            "% Survived": lambda table: table["Survivors"] / table["TotalPassengers"],
        })
    )

gt_composition

Unnamed: 0,Sex,Passenger class,TotalPassengers,Survivors,Non-survivors,% Survived
0,Female,1st class,15,14,1,0.933333
1,Female,2nd class,18,17,1,0.944444
2,Female,3rd class,28,14,14,0.5
3,Male,1st class,30,11,19,0.366667
4,Male,2nd class,16,3,13,0.1875
5,Male,3rd class,72,10,62,0.138889


In [None]:
# Display the ground-truth composition table again (same frame as the previous cell).
gt_composition

**Prediction Error Rates**

\* The original Titanic dataset has missing age values for some passengers. The age input indicates only whether the passenger’s age __was recorded__, not their actual age.

An interactive prediction of survival probability for a hypothetical passenger, based on selected passenger characteristics.

The features that most strongly increased or decreased the predicted survival probability for the selected passenger.

Overall model performance and error patterns across the full dataset (not tied to the current prediction), broken down by key demographic and socioeconomic groups. This section explores how the model behaves across different passenger groups and highlights known limitations. The goal is transparency, not optimization.