Baseline model with engineered features and scaling, with no threshold tuning and __no__ Title feature

In [None]:
# ============================================================
# Baseline Model Using Engineered Features + One-Hot Encoding + Scaling
# ============================================================

import json
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    confusion_matrix,
    ConfusionMatrixDisplay,
    roc_curve,
    roc_auc_score,
    precision_recall_curve
)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# ============================================================
# 1. Load processed datasets
# ============================================================

# Read the engineered train/test feature tables.
train_csv = "../data/processed_v3/train_features.csv"
test_csv = "../data/processed_v3/test_features.csv"
train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

# PassengerId is dropped from the model inputs below, but the test-set ids
# are kept so they can be written into the submission file.
id_col = "PassengerId"
target_col = "Survived"
test_passenger_ids = test[id_col]

# ============================================================
# 2. Identify feature columns
# ============================================================

# Target vector, and feature tables with the target/id columns removed.
y = train[target_col]
X = train.drop(columns=[target_col, id_col])
X_test_final = test.drop(columns=[id_col])

# ============================================================
# 3. Load categorical metadata
# ============================================================

# Load the metadata stored next to the processed CSVs. JSON text is UTF-8,
# so the encoding is pinned instead of relying on the platform default.
with open("../data/processed_v3/processed_metadata.json", "r", encoding="utf-8") as f:
    meta = json.load(f)

# Every top-level key of the metadata is treated as a categorical column.
# NOTE(review): assumes the JSON root is an object keyed by column name —
# confirm against the step that writes processed_metadata.json.
categorical_cols = list(meta.keys())

# Fail fast with a readable message when the metadata names a column that the
# feature table does not have; otherwise the mismatch only surfaces later,
# inside ColumnTransformer.fit, with a much less specific error.
missing_categoricals = [col for col in categorical_cols if col not in X.columns]
if missing_categoricals:
    raise ValueError(
        "Categorical columns listed in processed_metadata.json are missing "
        f"from the training features: {missing_categoricals}"
    )

# Everything that is not listed as categorical is sent to the scaler.
# NOTE(review): assumes all remaining columns are numeric — verify.
categorical_lookup = set(categorical_cols)
numeric_cols = [col for col in X.columns if col not in categorical_lookup]

print("Categorical columns:", categorical_cols)
print("Numeric columns:", numeric_cols)

# ============================================================
# 4. Preprocessing: One-Hot + Scaling
# ============================================================

# Categories not seen during fit are encoded as all-zero rows instead of
# raising at predict time.
categorical_encoder = OneHotEncoder(handle_unknown="ignore")

# Standardize the remaining (non-categorical) columns.
numeric_scaler = StandardScaler()

# Route each column group to its transformer; the output is the one-hot block
# followed by the scaled block, in the order listed here.
preprocess = ColumnTransformer(
    transformers=[
        ("cat", categorical_encoder, categorical_cols),
        ("num", numeric_scaler, numeric_cols),
    ]
)

# ============================================================
# 5. Build Logistic Regression pipeline
# ============================================================

# max_iter is raised well above the library default to give the solver room
# to converge on the one-hot + scaled feature matrix.
logistic_regression = LogisticRegression(max_iter=5000)

# Preprocessing and classifier are chained so that both are fitted together
# and applied consistently to validation and test data.
pipeline_steps = [
    ("preprocess", preprocess),
    ("logreg", logistic_regression),
]
model = Pipeline(steps=pipeline_steps)

# ============================================================
# 6. Train/validation split
# ============================================================

# Hold out 20% of the labeled rows for validation; the fixed seed keeps the
# split reproducible between runs.
# NOTE(review): the split is not stratified on y. Adding stratify=y would
# change which rows land in the validation set, so it is left as-is here to
# keep results comparable with earlier runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# ============================================================
# 7. Train baseline model
# ============================================================

# Fit preprocessing + classifier on the training partition only.
model.fit(X_train, y_train)

# Hard class labels for the confusion matrix.
y_pred = model.predict(X_val)

# Probability of the second class in model.classes_ (presumably Survived == 1
# — verify), used for the ROC and precision-recall curves.
val_class_probabilities = model.predict_proba(X_val)
y_proba = val_class_probabilities[:, 1]

# ============================================================
# 8. Confusion Matrix
# ============================================================

# Counts of true vs. predicted labels on the validation partition.
cm = confusion_matrix(y_val, y_pred)

# plot() returns the display itself, so construction and drawing are chained.
disp = ConfusionMatrixDisplay(confusion_matrix=cm).plot(cmap="Blues")

# Title goes on the axes the display has just drawn into.
disp.ax_.set_title("Confusion Matrix (Validation Set)")
plt.show()

# ============================================================
# 9. ROC Curve
# ============================================================

# ROC points and the area under the curve, both from the validation
# probabilities (thresholds are not needed here).
fpr, tpr, _ = roc_curve(y_val, y_proba)
auc = roc_auc_score(y_val, y_proba)

roc_fig, roc_ax = plt.subplots(figsize=(6, 5))
roc_ax.plot(fpr, tpr, label=f"AUC = {auc:.3f}")
# Diagonal reference line: the ROC of a classifier that guesses at random.
roc_ax.plot([0, 1], [0, 1], "--", color="gray")
roc_ax.set_xlabel("False Positive Rate")
roc_ax.set_ylabel("True Positive Rate")
roc_ax.set_title("ROC Curve (Validation Set)")
roc_ax.legend()
plt.show()

# ============================================================
# 10. Precision–Recall Curve
# ============================================================

# Precision/recall pairs across all probability thresholds on the validation
# partition (the thresholds themselves are not used).
precision, recall, _ = precision_recall_curve(y_val, y_proba)

pr_fig, pr_ax = plt.subplots(figsize=(6, 5))
pr_ax.plot(recall, precision)
pr_ax.set_xlabel("Recall")
pr_ax.set_ylabel("Precision")
pr_ax.set_title("Precision–Recall Curve")
plt.show()

# ============================================================
# 11. Train final model & extract feature importance
# ============================================================

# Refit the same pipeline on every labeled row (training and validation
# partitions together) so the coefficients below and the test predictions
# come from a model that has seen all available labels.
model.fit(X, y)

import numpy as np
import pandas as pd

# ============================================================
# Extract feature names after preprocessing
# ============================================================

# Fitted ColumnTransformer taken from the refit pipeline.
preprocess = model.named_steps["preprocess"]

# One-Hot Encoder: expanded names, one per (column, category) pair
ohe = preprocess.named_transformers_["cat"]
ohe_features = ohe.get_feature_names_out(categorical_cols)

# Scaled numeric feature names (same order as input)
numeric_features = numeric_cols

# Combined feature list. The order must mirror the transformer order declared
# in the ColumnTransformer ("cat" first, then "num"), because that is the
# column order the classifier was fitted on.
all_features = np.concatenate([ohe_features, numeric_features])

# ============================================================
# Extract Logistic Regression coefficients
# ============================================================

logreg = model.named_steps["logreg"]
# coef_ is 2-D; flatten it to one weight per transformed feature.
coefs = logreg.coef_.ravel()

# Guard against pairing coefficients with the wrong names: if the reconstructed
# name list and the weight vector disagree in length, stop with a clear message
# instead of the generic length error raised by the DataFrame constructor.
if len(all_features) != len(coefs):
    raise ValueError(
        f"Feature name count ({len(all_features)}) does not match "
        f"coefficient count ({len(coefs)})"
    )

# Rank features by coefficient magnitude, largest first.
coef_df = (
    pd.DataFrame({
        "feature": all_features,
        "coef": coefs,
        "abs_coef": np.abs(coefs)
    })
    .sort_values("abs_coef", ascending=False)
    .reset_index(drop=True)
)

# Show the strongest + weakest signals
print("Top 15 features:\n", coef_df.head(15), "\n")
print("Bottom 15 features:\n", coef_df.tail(15), "\n")

# Save to CSV. The output directory is created first so that a fresh checkout
# without ../submissions does not fail after all the training work is done.
feature_importance_path = "../submissions/feature_importances_lr.csv"
Path(feature_importance_path).parent.mkdir(parents=True, exist_ok=True)
coef_df.to_csv(feature_importance_path, index=False)

# ============================================================
# 12. Predict test set & create submission
# ============================================================

# Predict labels for the unlabeled test rows with the fitted pipeline.
test_predictions = model.predict(X_test_final)

# Pair each prediction with the PassengerId set aside before the id column
# was dropped from the test features.
submission = pd.DataFrame({
    "PassengerId": test_passenger_ids,
    "Survived": test_predictions
})

submission_path = "../submissions/submission_engineered_baseline_scaled.csv"
# Create the output directory if needed; writing the CSV into a directory
# that does not exist would otherwise raise and lose the submission.
Path(submission_path).parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False)

print(f"Ready for Kaggle: {submission_path}")
print("Feature coefficients saved to: ../submissions/feature_importances_lr.csv")