## imports and data loading

In [2]:
import os
!pip install --upgrade numpy

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
!pip3 install catboost

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    confusion_matrix,
    classification_report,
    roc_auc_score,
)

import lightgbm as lgb
from catboost import CatBoostClassifier

# basic settings: default figure size and grid lines for every matplotlib figure
plt.rcParams.update({
    "figure.figsize": (8, 5),
    "axes.grid": True,
})

def set_seed(seed: int = 42):
    """Seed the global random number generators used by this notebook.

    Both Python's built-in ``random`` module and NumPy's legacy global
    generator are seeded, so code drawing from either source is repeatable.

    Args:
        seed: Integer seed applied to every generator.
    """
    import random  # local import keeps the helper self-contained

    random.seed(seed)
    np.random.seed(seed)

set_seed(42)

# directories searched, in priority order, for the cleaned data (v2 first, then v1)
POSSIBLE_DIRS = [
    "from_kaggle/cleaned_data",
    "/kaggle/working",
    "/kaggle/input",
    "/mnt/data",
]

def find_clean_paths(search_dirs=None):
    """Locate the cleaned train/test CSV files, preferring the ``_v2`` files.

    Args:
        search_dirs: Optional iterable of directories to scan, in priority
            order. Defaults to ``POSSIBLE_DIRS``.

    Returns:
        Tuple ``(train_path, test_path)``. For each of the two files the first
        existing ``*_clean_v2.csv`` is returned; if none exists, the first
        existing ``*_clean.csv`` is returned instead.

    Raises:
        FileNotFoundError: If no train file or no test file exists in any of
            the searched directories.
    """
    if search_dirs is None:
        search_dirs = POSSIBLE_DIRS
    search_dirs = list(search_dirs)

    def _existing(file_names):
        # Candidates keep directory priority first, then v2-before-v1 within a directory.
        return [
            path
            for directory in search_dirs
            for path in (os.path.join(directory, name) for name in file_names)
            if os.path.exists(path)
        ]

    def _prefer_v2(candidates):
        # Look at the file name only: a directory such as ".../data_v2/" must not
        # make a v1 file look like a v2 file.
        return next(
            (path for path in candidates if "v2" in os.path.basename(path)),
            candidates[0],
        )

    train_candidates = _existing(["train_clean_v2.csv", "train_clean.csv"])
    test_candidates = _existing(["test_clean_v2.csv", "test_clean.csv"])
    if not train_candidates or not test_candidates:
        raise FileNotFoundError("could not find any train_clean*.csv / test_clean*.csv files")

    return _prefer_v2(train_candidates), _prefer_v2(test_candidates)

train_clean_path, test_clean_path = find_clean_paths()
print("using train_clean:", train_clean_path)
print("using test_clean :", test_clean_path)

df_train = pd.read_csv(train_clean_path)
df_test = pd.read_csv(test_clean_path)
print("train_clean shape:", df_train.shape)
print("test_clean shape :", df_test.shape)

target_col = "Transported"


def _encode_binary_target(labels):
    """Return ``labels`` as a Series of 0/1 integers.

    Object (string) columns are matched case-insensitively against
    "true"/"false" after stripping surrounding whitespace; every other dtype
    is cast with ``astype(int)``.

    Raises:
        ValueError: If a string label is neither "true" nor "false"
            (this includes missing values).
    """
    if labels.dtype != object:
        return labels.astype(int)

    normalized = labels.astype(str).str.strip().str.lower()
    encoded = normalized.map({"true": 1, "false": 0})
    unmapped = encoded.isna()
    if unmapped.any():
        # Without this check the NaNs left by `map` make `astype(int)` fail
        # with a message that does not name the offending labels.
        bad_values = sorted(set(normalized[unmapped]))
        raise ValueError(f"unexpected values in target column: {bad_values}")
    return encoded.astype(int)


# ensure target is 0/1 numeric
df_train[target_col] = _encode_binary_target(df_train[target_col])

# every non-target column of the training frame is used as a model feature
feature_cols = [c for c in df_train.columns if c != target_col]

# fail early, with a readable message, if the train and test files have diverging columns
missing_in_test = [c for c in feature_cols if c not in df_test.columns]
if missing_in_test:
    raise KeyError(f"test data is missing feature columns: {missing_in_test}")

X = df_train[feature_cols].values
y = df_train[target_col].values

X_test = df_test[feature_cols].values

print("number of features:", len(feature_cols))


Collecting numpy
  Downloading numpy-1.24.4-cp38-cp38-macosx_10_9_x86_64.whl.metadata (5.6 kB)
Downloading numpy-1.24.4-cp38-cp38-macosx_10_9_x86_64.whl (19.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.8/19.8 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.22.4
    Uninstalling numpy-1.22.4:
      Successfully uninstalled numpy-1.22.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
daal4py 2021.5.0 requires daal==2021.4.0, which is not installed.
numba 0.55.1 requires numpy<1.22,>=1.18, but you have numpy 1.24.4 which is incompatible.
tensorflow 2.13.0 requires keras<2.14,>=2.13.1, but you have keras 2.12.0 which is incompatible.
tensorflow 2.13.0 requires numpy<=1.24.3,>=1.22, but you have numpy 1.24.4

ImportError: numpy.core.multiarray failed to import

## train validation split

In [None]:
# stratified hold-out split: 20% of the rows go to validation, fixed random_state for repeatability
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# cell output: shapes of the two feature matrices
(X_train.shape, X_val.shape)


## helper functions

In [None]:
def plot_confusion_matrix(cm, class_names=("not transported", "transported"), title="confusion matrix"):
    """Draw a confusion matrix as an annotated heatmap and show it.

    Args:
        cm: 2-D array-like; rows are plotted along the "true label" axis and
            columns along the "predicted label" axis. Integer matrices are
            annotated verbatim, any other dtype with two decimals.
        class_names: Tick labels used on both axes.
        title: Figure title.
    """
    # Accept plain nested lists as well as ndarrays.
    cm = np.asarray(cm)
    # The "d" format spec rejects floats, so normalised matrices need their own format.
    cell_format = "d" if np.issubdtype(cm.dtype, np.integer) else ".2f"

    plt.figure(figsize=(4, 4))
    im = plt.imshow(cm, interpolation="nearest", cmap="Blues")
    plt.title(title)
    plt.colorbar(im, fraction=0.046, pad=0.04)
    tick_marks = np.arange(len(class_names))
    plt.xticks(tick_marks, class_names, rotation=45)
    plt.yticks(tick_marks, class_names)
    # "axes.grid" is switched on globally in this notebook; on a heatmap the grid
    # lines run through the cell centres and the annotations, so disable them here.
    plt.grid(False)

    # Cells darker than half of the maximum get white text so the number stays readable.
    thresh = cm.max() / 2.0
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            plt.text(
                j,
                i,
                format(cm[i, j], cell_format),
                ha="center",
                va="center",
                color="white" if cm[i, j] > thresh else "black",
            )
    plt.ylabel("true label")
    plt.xlabel("predicted label")
    plt.tight_layout()
    plt.show()


## lightgbm model

In [None]:
# wrap the split arrays in LightGBM datasets; the validation set references the training set
train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

# booster hyper-parameters
params_lgb = dict(
    objective="binary",
    metric="binary_logloss",
    learning_rate=0.05,
    num_leaves=31,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=1,
    min_data_in_leaf=20,
    verbosity=-1,
    seed=42,
)

# populated by the record_evaluation callback while training runs
evals_result_lgb = {}

callbacks_lgb = [
    lgb.early_stopping(stopping_rounds=100),
    lgb.log_evaluation(period=50),
    lgb.record_evaluation(evals_result_lgb),
]

# train for at most 2000 rounds, evaluating on both the training and validation sets
model_lgb = lgb.train(
    params=params_lgb,
    train_set=train_data,
    num_boost_round=2000,
    valid_sets=[train_data, val_data],
    valid_names=["train", "valid"],
    callbacks=callbacks_lgb,
)

print("lightgbm best iteration:", model_lgb.best_iteration)


## lightgbm validation performance

In [3]:

# validation predictions: probabilities from the best iteration, thresholded at 0.5
val_probs_lgb = model_lgb.predict(X_val, num_iteration=model_lgb.best_iteration)
val_preds_lgb = np.greater_equal(val_probs_lgb, 0.5).astype(int)

# hard-label metrics use the thresholded predictions, AUC uses the raw probabilities
cm_lgb = confusion_matrix(y_val, val_preds_lgb)
print("lightgbm confusion matrix:\n", cm_lgb)
print(
    "\nlightgbm classification report:\n",
    classification_report(y_val, val_preds_lgb, digits=4),
)
print("lightgbm val auc:", roc_auc_score(y_val, val_probs_lgb))

plot_confusion_matrix(cm_lgb, title="lightgbm validation confusion matrix")


NameError: name 'model_lgb' is not defined

## catboost model

In [None]:

# all features are numeric in the cleaned data, so no need to mark cat features
model_cb = CatBoostClassifier(
    iterations=3000,
    learning_rate=0.05,
    depth=8,
    l2_leaf_reg=3.0,
    loss_function="Logloss",
    eval_metric="Logloss",
    od_type="Iter",
    od_wait=80,
    random_seed=42,
    verbose=200,
)

# fit on the training split, using the validation split as the evaluation set
model_cb.fit(
    X_train,
    y_train,
    eval_set=(X_val, y_val),
    use_best_model=True,
)


## catboost validation performance

In [None]:

# positive-class probability is column 1 of predict_proba; threshold at 0.5 for hard labels
val_probs_cb = model_cb.predict_proba(X_val)[:, 1]
val_preds_cb = np.greater_equal(val_probs_cb, 0.5).astype(int)

# hard-label metrics use the thresholded predictions, AUC uses the raw probabilities
cm_cb = confusion_matrix(y_val, val_preds_cb)
print("catboost confusion matrix:\n", cm_cb)
print(
    "\ncatboost classification report:\n",
    classification_report(y_val, val_preds_cb, digits=4),
)
print("catboost val auc:", roc_auc_score(y_val, val_probs_cb))

plot_confusion_matrix(cm_cb, title="catboost validation confusion matrix")


## ensemble validation performance

In [None]:

# simple average ensemble of lgbm and catboost probabilities (equal weights)
val_probs_ens = 0.5 * val_probs_lgb + 0.5 * val_probs_cb
val_preds_ens = np.greater_equal(val_probs_ens, 0.5).astype(int)

# hard-label metrics use the thresholded predictions, AUC uses the blended probabilities
cm_ens = confusion_matrix(y_val, val_preds_ens)
print("ensemble confusion matrix:\n", cm_ens)
print(
    "\nensemble classification report:\n",
    classification_report(y_val, val_preds_ens, digits=4),
)
print("ensemble val auc:", roc_auc_score(y_val, val_probs_ens))

plot_confusion_matrix(cm_ens, title="ensemble validation confusion matrix")


## test predictions and submission

In [None]:

# predict on test features for each model (positive-class probabilities)
test_probs_cb = model_cb.predict_proba(X_test)[:, 1]
test_probs_lgb = model_lgb.predict(X_test, num_iteration=model_lgb.best_iteration)

# equal-weight blend, then threshold at 0.5 to obtain 0/1 labels
test_probs_ens = 0.5 * test_probs_lgb + 0.5 * test_probs_cb
test_preds_ens = np.greater_equal(test_probs_ens, 0.5).astype(int)

# load original test.csv to get passenger ids in correct format
# (directories are probed in the order listed)
RAW_DIRS = [
    "/kaggle/input/spaceship-titanic",
    ".",
    "/kaggle/working",
    "/kaggle/input",
    "/mnt/data",
]

def find_raw_test_path():
    """Return the path of the first ``test.csv`` found under ``RAW_DIRS``.

    Raises:
        FileNotFoundError: If none of the directories contains ``test.csv``.
    """
    candidate_paths = (os.path.join(directory, "test.csv") for directory in RAW_DIRS)
    first_hit = next(filter(os.path.exists, candidate_paths), None)
    if first_hit is None:
        raise FileNotFoundError("could not find raw test.csv for passenger ids")
    return first_hit

# read the raw test file; only its PassengerId column is used below
raw_test_path = find_raw_test_path()
raw_test = pd.read_csv(raw_test_path)

# one row per passenger: id as string, ensemble prediction as boolean
submission = pd.DataFrame(
    {"PassengerId": raw_test["PassengerId"].astype(str)}
).assign(Transported=test_preds_ens.astype(bool))

submission_path = "ensemble_lgbm_catboost_submission.csv"
submission.to_csv(submission_path, index=False)
print("saved ensemble submission to:", submission_path)
submission.head()
