In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules
# (e.g. the local tree_models / preprocessing helpers) are re-imported before
# each cell execution and edits to them are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_probability as tfp

from hyperopt import hp, space_eval
from xgboost import plot_importance, plot_tree
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler, PowerTransformer, SplineTransformer, RobustScaler, MaxAbsScaler, MinMaxScaler, QuantileTransformer

from tree_models import run_ada_boost_classifier, run_random_forest_classifier, run_xgboost_classifier, run_xgboost_classifier_search_cv, run_xgboost_classifier_hyperopt, create_xgboost_classifier
from preprocessing import process_dataset, get_where_all_are_non_null, expand_cabin, expand_name, expand_passenger_id_to_group, impute_missing_values, fill_age_column, fill_missing_categorical_columns

# Apply seaborn's default theme to all figures drawn after this point.
sns.set_theme()

# Directory holding the competition CSV files. Set the SPACESHIP_TITANIC_DIR
# environment variable to point the notebook at another location; when it is
# unset, the original hard-coded directory is used as the fallback.
DATASET_PATH = os.environ.get(
    "SPACESHIP_TITANIC_DIR", "/Users/dincaus/datasets/spaceship-titanic"
)

# Full paths of the two input files read by the loading cell.
TRAIN_PATH = os.path.join(DATASET_PATH, "train.csv")
TEST_PATH = os.path.join(DATASET_PATH, "test.csv")

In [3]:
# Read the train and test CSV files into two DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH)
)

In [4]:
# Stack train and test rows into one frame so the feature engineering below is
# applied to both at once. Each file keeps its own row labels (no re-indexing).
all_data = pd.concat([train_df, test_df])

# Derive new columns with the project helpers; the raw text columns are dropped
# once they have been expanded.
all_data[["Deck", "CabinNumber", "Side"]] = expand_cabin(all_data)
all_data["PassengerGroup"] = expand_passenger_id_to_group(all_data)
all_data[["FirstName", "LastName"]] = expand_name(all_data)

all_data = all_data.drop(["Name", "Cabin"], axis=1)

In [5]:
# FamilySize = number of rows sharing a LastName. The extra NaN -> 1 entry makes
# rows with a missing last name resolve to a family of one instead of failing
# the lookup.
surname_counts = all_data.groupby(by="LastName")["LastName"].count()
last_name_count_mapping = pd.concat([surname_counts, pd.Series([1], index=[np.nan])])
all_data["FamilySize"] = all_data["LastName"].apply(
    lambda surname: last_name_count_mapping[surname]
)

# fill missing values

# categorical -- filled one column at a time, in this order
for column in ("HomePlanet", "Destination", "Deck", "Side", "VIP", "CryoSleep"):
    all_data[column] = fill_missing_categorical_columns(all_data, column)

# numerical
# NOTE(review): the return value is discarded, so fill_age_column presumably
# updates all_data in place -- confirm against preprocessing.py.
fill_age_column(all_data)

spending_columns = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
all_data[spending_columns] = impute_missing_values(
    all_data, spending_columns, n_neighbors=4
)

In [6]:
# Column groups that drive the encoding steps in the following cells.

# Label-encoded, and one-hot encoded when used as a model feature.
CATEGORICAL_COLUMNS = [
    "HomePlanet",
    "Destination",
    "Deck",
    "Side",
    "PassengerGroup",
]

# Scaled with a RobustScaler.
NUMERICAL_COLUMNS = [
    "Age",
    "RoomService",
    "FoodCourt",
    "ShoppingMall",
    "Spa",
    "VRDeck",
    "CabinNumber",
    "FamilySize",
]

# Cast to int32.
BOOLEAN_COLUMNS = [
    "VIP",
    "CryoSleep",
]

In [7]:
# Scale each numerical column independently with its own RobustScaler.
# The scaler is fitted on the combined train + test rows.
for numeric_col in NUMERICAL_COLUMNS:
    column_values = all_data[numeric_col].to_numpy().reshape(-1, 1)
    all_data[numeric_col] = RobustScaler().fit_transform(column_values)

# Replace every categorical column by integer codes. The fitted encoders are
# kept, keyed by column name, so the class lists remain available afterwards.
categorical_encoders = {}
for col in CATEGORICAL_COLUMNS:
    categorical_encoders[col] = LabelEncoder()
    all_data[col] = categorical_encoders[col].fit_transform(all_data[col])

# Boolean flags become 0/1 integers.
for col in BOOLEAN_COLUMNS:
    all_data[col] = all_data[col].astype(np.int32)

In [8]:
# Columns fed to the network (in this order) and the target column.
FEATURES = [
    'HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP',
    'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
    'Deck', 'Side', 'FamilySize'
]
LABELS = "Transported"


def build_model(n_features, learning_rate=1e-3):
    """Build and compile the binary classifier.

    The network consumes one flat feature vector per passenger: the stacked
    matrix produced from FEATURES, with categorical columns one-hot encoded.
    A single Input layer therefore describes the model completely. Earlier
    versions declared one Input per column but passed the concatenated tensor
    as the model input, so those per-column inputs were never part of the model.

    Args:
        n_features: width of the flat feature vector.
        learning_rate: Adam learning rate.

    Returns:
        A compiled ``tf.keras.Model`` producing one sigmoid probability per row.
    """
    features = tf.keras.layers.Input(shape=(n_features, ), name="features")

    x = tf.keras.layers.Dense(128, activation="relu")(features)
    x = tf.keras.layers.Dense(64, activation="relu")(x)
    x = tf.keras.layers.Dropout(0.5)(x)

    output = tf.keras.layers.Dense(1, activation="sigmoid")(x)

    network = tf.keras.Model(inputs=features, outputs=output)
    network.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss="binary_crossentropy",
        metrics=["accuracy"]
    )
    return network


# Input width: one column per numerical/boolean feature plus one column per
# known class of every categorical feature (its one-hot width).
N_INPUT_FEATURES = sum(
    len(categorical_encoders[feat].classes_) if feat in CATEGORICAL_COLUMNS else 1
    for feat in FEATURES
)

model = build_model(N_INPUT_FEATURES)

2022-08-30 16:26:34.546776: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [9]:
# Rows with a known target are the labelled (training) rows.
labelled_rows = all_data.loc[~all_data["Transported"].isna()]
train_x, train_y = labelled_rows[FEATURES], labelled_rows[LABELS]

train_np = []
for feat in FEATURES:
    if feat in CATEGORICAL_COLUMNS:
        # Pin the one-hot width to the encoder's full class count. Without
        # num_classes, to_categorical sizes the output from the largest label
        # present in *this* subset, so the train and test matrices could end up
        # with different widths when a split lacks the highest class.
        result = tf.keras.utils.to_categorical(
            train_x[feat],
            num_classes=len(categorical_encoders[feat].classes_),
        )
    elif feat in NUMERICAL_COLUMNS or feat in BOOLEAN_COLUMNS:
        result = train_x[feat]
    else:
        print(f"Unknown feature column {feat}")
        continue

    train_np.append(result)

# Flat 2-D feature matrix (columns in FEATURES order) and integer target vector.
train_x = np.column_stack(train_np)
train_y = train_y.astype(np.int32).to_numpy()

In [10]:
N_SPLITS = 5
EPOCHS = 200
CV_SEED = 42  # fixed so the shuffled fold assignment is reproducible


def fresh_model_like(template):
    """Return an untrained, compiled copy of ``template``.

    ``clone_model`` rebuilds the architecture with newly initialised weights.
    The compile settings mirror the ones used when ``model`` was defined.
    """
    clone = tf.keras.models.clone_model(template)
    clone.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
        loss="binary_crossentropy",
        metrics=["accuracy"]
    )
    return clone


skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=CV_SEED)
evaluate_results = []

for train_idx, test_idx in skf.split(train_x, train_y):
    x_train, x_test = train_x[train_idx], train_x[test_idx]
    y_train, y_test = train_y[train_idx], train_y[test_idx]

    # Every fold starts from fresh weights. Reusing one model across folds
    # would evaluate later folds on rows the weights were already fitted on,
    # which inflates the reported accuracy.
    fold_model = fresh_model_like(model)
    fold_model.fit(x_train, y_train, epochs=EPOCHS)

    evaluate_result = fold_model.evaluate(x_test, y_test)
    evaluate_results.append(evaluate_result[1])  # index 1 = accuracy metric

print(evaluate_results)

# Final model for the prediction cell: refit from scratch on all labelled rows.
# Rebinding `model` to a fresh clone keeps this cell idempotent on re-run.
model = fresh_model_like(model)
model.fit(train_x, train_y, epochs=EPOCHS)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [11]:
# Rows without a target value are the ones to predict; keep their feature
# columns and, separately, their passenger ids for the submission file.
unlabelled_rows = all_data.loc[all_data["Transported"].isna()]
test_data_df = unlabelled_rows[FEATURES]
test_passenger_ids = unlabelled_rows["PassengerId"]

In [12]:
test_np = []
for feat in FEATURES:
    if feat in CATEGORICAL_COLUMNS:
        # Pin the one-hot width to the encoder's full class count so this matrix
        # always has the same layout as the training matrix, even when the
        # highest-coded class does not occur among the rows to predict.
        result = tf.keras.utils.to_categorical(
            test_data_df[feat],
            num_classes=len(categorical_encoders[feat].classes_),
        )
    elif feat in NUMERICAL_COLUMNS or feat in BOOLEAN_COLUMNS:
        result = test_data_df[feat]
    else:
        print(f"Unknown feature column {feat}")
        continue

    test_np.append(result)

# Flat 2-D feature matrix with columns in FEATURES order.
test_x = np.column_stack(test_np)

In [13]:
# Threshold the sigmoid outputs at 0.5 to get 0/1 class predictions.
y_predictions = 1 * (model.predict(test_x) >= 0.5)

# Pair ids and predictions by position. Concatenating along axis=1 would align
# on index labels instead, which is only correct while the filtered id Series
# happens to carry labels 0..n-1.
test_results = pd.DataFrame({
    "PassengerId": test_passenger_ids.to_numpy(),
    "Transported": y_predictions.ravel().astype(bool),
})
test_results.to_csv("submissionNN.csv", index=False)

