In [1]:
%load_ext autoreload
%autoreload 2

In [39]:
import os
import pandas as pd
import seaborn as sns
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import category_encoders as ce

from neural import run_wide_and_deep_model, run_deep_and_cross_model, run_grn_and_vsn_model, run_tabnet_model, run_tabnet_model_hyperopt

from hyperopt import hp, space_eval
from sklearn.experimental import enable_iterative_imputer  # noqa
from outliers import detect_outliers_isolation, detect_outliers_elliptic
from sklearn.impute import IterativeImputer, SimpleImputer, KNNImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler, PowerTransformer, SplineTransformer, RobustScaler, MaxAbsScaler, MinMaxScaler, QuantileTransformer, Normalizer
from preprocessing import process_dataset, get_where_all_are_non_null, expand_cabin, expand_name, expand_passenger_id_to_group, impute_missing_values, fill_age_column, fill_missing_categorical_columns

sns.set_theme()

DATASET_PATH = "/Users/dincaus/datasets/spaceship-titanic"

TRAIN_PATH = os.path.join(DATASET_PATH, "train.csv")
TEST_PATH = os.path.join(DATASET_PATH, "test.csv")

In [3]:
print(tf.config.list_physical_devices())
# tf.config.set_visible_devices([], 'GPU')

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [4]:
train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)

In [51]:
all_data = pd.concat([train_df, test_df], axis=0)

all_data[["Deck", "CabinNumber", "Side"]] = expand_cabin(all_data)
all_data["PassengerGroup"] = expand_passenger_id_to_group(all_data)
all_data[["FirstName", "LastName"]] = expand_name(all_data)

all_data = all_data.drop(columns=["Name", "Cabin"]).reset_index(drop=True)

In [52]:
last_name_mapping = pd.concat([all_data.groupby(by="LastName")["LastName"].count().rename("Count"), pd.Series([all_data.groupby(by="LastName")["LastName"].count().rename("Count").mode()[0]], index=[np.nan])])

all_data["FamilySize"] = all_data["LastName"].apply(lambda x: last_name_mapping[x])

all_data = all_data.drop(columns= ["FirstName", "LastName"])
all_data["HasMissing"] = all_data.isna().any(axis=1)
all_data["NumberMissing"] = all_data.isnull().sum(axis=1)

In [53]:
NUMERICAL_FEATURES = ["Age", "Spa", "VRDeck", "RoomService", "FoodCourt", "ShoppingMall", ]
BOOL_FEATURES = ["VIP", "CryoSleep", ]
CATEGORICAL_FEATURES = ["Deck", "Side", "HomePlanet", "Destination", ]

LABEL_ENCODERS = {}

for cat_col in BOOL_FEATURES + CATEGORICAL_FEATURES:

    if cat_col in BOOL_FEATURES:
        all_data[cat_col] = SimpleImputer(strategy="most_frequent").fit_transform(all_data[cat_col].to_numpy().reshape(-1, 1))
    else:
        all_data[cat_col] = SimpleImputer(strategy="constant", fill_value="missing").fit_transform(all_data[cat_col].to_numpy().reshape(-1, 1))

    if cat_col in CATEGORICAL_FEATURES:
        LABEL_ENCODERS[cat_col] = LabelEncoder()
        all_data[cat_col] = LABEL_ENCODERS[cat_col].fit_transform(all_data[cat_col].to_numpy())

In [54]:
# preprocess variables
for cat_feat in BOOL_FEATURES:
    all_data[cat_feat] = all_data[cat_feat].astype(np.int64)

for num_feat in NUMERICAL_FEATURES:
    all_data[num_feat] = SimpleImputer(strategy="mean").fit_transform(all_data[num_feat].to_numpy().reshape(-1, 1))
    all_data[num_feat] = StandardScaler().fit_transform(all_data[num_feat].to_numpy().reshape(-1, 1))

In [55]:
train_data = all_data.loc[(~all_data["Transported"].isna()) & (all_data["NumberMissing"] <= 5)]
train_data["Transported"] = train_data["Transported"].astype(np.int32)

train_data = ce.WOEEncoder(cols=["HomePlanet", "Destination", "Deck", "Side"]).fit_transform(train_data, train_data["Transported"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data["Transported"] = train_data["Transported"].astype(np.int32)


In [67]:
with tf.device('/cpu:0'):
    # spaceship_predictor, [history, evaluation_results, accuracy_score, roc_auc_score] = run_tabnet_model(
    #     train_df=train_data,
    #     feature_columns=NUMERICAL_FEATURES + CATEGORICAL_FEATURES + BOOL_FEATURES,
    #     label_cols=["Transported"],
    #     feature_dim=128,
    #     output_dim=128,
    #     n_step=2,
    #     n_shared=2,
    #     relaxation_factor=2.2,
    #     sparsity_coefficient=2.37e-07,
    #     bn_momentum=0.9245,
    #     learning_rate=1e-3,
    #     epochs=1_000,
    #     batch_size=256,
    #     test_size=0.1,
    #     shuffle=True
    # )
    # print(f"Accuracy: {accuracy_score}")
    # print(f"ROC Auc: {roc_auc_score}")

    search_space_hyperopt = {
        "feature_dim": hp.choice("feature_dim", [32, 64, 128, 256, 512]),
        "n_step": hp.choice("n_step", range(2, 9, 1)),
        "relaxation_factor": hp.choice("relaxation_factor", np.arange(1., 3., 0.1)),
        "sparsity_coefficient": hp.uniform("sparsity_coefficient", 0.00000001, 0.1),
        "n_shared": hp.choice("n_shared", np.arange(0, 4, 1)),
        "bn_momentum": hp.uniform("bn_momentum", 0.9, 0.9999),
        "learning_rate": 1e-3
    }
    best_params, train_data = run_tabnet_model_hyperopt(
        train_df=train_data,
        feature_columns=NUMERICAL_FEATURES + CATEGORICAL_FEATURES + BOOL_FEATURES,
        label_cols=["Transported"],
        search_space_params=search_space_hyperopt,
        epochs=5,
        batch_size=256,
        number_iterations=50,
        test_size=0.1,
        shuffle=True,
        verbose=2
    )

    best_params_eval = space_eval(search_space_hyperopt, best_params)
    print(f"Best Tabnet parameters found via hyperopt: {best_params_eval}")

31/31 - 70s - loss: 0.7081 - output_1_loss: 0.6985 - val_loss: 0.6759 - val_output_1_loss: 0.6568 - 70s/epoch - 2s/step

Epoch 2/5                                                                            

31/31 - 46s - loss: 0.5936 - output_1_loss: 0.5832 - val_loss: 0.5935 - val_output_1_loss: 0.5772 - 46s/epoch - 1s/step

Epoch 3/5                                                                            

31/31 - 49s - loss: 0.5402 - output_1_loss: 0.5303 - val_loss: 0.5155 - val_output_1_loss: 0.5039 - 49s/epoch - 2s/step

Epoch 4/5                                                                            

31/31 - 46s - loss: 0.5272 - output_1_loss: 0.5173 - val_loss: 0.5100 - val_output_1_loss: 0.4997 - 46s/epoch - 1s/step

Epoch 5/5                                                                            

31/31 - 43s - loss: 0.5323 - output_1_loss: 0.5221 - val_loss: 0.4982 - val_output_1_loss: 0.4871 - 43s/epoch - 1s/step

 1/28 [>.............................] - ETA: 1

Traceback (most recent call last):
  File "/Users/dincaus/miniconda3/envs/tf_metal_acc/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3398, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/jy/0x3dtb595998ft_1332l_k780000gn/T/ipykernel_74275/476633272.py", line 31, in <cell line: 1>
    best_params, train_data = run_tabnet_model_hyperopt(
  File "/Users/dincaus/DataspellProjects/tf_keras_2_0/spaceship-titanic/neural.py", line 883, in run_tabnet_model_hyperopt
    best = fmin(objective, search_space_params, algo=tpe.suggest, max_evals=number_iterations)
  File "/Users/dincaus/miniconda3/envs/tf_metal_acc/lib/python3.8/site-packages/hyperopt/fmin.py", line 586, in fmin
    rval.exhaust()
  File "/Users/dincaus/miniconda3/envs/tf_metal_acc/lib/python3.8/site-packages/hyperopt/fmin.py", line 364, in exhaust
    self.run(self.max_evals - n_done, block_until_done=self.asynchronous)
  File "/Users/dincaus/miniconda3/envs/tf_metal_acc/l

In [None]:
best_params_eval = {
    'bn_momentum': 0.94,
    'feature_dim': 256,
    'learning_rate': 0.01,
    'n_shared': 2,
    'n_step': 3,
    'relaxation_factor': 0.9,
    'sparsity_coefficient': 1e-5
}
best_params_eval["output_dim"] = best_params_eval["feature_dim"]

with tf.device('/cpu:0'):
    spaceship_predictor, [history, evaluation_results, accuracy_score, roc_auc_score] = run_tabnet_model(
        train_df=train_data,
        feature_columns=NUMERICAL_FEATURES + CATEGORICAL_FEATURES + BOOL_FEATURES,
        label_cols=["Transported"],
        **best_params_eval,
        # learning_rate=1e-3,
        epochs=1_000,
        batch_size=256,
        test_size=0.1,
        shuffle=True
    )

    print(f"Accuracy: {accuracy_score}")
    print(f"ROC Auc: {roc_auc_score}")

In [64]:
test_data = all_data[all_data["Transported"].isna()]

test_predictions, _ = spaceship_predictor.predict(test_data[NUMERICAL_FEATURES + CATEGORICAL_FEATURES + BOOL_FEATURES].to_numpy())
y_predictions = test_predictions.argmax(axis=-1)

test_results = pd.concat([test_data["PassengerId"].reset_index(drop=True), pd.DataFrame(y_predictions, columns=["Transported"])], axis=1)
test_results["Transported"] = test_results["Transported"].astype(bool)
test_results.to_csv("submissionNN.csv", index=False)

2022-09-14 12:05:05.198331: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




In [65]:
!kaggle competitions submit -c spaceship-titanic -f submissionNN.csv -m "second submit"

100%|██████████████████████████████████████| 55.4k/55.4k [00:01<00:00, 30.8kB/s]
Successfully submitted to Spaceship Titanic