In [1]:
%load_ext autoreload
%autoreload 2

In [362]:
import os
import pandas as pd
import seaborn as sns
import numpy as np
import tensorflow as tf
import autokeras as ak
import matplotlib.pyplot as plt

from dataset import dataframe_to_dataset
from neural import run_wide_and_deep_model, run_deep_and_cross_model, run_grn_and_vsn_model, run_tabnet_model

from sklearn.experimental import enable_iterative_imputer  # noqa
from outliers import detect_outliers_isolation, detect_outliers_elliptic
from sklearn.impute import IterativeImputer, SimpleImputer, KNNImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler, PowerTransformer, SplineTransformer, RobustScaler, MaxAbsScaler, MinMaxScaler, QuantileTransformer, Normalizer
from preprocessing import process_dataset, get_where_all_are_non_null, expand_cabin, expand_name, expand_passenger_id_to_group, impute_missing_values, fill_age_column, fill_missing_categorical_columns

sns.set_theme()

DATASET_PATH = "/Users/dincaus/datasets/spaceship-titanic"

TRAIN_PATH = os.path.join(DATASET_PATH, "train.csv")
TEST_PATH = os.path.join(DATASET_PATH, "test.csv")

In [3]:
print(tf.config.list_physical_devices())
tf.config.set_visible_devices([], 'GPU')

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [186]:
train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)

In [333]:
all_data = pd.concat([train_df, test_df], axis=0)

all_data[["Deck", "CabinNumber", "Side"]] = expand_cabin(all_data)
all_data["PassengerGroup"] = expand_passenger_id_to_group(all_data)
all_data[["FirstName", "LastName"]] = expand_name(all_data)

all_data = all_data.drop(columns=["Name", "Cabin"]).reset_index(drop=True)

In [334]:
last_name_mapping = pd.concat([all_data.groupby(by="LastName")["LastName"].count().rename("Count"), pd.Series([all_data.groupby(by="LastName")["LastName"].count().rename("Count").mode()[0]], index=[np.nan])])

all_data["FamilySize"] = all_data["LastName"].apply(lambda x: last_name_mapping[x])

all_data = all_data.drop(columns= ["FirstName", "LastName"])
all_data["HasMissing"] = all_data.isna().any(axis=1)
all_data["NumberMissing"] = all_data.isnull().sum(axis=1)

In [335]:
NUMERICAL_FEATURES = ["Age", "Spa", "VRDeck", "RoomService", "FoodCourt", "ShoppingMall", ]
BOOL_FEATURES = ["VIP", "CryoSleep", ]
CATEGORICAL_FEATURES = ["Deck", "Side", "HomePlanet", "Destination", ]
CATEGORICAL_FEATURES_INT = ["CabinNumber", ]

all_data[NUMERICAL_FEATURES] = KNNImputer(n_neighbors=301, weights="distance").fit_transform(all_data[NUMERICAL_FEATURES].to_numpy())

for cat_col in BOOL_FEATURES + CATEGORICAL_FEATURES:
    all_data[cat_col] = SimpleImputer(strategy="most_frequent").fit_transform(all_data[cat_col].to_numpy().reshape(-1, 1))

In [336]:
# preprocess variables
for cat_feat in BOOL_FEATURES:
    all_data[cat_feat] = all_data[cat_feat].astype(np.int64)

for num_feat in NUMERICAL_FEATURES:
    all_data[num_feat] = MinMaxScaler().fit_transform(np.log1p(all_data[num_feat].to_numpy()).reshape(-1, 1))

In [337]:
CATEGORICAL_FEATURES_WITH_VOCABULARY = {
    "HomePlanet": list(all_data["HomePlanet"].unique()),
    "Destination": list(all_data["Destination"].unique()),
    "Deck": list(all_data["Deck"].unique()),
    "Side": list(all_data["Side"].unique()),
    "CabinNumber": list(all_data["CabinNumber"].unique())
}

In [344]:
train_data = all_data.loc[(~all_data["Transported"].isna()) & (all_data["NumberMissing"] <= 4)]
train_data["Transported"] = train_data["Transported"].astype(np.int32)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data["Transported"] = train_data["Transported"].astype(np.int32)


In [350]:
# spaceship_predictor, [history, evaluation_results] = run_grn_and_vsn_model(
#     train_df=train_data,
#     numerical_features=NUMERICAL_FEATURES + BOOL_FEATURES,
#     categorical_features=CATEGORICAL_FEATURES,
#     categorical_features_int=CATEGORICAL_FEATURES_INT,
#     categorical_features_with_vocabulary=CATEGORICAL_FEATURES_WITH_VOCABULARY,
#     label_cols=["Transported", ],
#     encoding_size=32,
#     epochs=1000,
#     learning_rate=1e-4,
#     dropout_rate=0.4,
#     batch_size=512,
#     shuffle=True,
#     test_size=0.1
# )

# spaceship_predictor, [history, evaluation_results] = run_deep_and_cross_model(
#     train_df=train_data,
#     numerical_features=NUMERICAL_FEATURES,
#     categorical_features=CATEGORICAL_FEATURES,
#     categorical_features_int=[],
#     categorical_features_with_vocabulary=CATEGORICAL_FEATURES_WITH_VOCABULARY,
#     label_cols=["Transported", ],
#     hidden_units=[256, 256,],
#     encoding_size=16,
#     epochs=1000,
#     learning_rate=1e-4,
#     dropout_rate=0.4,
#     batch_size=512,
#     shuffle=True,
#     test_size=0.2
# )

# spaceship_predictor, [history, evaluation_results] = run_wide_and_deep_model(
#     train_df=train_data,
#     numerical_features=NUMERICAL_FEATURES + BOOL_FEATURES,
#     categorical_features=CATEGORICAL_FEATURES,
#     categorical_features_int=[],
#     categorical_features_with_vocabulary=CATEGORICAL_FEATURES_WITH_VOCABULARY,
#     label_cols=["Transported", ],
#     hidden_units=[512, ],
#     encoding_size=16,
#     epochs=1000,
#     learning_rate=1e-4,
#     dropout_rate=0.55,
#     batch_size=512,
#     shuffle=True,
#     test_size=0.2
# )

Model: "model_85"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 Deck (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 Side (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 HomePlanet (InputLayer)        [(None,)]            0           []                               
                                                                                                  
 Destination (InputLayer)       [(None,)]            0           []                               
                                                                                           

  return bool(asarray(a1 == a2).all())


 tf.expand_dims_765 (TFOpLambda  (None, 1)           0           ['ShoppingMall[0][0]']           
 )                                                                                                
                                                                                                  
 tf.expand_dims_766 (TFOpLambda  (None, 1)           0           ['VIP[0][0]']                    
 )                                                                                                
                                                                                                  
 tf.expand_dims_767 (TFOpLambda  (None, 1)           0           ['CryoSleep[0][0]']              
 )                                                                                                
                                                                                                  
 embedding_348 (Embedding)      (None, 16)           128         ['string_lookup_372[0][0]']      
          

In [235]:
test_data = all_data[all_data["Transported"].isna()]

test_ds = dataframe_to_dataset(test_data[NUMERICAL_FEATURES + BOOL_FEATURES + CATEGORICAL_FEATURES], shuffle=False)
test_ds_batch = test_ds.batch(1024)
predictions_batch_result = []

for x in test_ds_batch:
    predictions_batch_result.append(1 * (spaceship_predictor.predict(x) > 0.5))

y_predictions = np.concatenate(predictions_batch_result)
test_results = pd.concat([test_data["PassengerId"].reset_index(drop=True), pd.DataFrame(y_predictions, columns=["Transported"])], axis=1)
test_results["Transported"] = test_results["Transported"].astype(bool)
test_results.to_csv("submissionNN.csv", index=False)



In [236]:
!kaggle competitions submit -c spaceship-titanic -f submissionNN.csv -m "second submit"

100%|██████████████████████████████████████| 56.3k/56.3k [00:01<00:00, 42.5kB/s]
Successfully submitted to Spaceship Titanic