In [149]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import tensorflow_decision_forests as tfdf
from sklearn.model_selection import train_test_split

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/spaceship-titanic/sample_submission.csv
/kaggle/input/spaceship-titanic/train.csv
/kaggle/input/spaceship-titanic/test.csv


In [150]:
# Training split (includes the "Transported" target); preview the first rows.
train_csv_path = "/kaggle/input/spaceship-titanic/train.csv"
train_data = pd.read_csv(train_csv_path)
train_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [151]:
# Test split (same columns as train, minus the target); preview the first rows.
test_csv_path = "/kaggle/input/spaceship-titanic/test.csv"
test_data = pd.read_csv(test_csv_path)
test_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [152]:
# Sample submission file: shows the expected output columns.
sample_submission_path = "/kaggle/input/spaceship-titanic/sample_submission.csv"
samp = pd.read_csv(sample_submission_path)
samp.head()

Unnamed: 0,PassengerId,Transported
0,0013_01,False
1,0018_01,False
2,0019_01,False
3,0021_01,False
4,0023_01,False


In [153]:
# Class balance of the target, as absolute counts and as percentages.
target = train_data['Transported']
counts = target.value_counts()
freqs = target.value_counts(normalize=True)
print("Counts:\n", counts)
print("\nPercentages:\n", freqs * 100)

Counts:
 Transported
True     4378
False    4315
Name: count, dtype: int64

Percentages:
 Transported
True     50.362361
False    49.637639
Name: proportion, dtype: float64


In [154]:
# Number of missing values per column, most incomplete columns first.
null_mask = train_data.isna()
missing_counts = null_mask.sum().sort_values(ascending=False)
missing_counts

CryoSleep       217
ShoppingMall    208
VIP             203
HomePlanet      201
Name            200
Cabin           199
VRDeck          188
FoodCourt       183
Spa             183
Destination     182
RoomService     181
Age             179
PassengerId       0
Transported       0
dtype: int64

In [155]:
def preprocess_data(data):
    """Return a cleaned copy of a passenger frame with the cabin split into parts.

    Parameters
    ----------
    data : pd.DataFrame
        Frame containing at least the "Name" and "Cabin" columns.

    Returns
    -------
    pd.DataFrame
        A new frame (the input is left untouched) in which

        * "Name" has its whitespace collapsed and surrounding punctuation
          stripped from every token; missing names become "Unknown".
        * "Deck", "CabinNum" and "Side" are appended, parsed from the
          "deck/num/side" string in "Cabin". The "Cabin" column itself is kept.
    """
    data = data.copy()  # never mutate the caller's frame

    def normalize_name(x):
        # Missing names get an explicit placeholder instead of NaN.
        if pd.isna(x):
            return "Unknown"
        parts = str(x).split()
        return " ".join(p.strip(' ,()."\'') for p in parts)

    data["Name"] = data["Name"].apply(normalize_name)

    cabin_split = (
        data["Cabin"]
          # Placeholder for missing cabins. Note that it parses to CabinNum 0,
          # which is indistinguishable from a real cabin number 0.
          .fillna("Unknown/0/X")
          .astype(str)
          .str.split("/", expand=True)
    )
    cabin_split.columns = ["Deck", "CabinNum", "Side"]
    # Non-numeric cabin numbers become NaN rather than raising.
    cabin_split["CabinNum"] = pd.to_numeric(cabin_split["CabinNum"], errors="coerce")
    data = pd.concat([data, cabin_split], axis=1)

    return data

In [156]:
# Preprocess both splits, then discard the raw "Cabin" string now that it has
# been split into Deck / CabinNum / Side.
prep_train = preprocess_data(train_data).drop(columns=["Cabin"])
prep_test = preprocess_data(test_data).drop(columns=["Cabin"])
# Binary-encode the boolean flags CryoSleep and VIP as 1/0.
def ohe_data(data):
    """Return a copy of ``data`` with "CryoSleep" and "VIP" mapped to 1/0.

    Despite the name this is a binary (1/0) mapping, not a one-hot encoding.
    Values that are neither True nor False (e.g. missing entries) become NaN.

    Parameters
    ----------
    data : pd.DataFrame
        Frame containing the "CryoSleep" and "VIP" columns.

    Returns
    -------
    pd.DataFrame
        A new frame; the input is not modified, so re-running a cell that
        calls this function cannot corrupt the frame it was given.
    """
    encoded = data.copy()
    flag_to_int = {True: 1, False: 0}
    for column in ("CryoSleep", "VIP"):
        encoded[column] = encoded[column].map(flag_to_int)
    return encoded
# Encode the boolean flags on both splits, then turn the boolean target into 1/0.
ohe_train, ohe_test = ohe_data(prep_train), ohe_data(prep_test)
label_to_int = {True: 1, False: 0}
ohe_train["Transported"] = ohe_train["Transported"].map(label_to_int)
ohe_train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Deck,CabinNum,Side
0,0001_01,Europa,0.0,TRAPPIST-1e,39.0,0.0,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,0,B,0,P
1,0002_01,Earth,0.0,TRAPPIST-1e,24.0,0.0,109.0,9.0,25.0,549.0,44.0,Juanna Vines,1,F,0,S
2,0003_01,Europa,0.0,TRAPPIST-1e,58.0,1.0,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,0,A,0,S
3,0003_02,Europa,0.0,TRAPPIST-1e,33.0,0.0,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,0,A,0,S
4,0004_01,Earth,0.0,TRAPPIST-1e,16.0,0.0,303.0,70.0,151.0,565.0,2.0,Willy Santantines,1,F,1,S


In [158]:
# Hold out 20% of the labelled rows for validation, stratified on the target.
train_df, val_df = train_test_split(
    ohe_train,
    test_size=0.2,
    random_state=42,
    stratify=ohe_train["Transported"],
)

# Every column except the identifier, the name and the target is a model input.
NON_FEATURE_COLUMNS = ("PassengerId", "Name", "Transported")
FEATURE_COLUMNS = [
    column for column in ohe_train.columns
    if column not in NON_FEATURE_COLUMNS
]

def make_tf_dataset(df, label=True):
    """Convert a DataFrame into a TF-DF dataset restricted to FEATURE_COLUMNS.

    With ``label=True`` the "Transported" column is included and declared as
    the classification label; with ``label=False`` only the feature columns
    are passed and no label is set.
    """
    if not label:
        return tfdf.keras.pd_dataframe_to_tf_dataset(
            df[FEATURE_COLUMNS],
            label=None,
        )

    labelled_columns = FEATURE_COLUMNS + ["Transported"]
    return tfdf.keras.pd_dataframe_to_tf_dataset(
        df[labelled_columns],
        label="Transported",
        task=tfdf.keras.Task.CLASSIFICATION,
    )

# Labelled datasets for training and validation; the test split has no target.
train_ds, val_ds = (
    make_tf_dataset(split, label=True) for split in (train_df, val_df)
)
test_ds = make_tf_dataset(ohe_test, label=False)


# Gradient-boosted trees over every feature column, with a fixed random seed.
model = tfdf.keras.GradientBoostedTreesModel(
    features=[tfdf.keras.FeatureUsage(name=f) for f in FEATURE_COLUMNS],
    num_trees=200,
    max_depth=6,
    shrinkage=0.1,
    random_seed=42
)

model.fit(train_ds)

# Keras evaluate() only reports metrics registered through compile(), so
# register accuracy before scoring the hold-out set (this follows the TF-DF
# tutorial, which compiles after fit and before evaluate).
model.compile(metrics=["accuracy"])
eval_dict = model.evaluate(val_ds, return_dict=True)
# Show the hold-out metrics as the cell output instead of discarding them.
eval_dict

Use /tmp/tmph0_ro_17 as temporary training directory
Reading training dataset...


  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Training dataset read in 0:00:00.360568. Found 6954 examples.
Training model...


I0000 00:00:1750635400.968088      35 kernel.cc:782] Start Yggdrasil model training
I0000 00:00:1750635400.968124      35 kernel.cc:783] Collect training examples
I0000 00:00:1750635400.968137      35 kernel.cc:795] Dataspec guide:
column_guides {
  column_name_pattern: "^__LABEL$"
  type: CATEGORICAL
  categorial {
    min_vocab_frequency: 0
    max_vocab_count: -1
  }
}
column_guides {
  column_name_pattern: "^HomePlanet$"
}
column_guides {
  column_name_pattern: "^CryoSleep$"
}
column_guides {
  column_name_pattern: "^Destination$"
}
column_guides {
  column_name_pattern: "^Age$"
}
column_guides {
  column_name_pattern: "^VIP$"
}
column_guides {
  column_name_pattern: "^RoomService$"
}
column_guides {
  column_name_pattern: "^FoodCourt$"
}
column_guides {
  column_name_pattern: "^ShoppingMall$"
}
column_guides {
  column_name_pattern: "^Spa$"
}
column_guides {
  column_name_pattern: "^VRDeck$"
}
column_guides {
  column_name_pattern: "^Deck$"
}
column_guides {
  column_name_pattern:

Model trained in 0:00:01.005431
Compiling model...


I0000 00:00:1750635401.917810    3311 early_stopping.cc:54] Early stop of the training because the validation loss does not decrease anymore. Best valid-loss: 0.794082
I0000 00:00:1750635401.925970    3311 kernel.cc:926] Export model in log directory: /tmp/tmph0_ro_17 with prefix 0dfb9c67876c40db
I0000 00:00:1750635401.930153    3311 kernel.cc:944] Save model in resources
I0000 00:00:1750635401.931976      35 abstract_model.cc:914] Model self evaluation:
Task: CLASSIFICATION
Label: __LABEL
Loss (BINOMIAL_LOG_LIKELIHOOD): 0.794082

Accuracy: 0.788301  CI95[W][0 1]
ErrorRate: : 0.211699


Confusion Table:
truth\prediction
     1    2
1  263   87
2   65  303
Total: 718


I0000 00:00:1750635401.966816      35 quick_scorer_extended.cc:922] The binary was compiled without AVX2 support, but your CPU supports it. Enable it for faster model inference.
I0000 00:00:1750635401.967605      35 abstract_model.cc:1404] Engine "GradientBoostedTreesQuickScorerExtended" built


Model compiled.


In [162]:
# Score the test set, drop singleton dimensions, and threshold at 0.5.
probs = model.predict(test_ds)
p_transport = np.squeeze(probs)
preds = p_transport >= 0.5

# One row per test passenger, in the same order as the test frame.
submission = ohe_test[["PassengerId"]].assign(Transported=preds)
submission.to_csv("submission.csv", index=False)

