In [7]:
import pandas as pd

In [8]:
train = pd.read_csv('kaggle/input/spaceship-titanic/train.csv')
test = pd.read_csv('kaggle/input/spaceship-titanic/test.csv')

In [9]:
train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [10]:

# Split Cabin into deck, num, and side
for df in [train, test]:
    df['Cabin'] = df['Cabin'].fillna('Unknown/-1/U')
    df['Cabin_deck'] = df['Cabin'].apply(lambda x: x.split('/')[0])
    df['Cabin_num'] = df['Cabin'].apply(lambda x: x.split('/')[1])
    df['Cabin_side'] = df['Cabin'].apply(lambda x: x.split('/')[2])


In [11]:
train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Cabin_deck,Cabin_num,Cabin_side
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,B,0,P
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,F,0,S
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,A,0,S
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,A,0,S
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,F,1,S


In [12]:
train.Cabin_num.unique()


array(['0', '1', '2', ..., '1892', '1893', '1894'],
      shape=(1818,), dtype=object)

In [13]:
# Fill missing values and convert boolean columns explicitly
train['Age'] = train['Age'].fillna(train['Age'].median())
train['VIP'] = train['VIP'].fillna(False).astype(bool)
train['CryoSleep'] = train['CryoSleep'].fillna(False).astype(bool)
train['Cabin'] = train['Cabin'].fillna('Unknown')
train['Destination'] = train['Destination'].fillna('Unknown')
train['HomePlanet'] = train['HomePlanet'].fillna('Unknown')

test['Age'] = test['Age'].fillna(train['Age'].median())
test['VIP'] = test['VIP'].fillna(False).astype(bool)
test['CryoSleep'] = test['CryoSleep'].fillna(False).astype(bool)
test['Cabin'] = test['Cabin'].fillna('Unknown')
test['Destination'] = test['Destination'].fillna('Unknown')
test['HomePlanet'] = test['HomePlanet'].fillna('Unknown')

########################################################

# Combine train and test for consistent encoding, then split back
from sklearn.preprocessing import LabelEncoder

# Add a marker to split later
test['Transported'] = None  # Add dummy column to align columns
combined = pd.concat([train, test], sort=False, ignore_index=True)
#combined['CryoSleep_VIP'] = combined['CryoSleep'].astype(str) + '_' + combined['VIP'].astype(str)

# Encode categorical columns
le_home = LabelEncoder()
le_dest = LabelEncoder()
le_cabin_deck = LabelEncoder()
le_cabin_side = LabelEncoder()
#le_cryo_vip = LabelEncoder()

combined['HomePlanet_enc'] = le_home.fit_transform(combined['HomePlanet'])
combined['Destination_enc'] = le_dest.fit_transform(combined['Destination'])
combined['Cabin_deck_enc'] = le_cabin_deck.fit_transform(combined['Cabin_deck'])
combined['Cabin_side_enc'] = le_cabin_side.fit_transform(combined['Cabin_side'])
#combined['CryoSleep_VIP_enc'] = le_cryo_vip.fit_transform(combined['CryoSleep_VIP'])

# Split back into train and test
train = combined[combined['Transported'].notnull()].copy()
test = combined[combined['Transported'].isnull()].copy()

test.drop(columns=['Transported'], inplace=True)  # Remove dummy column from test

  train['VIP'] = train['VIP'].fillna(False).astype(bool)
  train['CryoSleep'] = train['CryoSleep'].fillna(False).astype(bool)
  test['VIP'] = test['VIP'].fillna(False).astype(bool)
  test['CryoSleep'] = test['CryoSleep'].fillna(False).astype(bool)


In [14]:
print(train['Transported'].value_counts())

Transported
True     4378
False    4315
Name: count, dtype: int64


In [15]:
transported = train[train['Transported'] == True]
not_transported = train[train['Transported'] == False]

In [16]:
print("--- TRANSPORTED ---")
print(f"room service: {transported['RoomService'].mean()}")
print(f"food court: {transported['FoodCourt'].mean()}")
print(f"shopping mall: {transported['ShoppingMall'].mean()}")
print(f"spa: {transported['Spa'].mean()}")
print(f"vr deck: {transported['VRDeck'].mean()}")
print(f"age: {transported['Age'].mean()}")
print(f"vip: {transported['VIP'].value_counts()}")
print(f"cryo sleep: {transported['CryoSleep'].value_counts()}")

--- TRANSPORTED ---
room service: 63.09802095459837
food court: 532.6919841084366
shopping mall: 179.82997185741087
spa: 61.675530674131096
vr deck: 69.1481308411215
age: 27.733439926907263
vip: VIP
False    4302
True       76
Name: count, dtype: int64
cryo sleep: CryoSleep
True     2483
False    1895
Name: count, dtype: int64


In [17]:
print("--- NOT TRANSPORTED ---")
print(f"room service: {not_transported['RoomService'].mean()}")
print(f"food court: {not_transported['FoodCourt'].mean()}")
print(f"shopping mall: {not_transported['ShoppingMall'].mean()}")
print(f"spa: {not_transported['Spa'].mean()}")
print(f"vr deck: {not_transported['VRDeck'].mean()}")
print(f"age: {not_transported['Age'].mean()}")
print(f"vip: {not_transported['VIP'].value_counts()}")
print(f"cryo sleep: {not_transported['CryoSleep'].value_counts()}")



--- NOT TRANSPORTED ---
room service: 389.2660659236424
food court: 382.61593004017965
shopping mall: 167.5662165363658
spa: 564.3826663509353
vr deck: 543.6298224852071
age: 29.862572421784474
vip: VIP
False    4192
True      123
Name: count, dtype: int64
cryo sleep: CryoSleep
False    3761
True      554
Name: count, dtype: int64


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Example: Feature engineering
# Take out FoodCourt and ShoppingMall
spend_cols = ['RoomService','Spa', 'VRDeck']
train['TotalSpend'] = train[spend_cols].fillna(0).sum(axis=1)
train['NoSpend'] = (train[spend_cols].fillna(0).sum(axis=1) == 0).astype(int)
test['TotalSpend'] = test[spend_cols].fillna(0).sum(axis=1)
test['NoSpend'] = (test[spend_cols].fillna(0).sum(axis=1) == 0).astype(int)

# Prepare features and target
features = ['CryoSleep', 'VIP', 'TotalSpend', 'NoSpend', 'HomePlanet_enc', 'Destination_enc', 'Cabin_deck_enc', 'Cabin_side_enc']
X = train[features]
y = train['Transported'].astype(int)
X_test = test[features]

#model = RandomForestClassifier(random_state=42)
# {'subsample': 0.7, 'n_estimators': 100, 'max_depth': 7, 'learning_rate': 0.05, 'colsample_bytree': 0.8}

import xgboost as xgb
best_params = {
    'subsample': 0.7,
    'n_estimators': 100,
    'max_depth': 7,
    'learning_rate': 0.05,
    'colsample_bytree': 0.8,
    'use_label_encoder': False,
    'eval_metric': 'logloss',
    'random_state': 42
}
model = xgb.XGBClassifier(**best_params)
model.fit(X, y)

test['Transported'] = model.predict(X_test)
test['Transported'] = test['Transported'].astype(bool) # convert back


# used for tuning
"""from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb

param_dist = {
    'n_estimators': [100, 200, 300, 400],
    'max_depth': [3, 4, 5, 6, 7],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.7, 0.8, 0.9, 1.0]
}

xgb_clf = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
search = RandomizedSearchCV(xgb_clf, param_distributions=param_dist, n_iter=20, scoring='accuracy', cv=3, random_state=42)
search.fit(X, y)
print('Best parameters:', search.best_params_)
print('Best cross-validation score:', search.best_score_)"""



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


"from sklearn.model_selection import RandomizedSearchCV\nimport xgboost as xgb\n\nparam_dist = {\n    'n_estimators': [100, 200, 300, 400],\n    'max_depth': [3, 4, 5, 6, 7],\n    'learning_rate': [0.01, 0.05, 0.1, 0.2],\n    'subsample': [0.7, 0.8, 0.9, 1.0],\n    'colsample_bytree': [0.7, 0.8, 0.9, 1.0]\n}\n\nxgb_clf = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)\nsearch = RandomizedSearchCV(xgb_clf, param_distributions=param_dist, n_iter=20, scoring='accuracy', cv=3, random_state=42)\nsearch.fit(X, y)\nprint('Best parameters:', search.best_params_)\nprint('Best cross-validation score:', search.best_score_)"

In [19]:
#submission = test[['PassengerId', 'Transported']]
#submission.to_csv('submission.csv', index=False)

In [20]:
xgb_probs = model.predict_proba(X_test)[:, 1]

In [21]:
xgb_probs

array([0.70464486, 0.04068949, 0.99206173, ..., 0.9548554 , 0.65604943,
       0.7120803 ], shape=(4277,), dtype=float32)

In [22]:
import import_ipynb
import spaceship_titanic_catboost

In [23]:
%store -r catboost_probs
print(catboost_probs)

[0.69701264 0.04010205 0.99789125 ... 0.93278865 0.60660295 0.71814505]


In [24]:
ensemble_probs = (catboost_probs + xgb_probs) / 2

In [25]:
ensemble_probs

array([0.70082875, 0.04039577, 0.99497649, ..., 0.94382202, 0.63132619,
       0.71511267], shape=(4277,))

In [26]:
test['Transported'] = ensemble_probs > 0.5

In [27]:
submission = test[['PassengerId', 'Transported']]
submission.to_csv('submission_ensemble.csv', index=False)