In [29]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore', category=UserWarning, module='xgboost')

In [30]:
# Read the labelled training split and the unlabelled test split of the
# Spaceship Titanic competition data (paths are relative to the working dir).
train, test = map(
    pd.read_csv,
    (
        'kaggle/input/spaceship-titanic/train.csv',
        'kaggle/input/spaceship-titanic/test.csv',
    ),
)

In [31]:
# Preview the first rows of the raw training frame to check the columns loaded.
train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [32]:
# Feature engineering: impute missing values, derive cabin / group / age /
# spending features identically for train and test, then label-encode the
# categorical columns on the combined frame so both splits share one mapping.
from sklearn.preprocessing import LabelEncoder

spend_cols = ['RoomService', 'Spa', 'VRDeck', 'FoodCourt', 'ShoppingMall']


def _add_features(df, age_fill):
    """Return a copy of ``df`` with missing values filled and derived columns added.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw passenger frame (train or test) as read from the competition CSV.
    age_fill : float
        Value used to impute missing ``Age``; the caller passes the *training*
        median for both splits so test data never influences the imputation.

    Returns
    -------
    pandas.DataFrame
        New frame; the input is not modified. Added columns, in order:
        Cabin_deck, Cabin_num, Cabin_side, Group, GroupSize, IsAlone, IsChild,
        IsTeen, IsSenior, TotalSpend, NoSpend.
    """
    out = df.copy()

    # Cabin is "deck/num/side"; missing cabins get a sentinel with the same shape
    # so the split below always yields three parts.
    out['Cabin'] = out['Cabin'].fillna('Unknown/-1/U')
    cabin_parts = out['Cabin'].str.split('/', n=2, expand=True)
    out['Cabin_deck'] = cabin_parts[0]
    out['Cabin_num'] = cabin_parts[1]  # kept as a string, not cast to int
    out['Cabin_side'] = cabin_parts[2]

    out['Age'] = out['Age'].fillna(age_fill)
    for flag_col in ('VIP', 'CryoSleep'):
        # Missing flags are treated as False.
        out[flag_col] = out[flag_col].fillna(False).astype(bool)
    for cat_col in ('Destination', 'HomePlanet'):
        out[cat_col] = out[cat_col].fillna('Unknown')

    # PassengerId looks like "gggg_pp"; the part before "_" is the travel group.
    out['Group'] = out['PassengerId'].astype(str).str.split('_').str[0]
    # Group size is counted within this frame only (train and test separately).
    out['GroupSize'] = out.groupby('Group')['PassengerId'].transform('count')
    out['IsAlone'] = (out['GroupSize'] == 1).astype(int)

    # Age is already imputed above, so these comparisons never see NaN.
    out['IsChild'] = (out['Age'] < 13).astype(int)
    out['IsTeen'] = ((out['Age'] >= 13) & (out['Age'] < 18)).astype(int)
    out['IsSenior'] = (out['Age'] >= 65).astype(int)

    # Missing spend amounts count as zero.
    out['TotalSpend'] = out[spend_cols].fillna(0).sum(axis=1)
    out['NoSpend'] = (out['TotalSpend'] == 0).astype(int)
    return out


# Compute the imputation value once, from the training split only.
age_median = train['Age'].median()
train = _add_features(train, age_median)
test = _add_features(test, age_median)

########################################################

# Combine train and test for consistent encoding, then split back.
# The test rows get an all-null 'Transported' column: it aligns the columns for
# the concat and doubles as the marker used to separate the splits afterwards.
combined = pd.concat(
    [train, test.assign(Transported=None)],
    sort=False,
    ignore_index=True,
)

# Encode categorical columns (one encoder per column, fitted on both splits)
le_home = LabelEncoder()
le_dest = LabelEncoder()
le_cabin_deck = LabelEncoder()
le_cabin_side = LabelEncoder()

combined['HomePlanet_enc'] = le_home.fit_transform(combined['HomePlanet'])
combined['Destination_enc'] = le_dest.fit_transform(combined['Destination'])
combined['Cabin_deck_enc'] = le_cabin_deck.fit_transform(combined['Cabin_deck'])
combined['Cabin_side_enc'] = le_cabin_side.fit_transform(combined['Cabin_side'])

# Split back into train and test. VIP / CryoSleep are bool in both inputs and
# stay bool through the concat, so no re-casting is needed here.
train = combined[combined['Transported'].notnull()].copy()
test = combined[combined['Transported'].isnull()].drop(columns=['Transported'])


In [33]:
# Preview the training frame again to confirm the engineered and encoded columns are present.
train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,IsAlone,IsChild,IsTeen,IsSenior,TotalSpend,NoSpend,HomePlanet_enc,Destination_enc,Cabin_deck_enc,Cabin_side_enc
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,...,1,0,0,0,0.0,1,1,2,1,0
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,...,1,0,0,0,736.0,0,0,2,5,1
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,...,0,0,0,0,10383.0,0,1,2,0,1
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,...,0,0,0,0,5176.0,0,1,2,0,1
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,...,1,0,1,0,1091.0,0,0,2,5,1


In [34]:
# Partition the training rows by outcome for the descriptive comparisons below.
transported = train.loc[train['Transported'].eq(True)]
not_transported = train.loc[train['Transported'].eq(False)]


In [35]:
# Among transported passengers, report how often each binary flag is set
# (as a rounded percentage) and how often it is unset (as a raw fraction).
flag_cols = ['IsChild', 'IsTeen', 'IsSenior', 'IsAlone', 'CryoSleep', 'VIP', 'NoSpend']
n_transported = transported['Transported'].count()

print("--- True ---")
for col in flag_cols:
    n_set = transported[transported[col] == True]['Transported'].count()
    print(f"{col}: {round((n_set / n_transported)*100, 2)}%")

print("--- False ---")
for col in flag_cols:
    n_unset = transported[transported[col] == False]['Transported'].count()
    print(f"{col}: {n_unset / n_transported}")

# Count transported passengers who were in cryo sleep, not VIP, and spent nothing.
cryo_nonvip_nospend = (
    (transported['CryoSleep']) &
    (~transported['VIP']) &
    (transported['NoSpend'])
)
transported[cryo_nonvip_nospend]['Transported'].value_counts()

--- True ---
IsChild: 12.88%
IsTeen: 9.34%
IsSenior: 1.07%
IsAlone: 49.66%
CryoSleep: 56.72%
VIP: 1.74%
NoSpend: 65.62%
--- False ---
IsChild: 0.8711740520785747
IsTeen: 0.9065783462768388
IsSenior: 0.9892645043398812
IsAlone: 0.5034262220191869
CryoSleep: 0.43284604842393787
VIP: 0.9826404751027866
NoSpend: 0.34376427592507997


Transported
True    2462
Name: count, dtype: int64

In [36]:
# Summary statistics for passengers who were transported:
# mean of each numeric column, then value counts of each boolean flag.
print("--- TRANSPORTED ---")
for label, col in [
    ("room service", 'RoomService'),
    ("food court", 'FoodCourt'),
    ("shopping mall", 'ShoppingMall'),
    ("spa", 'Spa'),
    ("vr deck", 'VRDeck'),
    ("age", 'Age'),
]:
    print(f"{label}: {transported[col].mean()}")
for label, col in [("vip", 'VIP'), ("cryo sleep", 'CryoSleep')]:
    print(f"{label}: {transported[col].value_counts()}")

--- TRANSPORTED ---
room service: 63.09802095459837
food court: 532.6919841084366
shopping mall: 179.82997185741087
spa: 61.675530674131096
vr deck: 69.1481308411215
age: 27.733439926907263
vip: VIP
False    4302
True       76
Name: count, dtype: int64
cryo sleep: CryoSleep
True     2483
False    1895
Name: count, dtype: int64


In [37]:
# Summary statistics for passengers who were NOT transported:
# mean of each numeric column, then value counts of each boolean flag.
print("--- NOT TRANSPORTED ---")
for label, col in [
    ("room service", 'RoomService'),
    ("food court", 'FoodCourt'),
    ("shopping mall", 'ShoppingMall'),
    ("spa", 'Spa'),
    ("vr deck", 'VRDeck'),
    ("age", 'Age'),
]:
    print(f"{label}: {not_transported[col].mean()}")
for label, col in [("vip", 'VIP'), ("cryo sleep", 'CryoSleep')]:
    print(f"{label}: {not_transported[col].value_counts()}")

--- NOT TRANSPORTED ---
room service: 389.2660659236424
food court: 382.61593004017965
shopping mall: 167.5662165363658
spa: 564.3826663509353
vr deck: 543.6298224852071
age: 29.862572421784474
vip: VIP
False    4192
True      123
Name: count, dtype: int64
cryo sleep: CryoSleep
False    3761
True      554
Name: count, dtype: int64


In [38]:
# List every column now on the training frame (raw, engineered, and encoded).
train.columns

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Name', 'Transported', 'Cabin_deck', 'Cabin_num', 'Cabin_side', 'Group',
       'GroupSize', 'IsAlone', 'IsChild', 'IsTeen', 'IsSenior', 'TotalSpend',
       'NoSpend', 'HomePlanet_enc', 'Destination_enc', 'Cabin_deck_enc',
       'Cabin_side_enc'],
      dtype='object')

In [39]:
# Feature selection: backward sequential elimination with an XGBoost classifier.
# Prepare features and target
features = ['CryoSleep', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'NoSpend', 'HomePlanet_enc', 'Destination_enc', 'Cabin_deck_enc', 'Cabin_side_enc', 'GroupSize', 'IsAlone', 'IsChild', 'IsTeen', 'IsSenior']
X = train[features]
y = train['Transported'].astype(int)
X_test = test[features]  # competition test rows; kept distinct from the hold-out below

from sklearn.feature_selection import SequentialFeatureSelector
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled data for validation. It is deliberately named
# X_valid / y_valid so it does not clobber X_test (the unlabelled Kaggle rows).
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# Backward elimination: start with all features and drop one at a time,
# scoring each candidate subset by 5-fold cross-validated accuracy.
sfs = SequentialFeatureSelector(
    model,
    n_features_to_select='auto',  # no tol given; the recorded run kept 9 of 17 features
    direction='backward',
    scoring='accuracy',
    cv=5,
    n_jobs=-1                     # use all available cores
)
sfs.fit(X_train, y_train)

# Get the selected feature subset
selected_features = X_train.columns[sfs.get_support()]
print("Best features:", list(selected_features))

# Refit on the selected features and score on the hold-out split
# (the printed "Test accuracy" is hold-out accuracy, not a Kaggle test score).
model.fit(X_train[selected_features], y_train)
print("Test accuracy:", model.score(X_valid[selected_features], y_valid))



Best features: ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'HomePlanet_enc', 'Cabin_deck_enc', 'Cabin_side_enc', 'IsSenior']
Test accuracy: 0.7866589994249569


In [40]:
# Restrict to the features chosen by sequential selection, then tune XGBoost
# hyper-parameters with a randomized search over the full labelled set.
features = [
    'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
    'HomePlanet_enc', 'Cabin_deck_enc', 'Cabin_side_enc', 'IsSenior',
]
X = train.loc[:, features]
y = train['Transported'].astype(int)
X_test = test.loc[:, features]

from sklearn.model_selection import RandomizedSearchCV

# Candidate values sampled by the search.
param_dist = dict(
    n_estimators=[100, 200, 300, 400],
    max_depth=[3, 4, 5, 6, 7],
    learning_rate=[0.01, 0.05, 0.1, 0.2],
    subsample=[0.7, 0.8, 0.9, 1.0],
    colsample_bytree=[0.7, 0.8, 0.9, 1.0],
)

xgb_clf = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# 20 sampled configurations, each scored by 3-fold cross-validated accuracy.
search = RandomizedSearchCV(
    xgb_clf,
    param_distributions=param_dist,
    n_iter=20,
    scoring='accuracy',
    cv=3,
    random_state=42,
)
search.fit(X, y)
print('Best parameters:', search.best_params_)
print('Best cross-validation score:', search.best_score_)

Best parameters: {'subsample': 0.9, 'n_estimators': 300, 'max_depth': 6, 'learning_rate': 0.05, 'colsample_bytree': 1.0}
Best cross-validation score: 0.7946635974055644


In [41]:
# Train the final model on all labelled rows with the hyper-parameters reported
# by the randomized search, then predict the competition test rows.
best_params = dict(
    subsample=0.9,
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    colsample_bytree=1.0,
)

model = XGBClassifier(**best_params)
model.fit(X, y)

# Predictions come back as 0/1; the submission column is stored as booleans.
test['Transported'] = model.predict(X_test).astype(bool)

In [42]:
# Keep only the id and the predicted label, and write the Kaggle submission file.
submission = test.loc[:, ['PassengerId', 'Transported']]
submission.to_csv('submission.csv', index=False)