In [1]:
import pandas as pd
import catboost as cb

In [2]:
# Read the train and test splits (in that order) from the local Kaggle input folder.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        'kaggle/input/spaceship-titanic/train.csv',
        'kaggle/input/spaceship-titanic/test.csv',
    )
)

In [3]:
# Feature engineering and missing-value handling, applied identically to the
# train and test frames (both are modified in place).

# The fill value for Age is the median of the *training* ages and is reused for
# the test set, so no test-set statistic is used during preprocessing.
age_median = train['Age'].median()

for df in [train, test]:
    # Split Cabin into deck, num, and side. Missing cabins get a placeholder
    # with the same "deck/num/side" layout so the split always has three parts.
    df['Cabin'] = df['Cabin'].fillna('Unknown/-1/U')
    cabin_parts = df['Cabin'].str.split('/')
    df['Cabin_deck'] = cabin_parts.str[0]
    df['Cabin_num'] = cabin_parts.str[1]  # kept as a string, not converted to int
    df['Cabin_side'] = cabin_parts.str[2]

    df['Age'] = df['Age'].fillna(age_median)

    # Missing flags are treated as False, then the columns are made real booleans.
    for flag_col in ['VIP', 'CryoSleep']:
        df[flag_col] = df[flag_col].fillna(False).astype(bool)

    # Missing categories become their own explicit 'Unknown' level.
    for category_col in ['Destination', 'HomePlanet']:
        df[category_col] = df[category_col].fillna('Unknown')

########################################################
# No need to encode categorical columns for catboost
#
# The triple-quoted string below holds the earlier label-encoding step. It is a
# bare string literal kept for reference only; none of the code inside it runs.

"""# Combine train and test for consistent encoding, then split back
from sklearn.preprocessing import LabelEncoder

# Add a marker to split later
test['Transported'] = None  # Add dummy column to align columns
combined = pd.concat([train, test], sort=False, ignore_index=True)
#combined['CryoSleep_VIP'] = combined['CryoSleep'].astype(str) + '_' + combined['VIP'].astype(str)

# Encode categorical columns
le_home = LabelEncoder()
le_dest = LabelEncoder()
le_cabin_deck = LabelEncoder()
le_cabin_side = LabelEncoder()
#le_cryo_vip = LabelEncoder()

combined['HomePlanet_enc'] = le_home.fit_transform(combined['HomePlanet'])
combined['Destination_enc'] = le_dest.fit_transform(combined['Destination'])
combined['Cabin_deck_enc'] = le_cabin_deck.fit_transform(combined['Cabin_deck'])
combined['Cabin_side_enc'] = le_cabin_side.fit_transform(combined['Cabin_side'])
#combined['CryoSleep_VIP_enc'] = le_cryo_vip.fit_transform(combined['CryoSleep_VIP'])

# Split back into train and test
train = combined[combined['Transported'].notnull()].copy()
test = combined[combined['Transported'].isnull()].copy()

test.drop(columns=['Transported'], inplace=True)  # Remove dummy column from test"""

# Make sure VIP and CryoSleep are plain booleans in both frames. These are
# top-level statements and must start at column 0; a stray leading indent here
# is an "IndentationError: unexpected indent". The conversion is idempotent, so
# repeating it after the earlier fill in this cell does not change the data.
train['VIP'] = train['VIP'].fillna(False).astype(bool)
train['CryoSleep'] = train['CryoSleep'].fillna(False).astype(bool)
test['VIP'] = test['VIP'].fillna(False).astype(bool)
test['CryoSleep'] = test['CryoSleep'].fillna(False).astype(bool)


"# Combine train and test for consistent encoding, then split back\nfrom sklearn.preprocessing import LabelEncoder\n\n# Add a marker to split later\ntest['Transported'] = None  # Add dummy column to align columns\ncombined = pd.concat([train, test], sort=False, ignore_index=True)\n#combined['CryoSleep_VIP'] = combined['CryoSleep'].astype(str) + '_' + combined['VIP'].astype(str)\n\n# Encode categorical columns\nle_home = LabelEncoder()\nle_dest = LabelEncoder()\nle_cabin_deck = LabelEncoder()\nle_cabin_side = LabelEncoder()\n#le_cryo_vip = LabelEncoder()\n\ncombined['HomePlanet_enc'] = le_home.fit_transform(combined['HomePlanet'])\ncombined['Destination_enc'] = le_dest.fit_transform(combined['Destination'])\ncombined['Cabin_deck_enc'] = le_cabin_deck.fit_transform(combined['Cabin_deck'])\ncombined['Cabin_side_enc'] = le_cabin_side.fit_transform(combined['Cabin_side'])\n#combined['CryoSleep_VIP_enc'] = le_cryo_vip.fit_transform(combined['CryoSleep_VIP'])\n\n# Split back into train and te

In [4]:
# Spending features: the row-wise sum of the listed charge columns (missing
# charges count as 0) and a flag for passengers whose sum is exactly zero.
# NOTE(review): 'TotalSpend' only covers the three columns in spend_cols;
# confirm that no other spend columns were meant to be included.
spend_cols = ['RoomService', 'Spa', 'VRDeck']

for df in (train, test):
    spend_sum = df[spend_cols].fillna(0).sum(axis=1)
    df['TotalSpend'] = spend_sum
    df['NoSpend'] = spend_sum.eq(0)

In [5]:
# Columns handed to CatBoost as categorical features (names exactly as they
# appear in the DataFrames).
cat_features = [
    'HomePlanet',
    'Destination',
    'Cabin_deck',
    'Cabin_side',
    'NoSpend',
    'CryoSleep',
    'VIP',
]

# Model inputs: every categorical column first, then the two remaining columns.
features = cat_features + ['TotalSpend', 'Age']

# Design matrices for fitting and for the final predictions.
X, X_test = train[features], test[features]
# Target cast to int for training.
y = train['Transported'].astype(int)

In [6]:
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostClassifier

# Tuned hyperparameters, recorded as
# {'learning_rate': 0.03, 'l2_leaf_reg': 5, 'iterations': 700, 'depth': 5, 'bagging_temperature': 2}
# (presumably the best_params_ printed by the disabled tuning search in this cell).
best_params = dict(
    learning_rate=0.03,
    l2_leaf_reg=5,
    iterations=700,
    depth=5,
    bagging_temperature=2,
)

# Settings that stay fixed regardless of tuning; the seed keeps runs repeatable.
fixed_params = dict(
    cat_features=cat_features,
    loss_function='Logloss',
    eval_metric='Accuracy',
    verbose=0,
    random_seed=42,
)

model = CatBoostClassifier(**fixed_params, **best_params)
model.fit(X, y)

# Store the predicted labels on the test frame, then convert them to booleans.
predicted_labels = model.predict(X_test)
test['Transported'] = predicted_labels
test['Transported'] = test['Transported'].astype(bool)


# Used for tuning. The randomized hyperparameter search below is disabled by
# wrapping it in a triple-quoted string, so it does not run; the string is only
# evaluated as a bare expression. It is kept as a record of the search space
# (param_dist) and the search settings (20 sampled candidates, 3-fold CV,
# accuracy scoring).
"""param_dist = {
    'iterations': [300, 500, 700, 1000],
    'learning_rate': [0.01, 0.03, 0.05, 0.1],
    'depth': [4, 5, 6, 7, 8],
    'l2_leaf_reg': [1, 3, 5, 7, 9],
    'bagging_temperature': [0, 0.5, 1, 2]
}

model = CatBoostClassifier(
    cat_features=cat_features,
    loss_function='Logloss',
    eval_metric='Accuracy',
    verbose=0,
    random_seed=42
)

search = RandomizedSearchCV(
    model,
    param_distributions=param_dist,
    n_iter=20,
    scoring='accuracy',
    cv=3,
    verbose=2,
    n_jobs=-1
)

search.fit(X, y)
print('Best params:', search.best_params_)
print('Best score:', search.best_score_)"""

"param_dist = {\n    'iterations': [300, 500, 700, 1000],\n    'learning_rate': [0.01, 0.03, 0.05, 0.1],\n    'depth': [4, 5, 6, 7, 8],\n    'l2_leaf_reg': [1, 3, 5, 7, 9],\n    'bagging_temperature': [0, 0.5, 1, 2]\n}\n\nmodel = CatBoostClassifier(\n    cat_features=cat_features,\n    loss_function='Logloss',\n    eval_metric='Accuracy',\n    verbose=0,\n    random_seed=42\n)\n\nsearch = RandomizedSearchCV(\n    model,\n    param_distributions=param_dist,\n    n_iter=20,\n    scoring='accuracy',\n    cv=3,\n    verbose=2,\n    n_jobs=-1\n)\n\nsearch.fit(X, y)\nprint('Best params:', search.best_params_)\nprint('Best score:', search.best_score_)"

In [7]:
# Write the submission file: passenger id plus the boolean prediction, no index column.
submission_columns = ['PassengerId', 'Transported']
submission = test.loc[:, submission_columns]
submission.to_csv('submission.csv', index=False)

In [8]:
# Keep only column 1 of the predicted probability matrix for the test rows.
class_probabilities = model.predict_proba(X_test)
catboost_probs = class_probabilities[:, 1]

In [9]:
# Bare reference to the probability array. It is not the last line of the cell,
# so presumably nothing is displayed for it.
catboost_probs
# Persist the array with IPython's %store magic so another notebook or kernel
# can restore it with `%store -r catboost_probs`.
%store catboost_probs

Stored 'catboost_probs' (ndarray)
