In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, file_names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in file_names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/spaceship-titanic/sample_submission.csv
/kaggle/input/spaceship-titanic/train.csv
/kaggle/input/spaceship-titanic/test.csv


In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score

In [3]:
# 1. Data Loading and Exploration
# Read the competition's train and test CSV files into DataFrames.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/spaceship-titanic/train.csv',
        '/kaggle/input/spaceship-titanic/test.csv',
    )
)

In [4]:
# 2. Data Preprocessing
# a. Handle Missing Values
numerical_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
categorical_cols = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']
cabin_cols = ['Cabin']

# Numerical columns are filled with the median, categorical columns with the
# most frequent value.
imputer_numerical = SimpleImputer(strategy='median')
imputer_categorical = SimpleImputer(strategy='most_frequent')

# Each imputer learns its fill values from the training frame only and then
# applies them to both the training and the test frame.
for column_imputer, target_cols in (
    (imputer_numerical, numerical_cols),
    (imputer_categorical, categorical_cols),
):
    train_data[target_cols] = column_imputer.fit_transform(train_data[target_cols])
    test_data[target_cols] = column_imputer.transform(test_data[target_cols])

# Cabin Feature Handling: Split and Impute
def split_cabin(df):
    """Split the 'Cabin' column into 'Cabin_Deck', 'Cabin_Num' and 'Cabin_Side'.

    Each cabin value is split on '/' into at most three pieces, which are
    stored as text in the three new columns. A missing cabin, or one with
    fewer than three pieces, yields NaN for the absent pieces; any text after
    the second '/' stays in 'Cabin_Side'. The original 'Cabin' column is
    removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'Cabin' column.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in. Note that ``df`` is modified in
        place; it is returned only so the call can be used in an assignment.
    """
    new_cols = ['Cabin_Deck', 'Cabin_Num', 'Cabin_Side']
    # n=2 caps the result at three pieces and reindex pads it up to three
    # columns, so the assignment below always receives exactly three columns
    # no matter how many '/' the widest cabin value contains.
    parts = (
        df['Cabin']
        .str.split('/', n=2, expand=True)
        .reindex(columns=range(len(new_cols)))
    )
    df[new_cols] = parts
    df.drop('Cabin', axis=1, inplace=True)
    return df

# Replace 'Cabin' with its deck / number / side components in both frames.
train_data = split_cabin(train_data)
test_data = split_cabin(test_data)

cabin_cols_new = ['Cabin_Deck', 'Cabin_Num', 'Cabin_Side']

# Use a dedicated imputer for the cabin columns. Refitting the shared
# `imputer_categorical` here would overwrite what it learned from
# `categorical_cols`, leaving it unusable for those columns afterwards.
imputer_cabin = SimpleImputer(strategy='most_frequent')
train_data[cabin_cols_new] = imputer_cabin.fit_transform(train_data[cabin_cols_new])
test_data[cabin_cols_new] = imputer_cabin.transform(test_data[cabin_cols_new])

# 'Cabin_Num' comes out of the string split as text and is not one-hot
# encoded later, so convert it to a real numeric column before modelling.
# pd.to_numeric raises on any value that is not a number, which surfaces
# malformed cabin strings instead of passing them on to the models.
for cabin_frame in (train_data, test_data):
    cabin_frame['Cabin_Num'] = pd.to_numeric(cabin_frame['Cabin_Num'])


# b. Feature Engineering (Example: Total Spending)
# TotalSpending is the row-wise sum of the five amenity columns, added to both
# frames. Starting the sum from 'RoomService' keeps the left-to-right addition
# order RoomService + FoodCourt + ShoppingMall + Spa + VRDeck.
for frame in (train_data, test_data):
    frame['TotalSpending'] = sum(
        (frame[amenity] for amenity in ('FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck')),
        frame['RoomService'],
    )

# c. Encoding Categorical Features
label_encoder = LabelEncoder()
train_data['Transported'] = label_encoder.fit_transform(train_data['Transported'])  # Encode target

# Identify categorical columns for one-hot encoding
categorical_cols_for_encoding = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Cabin_Deck', 'Cabin_Side']

# Give train and test the same category levels, learned from train only.
# Encoding the two frames independently lets each derive its own levels, so a
# level present in only one frame produces mismatched dummy columns, and
# `drop_first` can drop a different baseline level in each frame.
# A test value never seen in train becomes NaN here and is therefore encoded
# as all zeros across that column's dummies.
for col in categorical_cols_for_encoding:
    train_levels = train_data[col].astype('category').dtype
    train_data[col] = train_data[col].astype(train_levels)
    test_data[col] = test_data[col].astype(train_levels)

# Perform one-hot encoding; with shared categorical dtypes both frames get the
# same dummy columns in the same order.
train_data = pd.get_dummies(train_data, columns=categorical_cols_for_encoding, drop_first=True)
test_data = pd.get_dummies(test_data, columns=categorical_cols_for_encoding, drop_first=True)

# d. Feature Scaling
numerical_cols_for_scaling = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'TotalSpending']

# Learn the scaling statistics from the training frame only, then apply the
# same transformation to both frames.
scaler = StandardScaler().fit(train_data[numerical_cols_for_scaling])
train_data[numerical_cols_for_scaling] = scaler.transform(train_data[numerical_cols_for_scaling])
test_data[numerical_cols_for_scaling] = scaler.transform(test_data[numerical_cols_for_scaling])

In [5]:
# 3. Model Training

# a. Prepare Data for Modeling
# Features are every training column except the identifiers and the target.
X = train_data.drop(['PassengerId', 'Name', 'Transported'], axis=1)
y = train_data['Transported']

# Align columns between train and test sets
train_cols = X.columns
test_cols = test_data.drop(['PassengerId', 'Name'], axis=1).columns

# Features that only the test frame has. Kept for inspection; the models are
# never trained on them, so they are dropped from test by the reindex below.
missing_cols_train = set(test_cols) - set(train_cols)

# Make the test features match the training features exactly, in both set and
# order: columns missing from test are added filled with 0, test-only columns
# are removed, and the rest are put in training order. The identifier columns
# are kept in front because later cells still read 'PassengerId'.
test_data = test_data.reindex(columns=['PassengerId', 'Name', *train_cols], fill_value=0)

# Hold out 20% of the training rows for validation.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
# b. CatBoost with GridSearchCV
# Hyperparameter values to try; every combination is evaluated.
cat_param_grid = dict(
    iterations=[100, 200],
    learning_rate=[0.01, 0.05],
    depth=[4, 6],
    l2_leaf_reg=[1, 3],
)
cat_model = CatBoostClassifier(random_state=42, verbose=0)  # verbose=0 silences training logs

# 5-fold cross-validated search scored on accuracy, using all available cores.
cat_grid = GridSearchCV(
    estimator=cat_model,
    param_grid=cat_param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=0,
)
cat_grid.fit(X_train, y_train)
print("CatBoost Best Parameters:", cat_grid.best_params_)

# Score the best estimator found by the search on the held-out validation split.
cat_best_model = cat_grid.best_estimator_
cat_predictions = cat_best_model.predict(X_val)
cat_validation_accuracy = accuracy_score(y_val, cat_predictions)
print("CatBoost Validation Accuracy:", cat_validation_accuracy)


# c. Gradient Boosting with GridSearchCV
# Hyperparameter values to try; every combination is evaluated.
gb_param_grid = dict(
    n_estimators=[100, 200],
    learning_rate=[0.01, 0.05],
    max_depth=[3, 5],
    min_samples_split=[2, 4],
)
gb_model = GradientBoostingClassifier(random_state=42)

# 5-fold cross-validated search scored on accuracy, using all available cores.
gb_grid = GridSearchCV(
    estimator=gb_model,
    param_grid=gb_param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=0,
)
gb_grid.fit(X_train, y_train)
print("Gradient Boosting Best Parameters:", gb_grid.best_params_)

# Score the best estimator found by the search on the held-out validation split.
gb_best_model = gb_grid.best_estimator_
gb_predictions = gb_best_model.predict(X_val)
gb_validation_accuracy = accuracy_score(y_val, gb_predictions)
print("Gradient Boosting Validation Accuracy:", gb_validation_accuracy)

CatBoost Best Parameters: {'depth': 6, 'iterations': 200, 'l2_leaf_reg': 1, 'learning_rate': 0.05}
CatBoost Validation Accuracy: 0.8033352501437608
Gradient Boosting Best Parameters: {'learning_rate': 0.05, 'max_depth': 3, 'min_samples_split': 2, 'n_estimators': 200}
Gradient Boosting Validation Accuracy: 0.7981598619896493


In [7]:
# 4. Prediction and Submission
# a. Predict on Test Data
# The identifier columns are not features, so drop them before predicting.
test_X = test_data.drop(['PassengerId', 'Name'], axis=1)

cat_test_predictions = cat_best_model.predict(test_X)
gb_test_predictions = gb_best_model.predict(test_X)

# Choose the better model from the validation results instead of assuming one.
# The accuracies are recomputed here from the validation predictions so the
# choice always reflects the models that were actually fitted.
cat_val_accuracy = accuracy_score(y_val, cat_predictions)
gb_val_accuracy = accuracy_score(y_val, gb_predictions)

# On a tie CatBoost is kept.
if cat_val_accuracy >= gb_val_accuracy:
    final_predictions = cat_test_predictions
else:
    final_predictions = gb_test_predictions

In [8]:
# b. Create Submission File
# Pair every test PassengerId with its prediction, turn the 0/1 labels back
# into False/True, and write the result to disk without the index column.
submission = test_data[['PassengerId']].assign(Transported=final_predictions)
submission['Transported'] = submission['Transported'].map({0: False, 1: True})
submission.to_csv('submission.csv', index=False)