In [2]:
import datetime
import os
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [3]:
# Today's date as YYYY-MM-DD; it is embedded in the output file names written later
current_date = datetime.now().date().isoformat()

In [4]:
# Load the training and test CSVs from the relative data/ directory
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')

In [5]:
# Preview the first rows of the training frame to sanity-check the load
df_train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [6]:
# Preview the first rows of the test frame (same columns as train, minus Transported)
df_test.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [7]:
# Column dtypes and non-null counts for the training frame
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [8]:
# Summary statistics for the numeric training columns
df_train.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,47.0,76.0,27.0,59.0,46.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


In [9]:
# Missing-value count per training column
df_train.isnull().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [45]:
# Categorical candidates are the object/bool columns, minus PassengerId, Name
# and Cabin. Transported is bool and is not excluded, so it is listed as well.
categorical_columns_exclusion = ['PassengerId', 'Name', 'Cabin']
categorical_columns = (
    df_train.select_dtypes(include=['object', 'bool'])
    .columns.drop(categorical_columns_exclusion, errors='ignore')
    .tolist()
)

print("Categorical columns:", list(categorical_columns))

Categorical columns: ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Transported']


In [46]:
# Numerical candidates are the int64/float64 columns; ShoppingMall is the only
# column excluded here.
# Alternative exclusion list kept for reference:
#numerical_columns_exclusion = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
numerical_columns_exclusion = ['ShoppingMall']
numerical_columns = (
    df_train.select_dtypes(include=['int64', 'float64'])
    .columns.drop(numerical_columns_exclusion, errors='ignore')
    .tolist()
)
print("\nNumerical columns:", list(numerical_columns))


Numerical columns: ['Age', 'RoomService', 'FoodCourt', 'Spa', 'VRDeck']


In [47]:
# For every categorical column, show its value distribution and how many
# entries are missing
print("\nValue counts for each categorical column:")
print("==========================================")
for categorical_column in categorical_columns:
    column_values = df_train[categorical_column]
    print(f"\nColumn: {categorical_column}")
    print(column_values.value_counts())
    print("Null values:", column_values.isnull().sum())
    print("------------------------------------------------------")


Value counts for each categorical column:

Column: HomePlanet
HomePlanet
Earth     4602
Europa    2131
Mars      1759
Name: count, dtype: int64
Null values: 201
------------------------------------------------------

Column: CryoSleep
CryoSleep
False    5439
True     3037
Name: count, dtype: int64
Null values: 217
------------------------------------------------------

Column: Destination
Destination
TRAPPIST-1e      5915
55 Cancri e      1800
PSO J318.5-22     796
Name: count, dtype: int64
Null values: 182
------------------------------------------------------

Column: VIP
VIP
False    8291
True      199
Name: count, dtype: int64
Null values: 203
------------------------------------------------------

Column: Transported
Transported
True     4378
False    4315
Name: count, dtype: int64
Null values: 0
------------------------------------------------------


In [72]:
# Initialize transformers
# These module-level instances are shared with process_numerical, scale_features
# and process_categorical through `global`. Each of those helpers rebinds its
# transformer to a freshly created, fitted one when called with is_training=True,
# and reuses whatever is bound here when called with is_training=False.
cat_imputer = SimpleImputer(strategy='most_frequent')
num_imputer = SimpleImputer(strategy='median')
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
scaler = StandardScaler()

In [73]:
def get_categorical_numerical_columns(df):
    """Split the frame's columns into categorical and numerical feature lists.

    Categorical candidates are the ``object``/``bool`` columns and numerical
    candidates are the ``int64``/``float64`` columns. Names on the matching
    skip list below are removed from each group.

    Args:
        df: DataFrame whose columns are classified by dtype.

    Returns:
        Tuple ``(categorical_columns, numerical_columns)`` of plain lists, each
        in the frame's own column order.
    """
    # Deck / CabinNum / Side are noted as further exclusions to add should
    # Cabin-derived columns ever be created.
    skip_categorical = {'PassengerId', 'Name', 'Transported', 'Cabin'}
    skip_numerical = {'PassengerId', 'Transported', 'ShoppingMall'}

    def _kept(candidates, skipped):
        # Preserve the candidate order while leaving out the skipped names.
        return [name for name in candidates if name not in skipped]

    categorical_candidates = df.select_dtypes(include=['object', 'bool']).columns
    numerical_candidates = df.select_dtypes(include=['int64', 'float64']).columns

    return (
        _kept(categorical_candidates, skip_categorical),
        _kept(numerical_candidates, skip_numerical),
    )


In [74]:
def process_numerical(df, is_training=True):
    """Fill missing values in the numerical columns with the median.

    With ``is_training=True`` a new ``SimpleImputer(strategy='median')`` is
    fitted on ``df`` and stored in the module-level ``num_imputer``. With
    ``is_training=False`` the already-fitted ``num_imputer`` is applied to the
    same columns it was fitted on.

    Args:
        df: DataFrame to impute. It is not modified; a copy is returned.
        is_training: Fit a new imputer (True) or reuse the fitted one (False).

    Returns:
        DataFrame with the numerical columns imputed. When there are no
        numerical columns, ``df`` itself is returned unchanged.

    Raises:
        KeyError: At inference, if ``df`` lacks a column the imputer was
            fitted on.
    """
    global num_imputer

    # At inference, use the columns recorded at fit time rather than re-deriving
    # them from this frame's dtypes, which can differ from the training frame.
    fitted_columns = None if is_training else getattr(num_imputer, 'feature_names_in_', None)
    if fitted_columns is not None:
        numerical_columns = list(fitted_columns)
    else:
        _, numerical_columns = get_categorical_numerical_columns(df)

    if not numerical_columns:
        return df

    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()

    if is_training:
        num_imputer = SimpleImputer(strategy='median')
        df[numerical_columns] = num_imputer.fit_transform(df[numerical_columns])
    else:
        df[numerical_columns] = num_imputer.transform(df[numerical_columns])

    return df


In [75]:
def scale_features(df, is_training=True):
    """Standardize the numerical columns with a ``StandardScaler``.

    With ``is_training=True`` a new ``StandardScaler`` is fitted on ``df`` and
    stored in the module-level ``scaler``. With ``is_training=False`` the
    already-fitted ``scaler`` is applied to the same columns it was fitted on.

    Args:
        df: DataFrame to scale. It is not modified; a copy is returned.
        is_training: Fit a new scaler (True) or reuse the fitted one (False).

    Returns:
        DataFrame with the numerical columns scaled. When there are no
        numerical columns, ``df`` itself is returned unchanged.

    Raises:
        KeyError: At inference, if ``df`` lacks a column the scaler was
            fitted on.
    """
    global scaler

    # At inference, use the columns recorded at fit time rather than re-deriving
    # them from this frame's dtypes, which can differ from the training frame.
    fitted_columns = None if is_training else getattr(scaler, 'feature_names_in_', None)
    if fitted_columns is not None:
        numerical_columns = list(fitted_columns)
    else:
        _, numerical_columns = get_categorical_numerical_columns(df)

    if not numerical_columns:
        return df

    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()

    if is_training:
        scaler = StandardScaler()
        df[numerical_columns] = scaler.fit_transform(df[numerical_columns])
    else:
        df[numerical_columns] = scaler.transform(df[numerical_columns])

    return df


In [76]:
def process_categorical(df, is_training=True):
    """Impute and one-hot encode the categorical columns.

    With ``is_training=True`` a most-frequent ``SimpleImputer`` and a
    ``OneHotEncoder`` are fitted on ``df`` and stored in the module-level
    ``cat_imputer`` and ``encoder``. With ``is_training=False`` the
    already-fitted pair is applied to the same columns it was fitted on.

    Args:
        df: DataFrame to process. It is not modified; a new frame is returned.
        is_training: Fit new transformers (True) or reuse the fitted ones (False).

    Returns:
        DataFrame in which the original categorical columns are replaced by
        their one-hot columns, appended after the remaining columns. When there
        are no categorical columns, ``df`` itself is returned unchanged.

    Raises:
        KeyError: At inference, if ``df`` lacks a column the imputer was
            fitted on.
    """
    global cat_imputer, encoder

    # At inference, use the columns recorded at fit time rather than re-deriving
    # them from this frame's dtypes, which can differ from the training frame.
    fitted_columns = None if is_training else getattr(cat_imputer, 'feature_names_in_', None)
    if fitted_columns is not None:
        categorical_columns = list(fitted_columns)
    else:
        categorical_columns, _ = get_categorical_numerical_columns(df)

    if not categorical_columns:
        return df

    # Work on a copy so the imputation below does not write into the caller's frame.
    df = df.copy()

    if is_training:
        # Impute missing values
        cat_imputer = SimpleImputer(strategy='most_frequent')
        df[categorical_columns] = cat_imputer.fit_transform(df[categorical_columns])

        # Encode categorical variables
        encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        encoded_features = encoder.fit_transform(df[categorical_columns])
    else:
        df[categorical_columns] = cat_imputer.transform(df[categorical_columns])
        encoded_features = encoder.transform(df[categorical_columns])

    # Wrap the encoded array in a frame sharing df's index so concat aligns row-for-row.
    encoded_df = pd.DataFrame(
        encoded_features,
        columns=encoder.get_feature_names_out(categorical_columns),
        index=df.index,
    )

    # Swap the original categorical columns for their encoded counterparts.
    df = df.drop(columns=categorical_columns)
    return pd.concat([df, encoded_df], axis=1)


In [77]:
def feature_engineering(df):
    """Derive spending aggregates and numeric CryoSleep/VIP flags.

    Works on a copy, so the input frame is left untouched.

    Adds, when at least one of RoomService, FoodCourt, Spa, VRDeck is present:
      * ``TotalSpending``: row-wise sum of those columns (ShoppingMall is not
        part of the sum).
      * ``AvgSpending``: ``TotalSpending`` divided by how many of those
        columns are present.

    Rewrites ``CryoSleep`` and ``VIP`` (when present) by mapping their string
    form ``'True'``/``'False'`` to 1/0; any other value becomes NaN.

    Args:
        df: Raw passenger DataFrame.

    Returns:
        A new DataFrame with the derived columns.
    """
    engineered = df.copy()

    # Disabled experiments, kept for reference:
    # # Extract group size from PassengerId
    # engineered['Group'] = engineered['PassengerId'].str.split('_').str[0]
    # engineered['GroupSize'] = engineered.groupby('Group')['PassengerId'].transform('count')
    #
    # # Split Cabin into deck, num, and side
    # if 'Cabin' in engineered.columns:
    #     cabin_split = engineered['Cabin'].str.split('/', expand=True)
    #     if cabin_split is not None:
    #         engineered['Deck'] = cabin_split[0]
    #         engineered['CabinNum'] = cabin_split[1]
    #         engineered['Side'] = cabin_split[2]

    # Spending aggregates over whichever of the tracked columns exist.
    present_spending = [
        name
        for name in ('RoomService', 'FoodCourt', 'Spa', 'VRDeck')
        if name in engineered.columns
    ]
    if present_spending:
        row_total = engineered[present_spending].sum(axis=1)
        engineered['TotalSpending'] = row_total
        engineered['AvgSpending'] = row_total / len(present_spending)

    # Boolean-like flags become 1/0; going through str handles both real
    # booleans and object columns holding True/False alongside NaN.
    flag_codes = {'True': 1, 'False': 0}
    for flag in ('CryoSleep', 'VIP'):
        if flag in engineered.columns:
            engineered[flag] = engineered[flag].astype(str).map(flag_codes)

    return engineered


In [78]:
def prepare_data(df, is_training=True):
    """Run the full preprocessing pipeline on a raw passenger frame.

    Steps, in order: feature engineering, numerical imputation, scaling,
    categorical imputation and encoding, then removal of the identifier and
    free-text columns. The input frame is copied first.

    Args:
        df: Raw DataFrame as loaded from CSV.
        is_training: Passed through to each processing step; True fits the
            transformers, False reuses the fitted ones.

    Returns:
        The processed DataFrame.
    """
    frame = feature_engineering(df.copy())

    # Each stage takes (frame, is_training) and returns the transformed frame.
    for stage in (process_numerical, scale_features, process_categorical):
        frame = stage(frame, is_training)

    # NOTE(review): 'ShoppingMall' is not dropped here, yet
    # get_categorical_numerical_columns excludes it from the numerical set, so
    # it reaches the output unimputed and unscaled. Confirm whether it should
    # be dropped here or processed like the other spending columns.
    return frame.drop(columns=['Name', 'PassengerId', 'Cabin', 'Group'], errors='ignore')



In [79]:
# Rebind the column lists using the helper (this replaces the lists built in the
# exploratory cells above; here Transported is excluded from the categorical list)
categorical_columns, numerical_columns = get_categorical_numerical_columns(df_train)
print(categorical_columns)
print(numerical_columns)

['HomePlanet', 'CryoSleep', 'Destination', 'VIP']
['Age', 'RoomService', 'FoodCourt', 'Spa', 'VRDeck']


In [80]:
# Order matters: the training call fits the shared imputers/scaler/encoder,
# and the test call (is_training=False) reuses those fitted transformers.
df_train_processed = prepare_data(df_train.copy(), is_training=True)
df_test_processed = prepare_data(df_test.copy(), is_training=False)

In [81]:
# Remaining nulls after preprocessing. ShoppingMall is excluded from the
# numerical columns in get_categorical_numerical_columns, so it is not imputed.
df_train_processed.isnull().sum()

CryoSleep                      0
Age                            0
VIP                            0
RoomService                    0
FoodCourt                      0
ShoppingMall                 208
Spa                            0
VRDeck                         0
Transported                    0
TotalSpending                  0
AvgSpending                    0
HomePlanet_Earth               0
HomePlanet_Europa              0
HomePlanet_Mars                0
Destination_55 Cancri e        0
Destination_PSO J318.5-22      0
Destination_TRAPPIST-1e        0
dtype: int64

In [82]:
# Dtypes and non-null counts of the processed training frame
df_train_processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 17 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   CryoSleep                  8693 non-null   float64
 1   Age                        8693 non-null   float64
 2   VIP                        8693 non-null   float64
 3   RoomService                8693 non-null   float64
 4   FoodCourt                  8693 non-null   float64
 5   ShoppingMall               8485 non-null   float64
 6   Spa                        8693 non-null   float64
 7   VRDeck                     8693 non-null   float64
 8   Transported                8693 non-null   bool   
 9   TotalSpending              8693 non-null   float64
 10  AvgSpending                8693 non-null   float64
 11  HomePlanet_Earth           8693 non-null   float64
 12  HomePlanet_Europa          8693 non-null   float64
 13  HomePlanet_Mars            8693 non-null   float

In [83]:
# Remaining nulls in the processed test frame
df_test_processed.isnull().sum()

CryoSleep                     0
Age                           0
VIP                           0
RoomService                   0
FoodCourt                     0
ShoppingMall                 98
Spa                           0
VRDeck                        0
TotalSpending                 0
AvgSpending                   0
HomePlanet_Earth              0
HomePlanet_Europa             0
HomePlanet_Mars               0
Destination_55 Cancri e       0
Destination_PSO J318.5-22     0
Destination_TRAPPIST-1e       0
dtype: int64

In [84]:
# Dtypes and non-null counts of the processed test frame
df_test_processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 16 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   CryoSleep                  4277 non-null   float64
 1   Age                        4277 non-null   float64
 2   VIP                        4277 non-null   float64
 3   RoomService                4277 non-null   float64
 4   FoodCourt                  4277 non-null   float64
 5   ShoppingMall               4179 non-null   float64
 6   Spa                        4277 non-null   float64
 7   VRDeck                     4277 non-null   float64
 8   TotalSpending              4277 non-null   float64
 9   AvgSpending                4277 non-null   float64
 10  HomePlanet_Earth           4277 non-null   float64
 11  HomePlanet_Europa          4277 non-null   float64
 12  HomePlanet_Mars            4277 non-null   float64
 13  Destination_55 Cancri e    4277 non-null   float

In [85]:
#correlation_matrix_train = df_train_processed.corr()
#print(correlation_matrix_train)

In [86]:
#print("\nCorrelations with Transported:")
#print(correlation_matrix_train["Transported"].sort_values(ascending=False))

In [87]:
# # Heatmap visualization
# import seaborn as sns
# import matplotlib.pyplot as plt

# plt.figure(figsize=(12, 8))
# sns.heatmap(correlation_matrix_train, annot=True, cmap='coolwarm', center=0)
# plt.title('Correlation Matrix Heatmap')
# plt.xticks(rotation=45, ha='right')
# plt.yticks(rotation=0)
# plt.tight_layout()
# plt.show()

In [88]:
# Separate the binary classification target from the feature matrix
y = df_train_processed['Transported']
X = df_train_processed.drop(columns='Transported')

In [89]:
# The processed test frame is the model input; the raw PassengerId column is
# kept aside for the submission file.
X_test = df_test_processed
test_ids = df_test['PassengerId']

for shape_label, shaped in (
    ("Training features shape:", X),
    ("Training target shape:", y),
    ("Test features shape:", X_test),
):
    print(shape_label, shaped.shape)

# Feature names plus a sample of the training matrix
print("\nFeature columns:", list(X.columns))
print("\nFirst few rows of training features:")
print(X.head())

Training features shape: (8693, 16)
Training target shape: (8693,)
Test features shape: (4277, 16)

Feature columns: ['CryoSleep', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'TotalSpending', 'AvgSpending', 'HomePlanet_Earth', 'HomePlanet_Europa', 'HomePlanet_Mars', 'Destination_55 Cancri e', 'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e']

First few rows of training features:
   CryoSleep       Age       VIP  RoomService  FoodCourt  ShoppingMall  \
0   -0.73277  0.711945 -0.153063    -0.333105  -0.281027           0.0   
1   -0.73277 -0.334037 -0.153063    -0.168073  -0.275387          25.0   
2   -0.73277  2.036857  6.533255    -0.268001   1.959998           0.0   
3   -0.73277  0.293552 -0.153063    -0.333105   0.523010         371.0   
4   -0.73277 -0.891895 -0.153063     0.125652  -0.237159         151.0   

        Spa    VRDeck  TotalSpending  AvgSpending  HomePlanet_Earth  \
0 -0.270626 -0.263003      -0.465002    -0.465002               0

In [90]:
# Hold out 20% of the training rows for validation, stratified on the target
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("\nData split shapes:")
for split_label, split_frame in (
    ("Training set shape:", X_train),
    ("Validation set shape:", X_val),
    ("Test set shape:", X_test),
):
    print(split_label, split_frame.shape)


Data split shapes:
Training set shape: (6954, 16)
Validation set shape: (1739, 16)
Test set shape: (4277, 16)


In [91]:
### Train

In [92]:
# Cell 10: Train XGBoost model
import xgboost as xgb

In [93]:
# Keyword arguments forwarded unchanged to xgb.XGBClassifier
hyperparameters = dict(
    n_estimators=500,
    max_depth=5,
    learning_rate=0.01,
    subsample=0.8,
    colsample_bytree=0.9,
    random_state=42,
    tree_method='hist',
    objective='binary:logistic',  # binary classification
    eval_metric=['logloss', 'error'],  # report both log loss and classification error
    early_stopping_rounds=20,
    verbosity=1,
)

In [94]:
# Create and train the model
model = xgb.XGBClassifier(**hyperparameters)

# Fit with evaluation set
# Both splits are passed so the training log reports metrics for each:
# validation_0 is the training split, validation_1 the held-out validation split.
model.fit(
    X_train, 
    y_train,
    eval_set=[(X_train, y_train), (X_val, y_val)],
    verbose=True
)

[0]	validation_0-logloss:0.68891	validation_0-error:0.38237	validation_1-logloss:0.68900	validation_1-error:0.38183
[1]	validation_0-logloss:0.68479	validation_0-error:0.22433	validation_1-logloss:0.68497	validation_1-error:0.22542
[2]	validation_0-logloss:0.68074	validation_0-error:0.21355	validation_1-logloss:0.68096	validation_1-error:0.21219
[3]	validation_0-logloss:0.67704	validation_0-error:0.20736	validation_1-logloss:0.67743	validation_1-error:0.20989
[4]	validation_0-logloss:0.67316	validation_0-error:0.20837	validation_1-logloss:0.67361	validation_1-error:0.21104
[5]	validation_0-logloss:0.66939	validation_0-error:0.20823	validation_1-logloss:0.66992	validation_1-error:0.20932
[6]	validation_0-logloss:0.66562	validation_0-error:0.20650	validation_1-logloss:0.66619	validation_1-error:0.20874
[7]	validation_0-logloss:0.66200	validation_0-error:0.20636	validation_1-logloss:0.66269	validation_1-error:0.20989
[8]	validation_0-logloss:0.65834	validation_0-error:0.20506	validation_1

In [95]:
# Make predictions on validation set
# (compared against y_val by the metric cells that follow)
y_pred = model.predict(X_val)

In [96]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Print model performance
# Per-class precision/recall/F1 on the held-out validation rows
print("\nModel Performance on Validation Set:")
print("====================================")
print("\nClassification Report:")
print(classification_report(y_val, y_pred))


Model Performance on Validation Set:

Classification Report:
              precision    recall  f1-score   support

       False       0.82      0.75      0.79       863
        True       0.78      0.84      0.81       876

    accuracy                           0.80      1739
   macro avg       0.80      0.80      0.80      1739
weighted avg       0.80      0.80      0.80      1739



In [97]:
# Confusion matrix of validation labels (first argument) against predictions (second)
print("\nConfusion Matrix:")
print(confusion_matrix(y_val, y_pred))



Confusion Matrix:
[[651 212]
 [139 737]]


In [98]:
# Overall validation accuracy
print("\nAccuracy Score:", accuracy_score(y_val, y_pred))



Accuracy Score: 0.7981598619896493


In [99]:
import seaborn as sns
# Pair each feature name with the fitted model's importance score, highest first
feature_importance = (
    pd.DataFrame({'feature': X.columns, 'importance': model.feature_importances_})
    .sort_values('importance', ascending=False)
)

# # Plot feature importance
# plt.figure(figsize=(12, 6))
# sns.barplot(x='importance', y='feature', data=feature_importance)
# plt.title('Feature Importance')
# plt.xlabel('Importance Score')
# plt.ylabel('Features')
# plt.tight_layout()
# plt.show()
print(feature_importance)

                      feature  importance
9                 AvgSpending    0.297857
8               TotalSpending    0.268369
10           HomePlanet_Earth    0.173307
4                   FoodCourt    0.062821
6                         Spa    0.041132
7                      VRDeck    0.034233
0                   CryoSleep    0.030023
12            HomePlanet_Mars    0.024751
11          HomePlanet_Europa    0.014971
5                ShoppingMall    0.014094
3                 RoomService    0.013842
1                         Age    0.009048
15    Destination_TRAPPIST-1e    0.008740
13    Destination_55 Cancri e    0.005589
14  Destination_PSO J318.5-22    0.001187
2                         VIP    0.000038


In [100]:
# Make predictions on test set
# (these become the Transported column of the submission frame)
test_predictions = model.predict(X_test)

In [101]:
# Preview the training feature matrix that the model was fitted on
X.head()

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,TotalSpending,AvgSpending,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e
0,-0.73277,0.711945,-0.153063,-0.333105,-0.281027,0.0,-0.270626,-0.263003,-0.465002,-0.465002,0.0,1.0,0.0,0.0,0.0,1.0
1,-0.73277,-0.334037,-0.153063,-0.168073,-0.275387,25.0,0.217158,-0.224205,-0.204939,-0.204939,1.0,0.0,0.0,0.0,0.0,1.0
2,-0.73277,2.036857,6.533255,-0.268001,1.959998,0.0,5.695623,-0.219796,3.332796,3.332796,0.0,1.0,0.0,0.0,0.0,1.0
3,-0.73277,0.293552,-0.153063,-0.333105,0.52301,371.0,2.687176,-0.092818,1.292526,1.292526,0.0,1.0,0.0,0.0,0.0,1.0
4,-0.73277,-0.891895,-0.153063,0.125652,-0.237159,151.0,0.231374,-0.26124,-0.121178,-0.121178,1.0,0.0,0.0,0.0,0.0,1.0


In [102]:
# Build the two-column submission frame: the raw test IDs alongside the
# predictions cast to boolean, as the submission format expects
submission = pd.DataFrame({'PassengerId': test_ids}).assign(
    Transported=test_predictions.astype(bool)
)

In [103]:
# Processed training features with the target re-attached as the last column
df_train_processed_with_target = X.assign(Transported=y)


In [104]:
# Create the output directory before the first write: to_csv does not create
# missing parent directories, so a fresh checkout without data-out/ would fail here.
os.makedirs('data-out', exist_ok=True)
df_train_processed_with_target.to_csv(f'data-out/train-processed-{current_date}.csv', index=False)

In [105]:
#df_train_processed_with_target.to_csv(f'data-out/train-processed-{current_date}.csv', index=False)

# Save processed test data
# (date-stamped file name, written without the index column)
X_test.to_csv(f'data-out/test-processed-{current_date}.csv', index=False)


In [106]:
# Save submission file
# (date-stamped file name, written without the index column)
submission.to_csv(f'data-out/predictions-{current_date}.csv', index=False)
print(f"Saved submission to: data-out/predictions-{current_date}.csv")


Saved submission to: data-out/predictions-2024-12-26.csv


In [107]:
# Print submission format verification
# Shape, a sample of rows, and the predicted class proportions
print("\nSubmission Format Verification:")
print("Shape:", submission.shape)
print("\nFirst few rows:")
print(submission.head())
print("\nValue counts for Transported:")
print(submission['Transported'].value_counts(normalize=True))

# Print feature importance summary
print("\nTop 10 most important features:")
print(feature_importance.head(10))

# Optional: Save feature importance
# (the full table is written, not only the top 10 printed above)
feature_importance.to_csv(f'data-out/feature_importance-{current_date}.csv', index=False)



Submission Format Verification:
Shape: (4277, 2)

First few rows:
  PassengerId  Transported
0     0013_01         True
1     0018_01        False
2     0019_01         True
3     0021_01         True
4     0023_01         True

Value counts for Transported:
Transported
True     0.55249
False    0.44751
Name: proportion, dtype: float64

Top 10 most important features:
              feature  importance
9         AvgSpending    0.297857
8       TotalSpending    0.268369
10   HomePlanet_Earth    0.173307
4           FoodCourt    0.062821
6                 Spa    0.041132
7              VRDeck    0.034233
0           CryoSleep    0.030023
12    HomePlanet_Mars    0.024751
11  HomePlanet_Europa    0.014971
5        ShoppingMall    0.014094


In [108]:
# Optional: rebuild the feature-importance table (same inputs as the earlier
# feature-importance cell) and show the ten highest-ranked features
feature_importance = pd.DataFrame(
    {'feature': X.columns, 'importance': model.feature_importances_}
)
feature_importance = feature_importance.sort_values(by='importance', ascending=False)

print("\nTop 10 most important features:")
print(feature_importance.head(10))


Top 10 most important features:
              feature  importance
9         AvgSpending    0.297857
8       TotalSpending    0.268369
10   HomePlanet_Earth    0.173307
4           FoodCourt    0.062821
6                 Spa    0.041132
7              VRDeck    0.034233
0           CryoSleep    0.030023
12    HomePlanet_Mars    0.024751
11  HomePlanet_Europa    0.014971
5        ShoppingMall    0.014094
