In [131]:
import numpy as np
import pandas as pd

train_df = pd.read_csv('dataset/train.csv')

train_df

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


In [132]:
from sklearn.preprocessing import OneHotEncoder
# Let's get a baseline model

# Clean a bit
X = train_df.drop(columns=['Transported'])
y = train_df['Transported']

y = y.fillna(False)

X["VIP"] = X["VIP"].fillna(False).astype(bool)
X["CryoSleep"] = X["CryoSleep"].fillna(False).astype(bool)
X = X.drop(columns=['Name', 'Cabin', 'PassengerId'])


categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

for col in categorical_cols:
    X[col] = X[col].fillna('Unknown')
    
# Initialize OneHotEncoder with handle_unknown='ignore' for unseen categories
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first')

# Fit and transform categorical columns
X_categorical_encoded = encoder.fit_transform(X[categorical_cols])

# Get feature names for the encoded columns
feature_names = encoder.get_feature_names_out(categorical_cols)

# Create DataFrame with encoded features
X_categorical_df = pd.DataFrame(X_categorical_encoded, 
                                columns=feature_names, 
                                index=X.index)

# Drop original categorical columns and add encoded ones
X_numerical = X.drop(columns=categorical_cols)
X_numerical.fillna(0, inplace=True)  # Fill NaNs in numerical columns with 0

X = pd.concat([X_numerical, X_categorical_df], axis=1)

print(f"Categorical columns encoded: {categorical_cols}")
print(f"New encoded features: {len(feature_names)}")
print(f"Feature names: {list(feature_names)}")





Categorical columns encoded: ['HomePlanet', 'Destination']
New encoded features: 6
Feature names: ['HomePlanet_Europa', 'HomePlanet_Mars', 'HomePlanet_Unknown', 'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e', 'Destination_Unknown']


  X["VIP"] = X["VIP"].fillna(False).astype(bool)
  X["CryoSleep"] = X["CryoSleep"].fillna(False).astype(bool)


In [133]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Initialize the model
lr = LogisticRegression(random_state=42)

accuracy_scores = cross_val_score(lr, X, y, cv=5, scoring='accuracy')

print("=== BASELINE LOGISTIC REGRESSION RESULTS ===")
print(f"Accuracy:  {accuracy_scores.mean():.4f}")

=== BASELINE LOGISTIC REGRESSION RESULTS ===
Accuracy:  0.7881


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to sca

our baseline is 0.7881. Let's try to improve it with other models.

In [134]:
# Random forest
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

rf_cv_scores = cross_val_score(rf_model, X, y, cv=5, scoring='accuracy')
print(f"Accuracy:  {rf_cv_scores.mean():.4f}")

Accuracy:  0.7870


In [135]:
import xgboost as xgb

xgb_model = xgb.XGBClassifier(n_estimators=100, random_state=42, learning_rate=0.1)

xgb_cv_scores = cross_val_score(xgb_model, X, y, cv=5, scoring='accuracy')
print(f"Accuracy:  {xgb_cv_scores.mean():.4f}")

Accuracy:  0.7972


In [136]:
from sklearn.ensemble import GradientBoostingClassifier

gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42, learning_rate=0.1)

gb_cv_scores = cross_val_score(gb_model, X, y, cv=5, scoring='accuracy')
print(f"Accuracy:  {gb_cv_scores.mean():.4f}")

Accuracy:  0.7911


In [137]:
import lightgbm as lgb

lgb_model = lgb.LGBMClassifier(n_estimators=100, random_state=42, learning_rate=0.1, verbose=-1)

lgb_cv_scores = cross_val_score(lgb_model, X, y, cv=5, scoring='accuracy')

print(f"Accuracy:  {lgb_cv_scores.mean():.4f}")

Accuracy:  0.7982


In [138]:
# Let's feature engineer a bit more

X = train_df.drop(columns=['Transported'])
y = train_df['Transported']

y = y.fillna(False)

X["PassengerId_Category"] = X["PassengerId"].apply(lambda x: int(x.split('_')[1]))
X = X.drop(columns=['PassengerId'])

X["HomePlanet"] = X["HomePlanet"].fillna('Unknown')
X["CryoSleep"] = X["CryoSleep"].fillna(False).astype(int)

X["Cabin"] = X["Cabin"].fillna('Unknown/0/Unknown')
X["Cabin_0"] = X["Cabin"].apply(lambda x: x.split('/')[0])
X["Cabin_1"] = X["Cabin"].apply(lambda x: int(x.split('/')[1]))
X["Cabin_2"] = X["Cabin"].apply(lambda x: x.split('/')[2])
X = X.drop(columns=['Cabin'])

X["Destination"] = X["Destination"].fillna('Unknown')
X["VIP"] = X["VIP"].fillna(False).astype(int)
X["RoomService"] = X["RoomService"].fillna(0)
X["FoodCourt"] = X["FoodCourt"].fillna(0)
X["ShoppingMall"] = X["ShoppingMall"].fillna(0)
X["Spa"] = X["Spa"].fillna(0)
X["VRDeck"] = X["VRDeck"].fillna(0)



X['Age'] = train_df['Age'].fillna(train_df['Age'].median())
X['isAdult'] = (X['Age'] >= 18).astype(int)
X['isChild'] = (X['Age'] < 13).astype(int)

X["Name"] = X["Name"].fillna('Unknown Unknown')
X["Surname"] = X["Name"].apply(lambda x: x.split(' ')[0])
X= X.drop(columns=['Name'])

X["HasFamily"] = X["Surname"].map(X["Surname"].value_counts()) > 1 & (X["Surname"] != 'Unknown')
X = X.drop(columns=['Surname'])

# Keep individual spending + total
spending_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
X['TotalSpending'] = X[spending_cols].sum(axis=1)
X['isRich'] = (X['TotalSpending'] > X['TotalSpending'].quantile(0.8)).astype(int)
X['HasSpending'] = (X['TotalSpending'] > 0).astype(int)

X


  X["CryoSleep"] = X["CryoSleep"].fillna(False).astype(int)
  X["VIP"] = X["VIP"].fillna(False).astype(int)


Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,PassengerId_Category,Cabin_0,Cabin_1,Cabin_2,isAdult,isChild,HasFamily,TotalSpending,isRich,HasSpending
0,Europa,0,TRAPPIST-1e,39.0,0,0.0,0.0,0.0,0.0,0.0,1,B,0,P,1,0,True,0.0,0,0
1,Earth,0,TRAPPIST-1e,24.0,0,109.0,9.0,25.0,549.0,44.0,1,F,0,S,1,0,True,736.0,0,1
2,Europa,0,TRAPPIST-1e,58.0,1,43.0,3576.0,0.0,6715.0,49.0,1,A,0,S,1,0,False,10383.0,1,1
3,Europa,0,TRAPPIST-1e,33.0,0,0.0,1283.0,371.0,3329.0,193.0,2,A,0,S,1,0,True,5176.0,1,1
4,Earth,0,TRAPPIST-1e,16.0,0,303.0,70.0,151.0,565.0,2.0,1,F,1,S,0,0,True,1091.0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,Europa,0,55 Cancri e,41.0,1,0.0,6819.0,0.0,1643.0,74.0,1,A,98,P,1,0,True,8536.0,1,1
8689,Earth,1,PSO J318.5-22,18.0,0,0.0,0.0,0.0,0.0,0.0,1,G,1499,S,1,0,True,0.0,0,0
8690,Earth,0,TRAPPIST-1e,26.0,0,0.0,0.0,1872.0,1.0,0.0,1,G,1500,S,1,0,True,1873.0,0,1
8691,Europa,0,55 Cancri e,32.0,0,0.0,1049.0,0.0,353.0,3235.0,1,E,608,S,1,0,True,4637.0,1,1


In [139]:
# Let's hot encode again

categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first')

# Fit and transform categorical columns
X_categorical_encoded = encoder.fit_transform(X[categorical_cols])

# Get feature names for the encoded columns
feature_names = encoder.get_feature_names_out(categorical_cols)

# Create DataFrame with encoded features
X_categorical_df = pd.DataFrame(X_categorical_encoded, 
                                columns=feature_names, 
                                index=X.index)

# Drop original categorical columns and add encoded ones
X_numerical = X.drop(columns=categorical_cols)

X = pd.concat([X_numerical, X_categorical_df], axis=1)

X

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,PassengerId_Category,Cabin_1,...,Cabin_0_B,Cabin_0_C,Cabin_0_D,Cabin_0_E,Cabin_0_F,Cabin_0_G,Cabin_0_T,Cabin_0_Unknown,Cabin_2_S,Cabin_2_Unknown
0,0,39.0,0,0.0,0.0,0.0,0.0,0.0,1,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,24.0,0,109.0,9.0,25.0,549.0,44.0,1,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0,58.0,1,43.0,3576.0,0.0,6715.0,49.0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0,33.0,0,0.0,1283.0,371.0,3329.0,193.0,2,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0,16.0,0,303.0,70.0,151.0,565.0,2.0,1,1,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,0,41.0,1,0.0,6819.0,0.0,1643.0,74.0,1,98,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8689,1,18.0,0,0.0,0.0,0.0,0.0,0.0,1,1499,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
8690,0,26.0,0,0.0,0.0,1872.0,1.0,0.0,1,1500,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
8691,0,32.0,0,0.0,1049.0,0.0,353.0,3235.0,1,608,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [140]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Initialize the model
lr = LogisticRegression(random_state=42)

accuracy_scores = cross_val_score(lr, X, y, cv=5, scoring='accuracy')

print("=== BASELINE LOGISTIC REGRESSION RESULTS ===")
print(f"Accuracy:  {accuracy_scores.mean():.4f}")

# Worse than before with more features

=== BASELINE LOGISTIC REGRESSION RESULTS ===
Accuracy:  0.7848


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to sca

In [141]:
# Random forest
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

rf_cv_scores = cross_val_score(rf_model, X, y, cv=5, scoring='accuracy')
print(f"Accuracy:  {rf_cv_scores.mean():.4f}")

# Random forest is better than before

Accuracy:  0.7928


In [142]:
import xgboost as xgb

X_lgb = X.drop(columns=[
    'Age',           # Keep isAdult and isChild instead
    'TotalSpending', # Keep individual spending columns + isRich + HasSpending
    'isChild',       # Keep only isAdult (isChild is inverse relationship)
    'PassengerId_Category'  # Likely just noise
])


xgb_model = xgb.XGBClassifier(n_estimators=100, random_state=42, learning_rate=0.1)

xgb_cv_scores = cross_val_score(xgb_model, X_lgb, y, cv=5, scoring='accuracy')
print(f"Accuracy:  {xgb_cv_scores.mean():.4f}")

Accuracy:  0.7787


In [143]:
from sklearn.ensemble import GradientBoostingClassifier

gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42, learning_rate=0.1)

gb_cv_scores = cross_val_score(gb_model, X_lgb, y, cv=5, scoring='accuracy')
print(f"Accuracy:  {gb_cv_scores.mean():.4f}")

Accuracy:  0.7897


In [144]:
import lightgbm as lgb

lgb_model = lgb.LGBMClassifier(n_estimators=100, random_state=42, learning_rate=0.1, verbose=-1)

lgb_cv_scores = cross_val_score(lgb_model, X_lgb, y, cv=5, scoring='accuracy')

print(f"Accuracy:  {lgb_cv_scores.mean():.4f}")



Accuracy:  0.7811


In [145]:
# Let's tune hyperparameters for random forest here
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import numpy as np

# Use your feature-engineered dataset from cell 262e2002 and c169b220
# Make sure X and y are from your best Random Forest preprocessing

def objective(trial):
    # Define hyperparameter search space
    n_estimators = trial.suggest_int('n_estimators', 50, 500)
    max_depth = trial.suggest_int('max_depth', 3, 20)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])
    bootstrap = trial.suggest_categorical('bootstrap', [True, False])
    
    # Create model with suggested hyperparameters
    rf_model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        bootstrap=bootstrap,
        random_state=42,
        n_jobs=-1
    )
    
    # Use cross-validation to evaluate
    cv_scores = cross_val_score(rf_model, X, y, cv=5, scoring='accuracy')
    
    # Return mean accuracy
    return cv_scores.mean()

# Create study and optimize
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

# Print best results
print("Best trial:")
print(f"  Score: {study.best_value:.4f}")
print("  Params:")
for key, value in study.best_params.items():
    print(f"    {key}: {value}")

# Train final model with best parameters
best_rf = RandomForestClassifier(
    **study.best_params,
    random_state=42,
    n_jobs=-1
)

final_cv_scores = cross_val_score(best_rf, X, y, cv=5, scoring='accuracy')
print(f"\nFinal optimized Random Forest accuracy: {final_cv_scores.mean():.4f} (+/- {final_cv_scores.std() * 2:.4f})")

  from .autonotebook import tqdm as notebook_tqdm
[I 2025-07-04 01:56:24,539] A new study created in memory with name: no-name-cedf1f57-c292-41ec-a70d-5fdd01c08ae1
[I 2025-07-04 01:56:29,257] Trial 0 finished with value: 0.779138772001686 and parameters: {'n_estimators': 377, 'max_depth': 16, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': None, 'bootstrap': True}. Best is trial 0 with value: 0.779138772001686.
[I 2025-07-04 01:56:30,245] Trial 1 finished with value: 0.7818994422280175 and parameters: {'n_estimators': 80, 'max_depth': 13, 'min_samples_split': 16, 'min_samples_leaf': 10, 'max_features': None, 'bootstrap': True}. Best is trial 1 with value: 0.7818994422280175.
[I 2025-07-04 01:56:31,788] Trial 2 finished with value: 0.7371479846028728 and parameters: {'n_estimators': 305, 'max_depth': 3, 'min_samples_split': 5, 'min_samples_leaf': 6, 'max_features': 'sqrt', 'bootstrap': True}. Best is trial 1 with value: 0.7818994422280175.
[I 2025-07-04 01:56:33,991] Tria

Best trial:
  Score: 0.8000
  Params:
    n_estimators: 349
    max_depth: 20
    min_samples_split: 14
    min_samples_leaf: 8
    max_features: log2
    bootstrap: False

Final optimized Random Forest accuracy: 0.8000 (+/- 0.0316)


Random forest performs best but let's try to feature engineer for lgb.

In [146]:
# Let's feature engineer a bit more

X = train_df.drop(columns=['Transported'])
y = train_df['Transported']

y = y.fillna(False)

X["PassengerId_Category"] = X["PassengerId"].apply(lambda x: int(x.split('_')[1]))
X = X.drop(columns=['PassengerId'])

X["HomePlanet"] = X["HomePlanet"].fillna('Unknown')
X["CryoSleep"] = X["CryoSleep"].fillna(False).astype(int)

X["Cabin"] = X["Cabin"].fillna('Unknown/0/Unknown')
X["Cabin_0"] = X["Cabin"].apply(lambda x: x.split('/')[0])
X["Cabin_1"] = X["Cabin"].apply(lambda x: int(x.split('/')[1]))
X["Cabin_2"] = X["Cabin"].apply(lambda x: x.split('/')[2])
X = X.drop(columns=['Cabin'])

X["Destination"] = X["Destination"].fillna('Unknown')
X["VIP"] = X["VIP"].fillna(False).astype(int)
X["RoomService"] = X["RoomService"].fillna(0)
X["FoodCourt"] = X["FoodCourt"].fillna(0)
X["ShoppingMall"] = X["ShoppingMall"].fillna(0)
X["Spa"] = X["Spa"].fillna(0)
X["VRDeck"] = X["VRDeck"].fillna(0)



X['Age'] = train_df['Age'].fillna(train_df['Age'].median())
X['isAdult'] = (X['Age'] >= 18).astype(int)

X["Name"] = X["Name"].fillna('Unknown Unknown')
X["Surname"] = X["Name"].apply(lambda x: x.split(' ')[0])
X= X.drop(columns=['Name', 'Surname'])

# Keep individual spending + total
spending_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
X['TotalSpending'] = X[spending_cols].sum(axis=1)
X = X.drop(columns=spending_cols)


X


  X["CryoSleep"] = X["CryoSleep"].fillna(False).astype(int)
  X["VIP"] = X["VIP"].fillna(False).astype(int)


Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,PassengerId_Category,Cabin_0,Cabin_1,Cabin_2,isAdult,TotalSpending
0,Europa,0,TRAPPIST-1e,39.0,0,1,B,0,P,1,0.0
1,Earth,0,TRAPPIST-1e,24.0,0,1,F,0,S,1,736.0
2,Europa,0,TRAPPIST-1e,58.0,1,1,A,0,S,1,10383.0
3,Europa,0,TRAPPIST-1e,33.0,0,2,A,0,S,1,5176.0
4,Earth,0,TRAPPIST-1e,16.0,0,1,F,1,S,0,1091.0
...,...,...,...,...,...,...,...,...,...,...,...
8688,Europa,0,55 Cancri e,41.0,1,1,A,98,P,1,8536.0
8689,Earth,1,PSO J318.5-22,18.0,0,1,G,1499,S,1,0.0
8690,Earth,0,TRAPPIST-1e,26.0,0,1,G,1500,S,1,1873.0
8691,Europa,0,55 Cancri e,32.0,0,1,E,608,S,1,4637.0


In [147]:
# Let's hot encode again

categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first')

# Fit and transform categorical columns
X_categorical_encoded = encoder.fit_transform(X[categorical_cols])

# Get feature names for the encoded columns
feature_names = encoder.get_feature_names_out(categorical_cols)

# Create DataFrame with encoded features
X_categorical_df = pd.DataFrame(X_categorical_encoded, 
                                columns=feature_names, 
                                index=X.index)

# Drop original categorical columns and add encoded ones
X_numerical = X.drop(columns=categorical_cols)

X = pd.concat([X_numerical, X_categorical_df], axis=1)

X

Unnamed: 0,CryoSleep,Age,VIP,PassengerId_Category,Cabin_1,isAdult,TotalSpending,HomePlanet_Europa,HomePlanet_Mars,HomePlanet_Unknown,...,Cabin_0_B,Cabin_0_C,Cabin_0_D,Cabin_0_E,Cabin_0_F,Cabin_0_G,Cabin_0_T,Cabin_0_Unknown,Cabin_2_S,Cabin_2_Unknown
0,0,39.0,0,1,0,1,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,24.0,0,1,0,1,736.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0,58.0,1,1,0,1,10383.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0,33.0,0,2,0,1,5176.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0,16.0,0,1,1,0,1091.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,0,41.0,1,1,98,1,8536.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8689,1,18.0,0,1,1499,1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
8690,0,26.0,0,1,1500,1,1873.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
8691,0,32.0,0,1,608,1,4637.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [148]:
import lightgbm as lgb

lgb_model = lgb.LGBMClassifier(n_estimators=100, random_state=42, learning_rate=0.1, verbose=-1)

lgb_cv_scores = cross_val_score(lgb_model, X, y, cv=5, scoring='accuracy')

print(f"Accuracy:  {lgb_cv_scores.mean():.4f}")

# Even a worse score!

Accuracy:  0.7231


In [None]:
# Let's start with different features
X = train_df.drop(columns=['Transported'])
y = train_df['Transported']

y = y.fillna(False)

X["VIP"] = X["VIP"].fillna(False).astype(bool)
X["CryoSleep"] = X["CryoSleep"].fillna(False).astype(bool)
X = X.drop(columns=['Name', 'Cabin', 'PassengerId'])  # Don't extract PassengerId_Category

categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

for col in categorical_cols:
    X[col] = X[col].fillna('Unknown')

# Add ONLY one or two simple features
spending_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
for col in spending_cols:
    X[col] = X[col].fillna(0)

# Add just TotalSpending - keep it simple
X['TotalSpending'] = X[spending_cols].sum(axis=1)

# OneHotEncoder
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first')
X_categorical_encoded = encoder.fit_transform(X[categorical_cols])
feature_names = encoder.get_feature_names_out(categorical_cols)
X_categorical_df = pd.DataFrame(X_categorical_encoded, columns=feature_names, index=X.index)

X_numerical = X.drop(columns=categorical_cols)
X_numerical.fillna(0, inplace=True)

X = pd.concat([X_numerical, X_categorical_df], axis=1)

# Test LightGBM
lgb_model = lgb.LGBMClassifier(n_estimators=100, random_state=42, learning_rate=0.1, verbose=-1)
lgb_cv_scores = cross_val_score(lgb_model, X, y, cv=5, scoring='accuracy')
print(f"LightGBM with minimal features: {lgb_cv_scores.mean():.4f}")

# still a bit worse than before

  X["VIP"] = X["VIP"].fillna(False).astype(bool)
  X["CryoSleep"] = X["CryoSleep"].fillna(False).astype(bool)


LightGBM with minimal features: 0.7944


In [150]:
# Let's tune hyper parameters for LightGBM for it's best feature set

X = train_df.drop(columns=['Transported'])
y = train_df['Transported']

y = y.fillna(False)

X["VIP"] = X["VIP"].fillna(False).astype(bool)
X["CryoSleep"] = X["CryoSleep"].fillna(False).astype(bool)
X = X.drop(columns=['Name', 'Cabin', 'PassengerId'])


categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

for col in categorical_cols:
    X[col] = X[col].fillna('Unknown')
    
# Initialize OneHotEncoder with handle_unknown='ignore' for unseen categories
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first')

# Fit and transform categorical columns
X_categorical_encoded = encoder.fit_transform(X[categorical_cols])

# Get feature names for the encoded columns
feature_names = encoder.get_feature_names_out(categorical_cols)

# Create DataFrame with encoded features
X_categorical_df = pd.DataFrame(X_categorical_encoded, 
                                columns=feature_names, 
                                index=X.index)

# Drop original categorical columns and add encoded ones
X_numerical = X.drop(columns=categorical_cols)
X_numerical.fillna(0, inplace=True)  # Fill NaNs in numerical columns with 0

X = pd.concat([X_numerical, X_categorical_df], axis=1)

print(f"Categorical columns encoded: {categorical_cols}")
print(f"New encoded features: {len(feature_names)}")
print(f"Feature names: {list(feature_names)}")

import optuna
import lightgbm as lgb
from sklearn.model_selection import cross_val_score
import numpy as np

# Use your baseline dataset from cell 66319113 (LightGBM's best performing features)
# Make sure X and y are from your baseline preprocessing

def objective(trial):
    # Define hyperparameter search space for LightGBM
    n_estimators = trial.suggest_int('n_estimators', 50, 500)
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3)
    max_depth = trial.suggest_int('max_depth', 3, 15)
    num_leaves = trial.suggest_int('num_leaves', 10, 300)
    min_child_samples = trial.suggest_int('min_child_samples', 5, 100)
    subsample = trial.suggest_float('subsample', 0.5, 1.0)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.5, 1.0)
    reg_alpha = trial.suggest_float('reg_alpha', 0, 10)
    reg_lambda = trial.suggest_float('reg_lambda', 0, 10)
    
    # Create model with suggested hyperparameters
    lgb_model = lgb.LGBMClassifier(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        num_leaves=num_leaves,
        min_child_samples=min_child_samples,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        reg_alpha=reg_alpha,
        reg_lambda=reg_lambda,
        random_state=42,
        verbose=-1,
        n_jobs=-1
    )
    
    # Use cross-validation to evaluate
    cv_scores = cross_val_score(lgb_model, X, y, cv=5, scoring='accuracy')
    
    # Return mean accuracy
    return cv_scores.mean()

# Create study and optimize
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

# Print best results
print("Best trial:")
print(f"  Score: {study.best_value:.4f}")
print("  Params:")
for key, value in study.best_params.items():
    print(f"    {key}: {value}")

# Train final model with best parameters
best_lgb = lgb.LGBMClassifier(
    **study.best_params,
    random_state=42,
    verbose=-1,
    n_jobs=-1
)

final_cv_scores = cross_val_score(best_lgb, X, y, cv=5, scoring='accuracy')
print(f"\nFinal optimized LightGBM accuracy: {final_cv_scores.mean():.4f} (+/- {final_cv_scores.std() * 2:.4f})")

  X["VIP"] = X["VIP"].fillna(False).astype(bool)
  X["CryoSleep"] = X["CryoSleep"].fillna(False).astype(bool)
[I 2025-07-04 02:03:24,916] A new study created in memory with name: no-name-73047aca-dffa-4969-9a0b-b7db26e10630


Categorical columns encoded: ['HomePlanet', 'Destination']
New encoded features: 6
Feature names: ['HomePlanet_Europa', 'HomePlanet_Mars', 'HomePlanet_Unknown', 'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e', 'Destination_Unknown']


[I 2025-07-04 02:03:26,364] Trial 0 finished with value: 0.7928246661077256 and parameters: {'n_estimators': 58, 'learning_rate': 0.06462998614849061, 'max_depth': 13, 'num_leaves': 161, 'min_child_samples': 42, 'subsample': 0.575741458724726, 'colsample_bytree': 0.8701438966029721, 'reg_alpha': 8.44799587942865, 'reg_lambda': 0.8086967809021817}. Best is trial 0 with value: 0.7928246661077256.
[I 2025-07-04 02:03:27,278] Trial 1 finished with value: 0.7953549220449302 and parameters: {'n_estimators': 234, 'learning_rate': 0.2743013385159064, 'max_depth': 3, 'num_leaves': 192, 'min_child_samples': 26, 'subsample': 0.7202988426058392, 'colsample_bytree': 0.955784024649541, 'reg_alpha': 5.449955328759412, 'reg_lambda': 2.2343533189215115}. Best is trial 1 with value: 0.7953549220449302.
[I 2025-07-04 02:03:29,259] Trial 2 finished with value: 0.7932839065346473 and parameters: {'n_estimators': 252, 'learning_rate': 0.04328128614313735, 'max_depth': 4, 'num_leaves': 158, 'min_child_sample

Best trial:
  Score: 0.8003
  Params:
    n_estimators: 249
    learning_rate: 0.10618779450804473
    max_depth: 11
    num_leaves: 189
    min_child_samples: 72
    subsample: 0.8578937100568457
    colsample_bytree: 0.7790784600563625
    reg_alpha: 5.33317093462979
    reg_lambda: 2.0619394101271924

Final optimized LightGBM accuracy: 0.8003 (+/- 0.0151)


In [155]:
# Let's train the final model with the best parameters

test_df = pd.read_csv('dataset/test.csv')

test_df["VIP"] = test_df["VIP"].fillna(False).astype(bool)
test_df["CryoSleep"] = test_df["CryoSleep"].fillna(False).astype(bool)
test_df = test_df.drop(columns=['Name', 'Cabin', 'PassengerId'])


categorical_cols = test_df.select_dtypes(include=['object']).columns.tolist()

for col in categorical_cols:
    test_df[col] = test_df[col].fillna('Unknown')

# Initialize OneHotEncoder with handle_unknown='ignore' for unseen categories
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first')

# Fit and transform categorical columns
test_categorical_encoded = encoder.fit_transform(test_df[categorical_cols])

# Get feature names for the encoded columns
test_feature_names = encoder.get_feature_names_out(categorical_cols)

# Create DataFrame with encoded features
test_categorical_df = pd.DataFrame(test_categorical_encoded, 
                                    columns=test_feature_names, 
                                    index=test_df.index)

# Drop original categorical columns and add encoded ones
test_numerical = test_df.drop(columns=categorical_cols)
test_numerical.fillna(0, inplace=True)  # Fill NaNs in numerical columns with 0

test_df = pd.concat([test_numerical, test_categorical_df], axis=1)


best_lgb.fit(X, y)
# Predict on the test set
predictions = best_lgb.predict(test_df)
# Prepare submission DataFrame

df_test_original = pd.read_csv('dataset/test.csv')
submission_df = pd.DataFrame({
    'PassengerId': df_test_original["PassengerId"],  # Assuming PassengerId starts from 1
    'Transported': predictions
})
# Save submission to CSV
submission_df.to_csv('submission.csv', index=False)

  test_df["VIP"] = test_df["VIP"].fillna(False).astype(bool)
  test_df["CryoSleep"] = test_df["CryoSleep"].fillna(False).astype(bool)
