In [81]:
import numpy as np
import pandas as pd

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import *
from sklearn.preprocessing import *
from sklearn.metrics import *

from IPython.display import clear_output, display
from tqdm import tqdm, trange

import optuna
from optuna.samplers import TPESampler

pd.set_option('display.max_columns', None)

In [2]:
df_raw = {
    "train":pd.read_csv("/kaggle/input/spaceship-titanic/train.csv"),
    "test":pd.read_csv("/kaggle/input/spaceship-titanic/test.csv")
}

In [3]:
df_raw["train"].head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [4]:
df_raw["train"].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [5]:
df_raw["train"].describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,47.0,76.0,27.0,59.0,46.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


In [19]:
def checking(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    bool_cols = [col for col in df.columns if df[col].dtype == bool]
    df[bool_cols] = df[bool_cols].astype(np.int8) 
    total = len(df)
    check_df = pd.DataFrame(df.isnull().sum(), columns=['#NULLS'])      # Number of NULL columns
    check_df['%NULLS'] = round((check_df['#NULLS']/total)*100, 5)       # Percent of NULL columns
    check_df['#Unique_Valus'] = df.nunique()                            # Number of unique values

    cat_cols = [col for col in df.columns if (df[col].dtype == 'object') or df[col].dtype == 'category']
    uniques = []
    
    # List all unique values if 'object' type
    # Otherwise the range of values
    for col in df.columns:
        if col in cat_cols:
            uniques.append(set(df[col].dropna()))
        else:
            uniques.append(df[col].max() - df[col].min())

    check_df['Unique_Values/Range'] = uniques
    check_df["Dtypes"] = df.dtypes

    return check_df

checking(df_raw["train"])

Unnamed: 0,#NULLS,%NULLS,#Unique_Valus,Unique_Values/Range,Dtypes
PassengerId,0,0.0,8693,"{2940_01, 1402_01, 3915_01, 5750_01, 5414_01, ...",object
HomePlanet,201,2.31221,3,"{Europa, Mars, Earth}",object
CryoSleep,217,2.49626,2,"{False, True}",object
Cabin,199,2.2892,6560,"{F/1189/P, G/1148/S, F/1661/S, E/358/S, F/1112...",object
Destination,182,2.09364,3,"{TRAPPIST-1e, 55 Cancri e, PSO J318.5-22}",object
Age,179,2.05913,80,79.0,float64
VIP,203,2.33521,2,"{False, True}",object
RoomService,181,2.08214,1273,14327.0,float64
FoodCourt,183,2.10514,1507,29813.0,float64
ShoppingMall,208,2.39273,1115,23492.0,float64


In [37]:
def feature_engineering(df):
    df = {
        "train":df["train"].copy(),
        "test":df["test"].copy()
    }
    for tt in df.keys():
        # Разделить Cabin на три признака
        df[tt].fillna({"Cabin":"nan/-1/nan"}, inplace=True)
        df[tt][["Deck", "CabinNum", "Num"]] = df[tt]["Cabin"].str.split("/", expand=True)
        df[tt]["CabinNum"] = df[tt]["CabinNum"].astype(np.int32)
        bins = [-1, 300, 600, 900, 1200, 1500, 1800, np.inf]
        df[tt]['CabinNumBin'] = pd.cut(df[tt]['CabinNum'], bins=bins, labels=False).astype(object)
        df[tt].loc[df[tt]["Deck"]=='nan', ["Deck", "CabinNum", "Num", "CabinNumBin"]] = np.nan
    
        # Разделить Name на имя и фамилию
        df[tt].fillna({"Name":"nan nan"}, inplace=True)
        df[tt][["FirstName", "SecondName"]] = df[tt]["Name"].str.split(expand=True)
    
        num_feats = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
        # Сумма денежных затрат / тратил ли пассажир деньги
        df[tt]['TotalSpent']=df[tt][num_feats].sum(axis=1)
        df[tt]['NoSpent']=(df[tt]['TotalSpent']==0).astype(int)
    
        # Извлечь группу пассажира
        df[tt]['Group'] = df[tt]['PassengerId'].apply(lambda x: x.split('_')[0]).astype(np.int32)
        
    for tt in df.keys():
        # Размер семьи
        df[tt]['FamilySize']=df[tt]['SecondName'].map(
            lambda x: pd.concat(
                [df["train"]['SecondName'],df["test"]['SecondName']]
            ).value_counts()[x]
        )

        # Вернуть значения nan
        df[tt].loc[df[tt]['SecondName']=='nan', ['FirstName', 'SecondName', 'FamilySize']] = np.nan        
    
        # Размер группы
        df[tt]['GroupSize']=df[tt]['Group'].map(
            lambda x: pd.concat(
                [df["train"]['Group'], df["test"]['Group']]
            ).value_counts()[x]
        )
        
        df[tt].drop("PassengerId", axis=1, inplace=True)
    
    return df

def missing_values(df):
    df = {
        "train":df["train"].copy(),
        "test":df["test"].copy()
    }
    concat_df = pd.concat([df["train"].drop("Transported", axis=1), df["test"]])
    cat_cols = [col for col in df["test"].columns if df["test"][col].dtype in ['object', 'category']] + ["CabinNum"]
    num_cols = [col for col in df["test"].columns if df["test"][col].dtype in [int, float]]
    for tt in df.keys():
        # Категориальные признаки заполним модой
        for col in cat_cols:
            mode_value = concat_df[col].mode()[0]
            df[tt].fillna({col:mode_value}, inplace=True)
        # Остальные - средним
        for col in num_cols:
            median_value = concat_df[col].median()
            df[tt].fillna({col:median_value}, inplace=True)
            
        df[tt]["CabinNum"] = df[tt]["CabinNum"].astype(np.int32) 
            
    return df

def type_assignment(df):
    cat_cols = ["CryoSleep", "VIP", "NoSpent"]
    for tt in df.keys():
        for col in cat_cols:
            df[tt][col] = df[tt][col].astype(object)
            
        df[tt]["CabinNumBin"] = df[tt]["CabinNumBin"].astype(str).astype(object)
            
    return df

In [38]:
%%time
df = feature_engineering(df_raw)
df = missing_values(df)
df = type_assignment(df)

CPU times: user 9.52 ms, sys: 3 µs, total: 9.53 ms
Wall time: 9.25 ms


In [39]:
checking(df["train"])

Unnamed: 0,#NULLS,%NULLS,#Unique_Valus,Unique_Values/Range,Dtypes
HomePlanet,0,0.0,3,"{Europa, Mars, Earth}",object
CryoSleep,0,0.0,2,"{False, True}",object
Cabin,0,0.0,6561,"{F/1189/P, G/1148/S, F/1661/S, E/358/S, F/1112...",object
Destination,0,0.0,3,"{TRAPPIST-1e, 55 Cancri e, PSO J318.5-22}",object
Age,0,0.0,80,79.0,float64
VIP,0,0.0,2,"{False, True}",object
RoomService,0,0.0,1273,14327.0,float64
FoodCourt,0,0.0,1507,29813.0,float64
ShoppingMall,0,0.0,1115,23492.0,float64
Spa,0,0.0,1327,22408.0,float64


In [29]:
df["train"].head()

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Deck,CabinNum,Num,CabinNumBin,FirstName,SecondName,TotalSpent,NoSpent,Group,FamilySize,GroupSize
0,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,B,0,P,0.0,Maham,Ofracculy,0.0,1,1,3.0,1
1,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,F,0,S,0.0,Juanna,Vines,736.0,0,2,4.0,1
2,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,A,0,S,0.0,Altark,Susent,10383.0,0,3,7.0,2
3,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,A,0,S,0.0,Solam,Susent,5176.0,0,3,7.0,2
4,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,F,1,S,0.0,Willy,Santantines,1091.0,0,4,9.0,1


### Modeling

In [73]:
def objective_catboost(trial):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    
    catboost_params = {
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 5e-1, log=True),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_float(';2_leaf_reg', 1.0, 10.0),
        'bootstrap_type': 'Bernoulli',
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 20, 200),
        'random_strength': trial.suggest_float('random_strength', 0.0, 1.0),
        'n_estimators': 1000,
        'early_stopping_rounds': 200,
        'eval_metric':'Accuracy',
        'objective': 'Logloss',
        'cat_features': cat_features
    }
    
    clf = CatBoostClassifier(**catboost_params, verbose=200)
    clf.fit(X_train, y_train, eval_set=(X_test, y_test))
    preds = clf.predict(X_test)
    acc = accuracy_score(y_test, preds)
    clear_output(wait=True)
    
    return acc

study = optuna.create_study(
    study_name='catboost_study',
    sampler=TPESampler(),
    direction='maximize'
)
study.optimize(objective_catboost, n_trials=100)

[I 2024-10-01 18:25:59,194] Trial 99 finished with value: 0.8102357676825762 and parameters: {'learning_rate': 0.1350515131608797, 'depth': 8, ';2_leaf_reg': 6.698348490196805, 'min_data_in_leaf': 97, 'random_strength': 0.3934461171174981}. Best is trial 57 with value: 0.8378378378378378.


In [74]:
print(study.best_params)

{'learning_rate': 0.14542015362291874, 'depth': 9, ';2_leaf_reg': 7.8389363230528195, 'min_data_in_leaf': 20, 'random_strength': 0.79416319211319}


In [80]:
X_lgbm = X.copy()
X_lgbm[cat_features] = X_lgbm[cat_features].astype('category')

def objective_lgbm(trial):
    clear_output(wait=True)
    
    X_train, X_test, y_train, y_test = train_test_split(X_lgbm, y, test_size=0.2, random_state=42)
    
    lgb_params = {
        'objective': 'classification',
        'metric': 'accuracy',
        'boosting_type': 'gbdt',
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1e-1, log=True),
        'max_depth': trial.suggest_int('max_depth', 5, 48),
        'num_leaves': trial.suggest_int('num_leaves', 20, 300),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-3, 5.0, log=True),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.1, 1.0),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.1, 1.0),
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-3, 2.0, log=True),
        'max_bin': trial.suggest_int('max-bin', 128, 4096, log=True),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 20, 200),
    }
    
    lgb = LGBMClassifier(**lgb_params, verbose=-1, n_estimators=1000)
    lgb.fit(X_train, y_train)
    preds = lgb.predict(X_test)
    acc = accuracy_score(y_test, preds)
    return acc

study_lgb = optuna.create_study(sampler=TPESampler(), direction='maximize')
study_lgb.optimize(objective_lgbm, n_trials=100)

[I 2024-10-01 18:43:55,526] Trial 99 finished with value: 0.8067855089131685 and parameters: {'learning_rate': 0.023491412660411944, 'max_depth': 23, 'num_leaves': 260, 'lambda_l1': 3.2677700986548928, 'bagging_fraction': 0.6190769187962877, 'feature_fraction': 0.8479560358618055, 'min_child_weight': 0.4289837575850716, 'max-bin': 205, 'min_data_in_leaf': 152}. Best is trial 83 with value: 0.8125359401955147.


In [82]:
print(study_lgb.best_params)

{'learning_rate': 0.031570630548169665, 'max_depth': 35, 'num_leaves': 48, 'lambda_l1': 1.5087638666381897, 'bagging_fraction': 0.7264272503098088, 'feature_fraction': 0.9139516903089234, 'min_child_weight': 0.8894170152575246, 'max-bin': 195, 'min_data_in_leaf': 156}


In [87]:
X_xg = X.copy()
X_xg[cat_features] = X_xg[cat_features].astype('category')

def objective_xgboost(trial):
    X_train, X_test, y_train, y_test = train_test_split(X_xg, y, test_size=0.2, random_state=42)
    
    xgboost_params = {
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 5e-1, log=True),
        'max_depth': trial.suggest_int('max_depth', 4, 10),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'n_estimators': 1000,
        'early_stopping_rounds': 200,
    }
    
    xgb = XGBClassifier(**xgboost_params, enable_categorical=True, verbosity=1, random_state=42)
    xgb.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=100)
    preds = xgb.predict(X_test)
    acc = accuracy_score(y_test, preds)
    print(acc)
    clear_output(wait=True)
    
    return acc

study_xg = optuna.create_study(
    study_name='xgboost_study',
    sampler=TPESampler(),
    direction='maximize'
)
study_xg.optimize(objective_xgboost, n_trials=100)

[I 2024-10-01 19:21:29,302] Trial 99 finished with value: 0.80448533640023 and parameters: {'learning_rate': 0.02202005811158307, 'max_depth': 8, 'reg_alpha': 0.009035053026971118, 'reg_lambda': 0.034998513969663525, 'min_child_weight': 6, 'subsample': 0.7408774142869655, 'colsample_bytree': 0.8809656038287424}. Best is trial 41 with value: 0.8131109833237493.


In [88]:
print(study_xg.best_params)

{'learning_rate': 0.07067649919693347, 'max_depth': 8, 'reg_alpha': 0.25003776852432485, 'reg_lambda': 1.2555604193398808, 'min_child_weight': 6, 'subsample': 0.8686596531037193, 'colsample_bytree': 0.9479778423456572}
