In [1]:
import numpy as np
import pandas as pd

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import *
from sklearn.preprocessing import *
from sklearn.metrics import *

from IPython.display import clear_output, display
from tqdm import tqdm, trange

import optuna
from optuna.samplers import TPESampler

pd.set_option('display.max_columns', None)

In [2]:
df_raw = {
    "train":pd.read_csv("/kaggle/input/spaceship-titanic/train.csv"),
    "test":pd.read_csv("/kaggle/input/spaceship-titanic/test.csv")
}

In [3]:
def checking(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    bool_cols = [col for col in df.columns if df[col].dtype == bool]
    df[bool_cols] = df[bool_cols].astype(np.int8) 
    total = len(df)
    check_df = pd.DataFrame(df.isnull().sum(), columns=['#NULLS'])      # Number of NULL columns
    check_df['%NULLS'] = round((check_df['#NULLS']/total)*100, 5)       # Percent of NULL columns
    check_df['#Unique_Valus'] = df.nunique()                            # Number of unique values

    cat_cols = [col for col in df.columns if (df[col].dtype == 'object') or df[col].dtype == 'category']
    uniques = []
    
    # List all unique values if 'object' type
    # Otherwise the range of values
    for col in df.columns:
        if col in cat_cols:
            uniques.append(set(df[col].dropna()))
        else:
            uniques.append(df[col].max() - df[col].min())

    check_df['Unique_Values/Range'] = uniques
    check_df["Dtypes"] = df.dtypes

    return check_df

checking(df_raw["train"])

Unnamed: 0,#NULLS,%NULLS,#Unique_Valus,Unique_Values/Range,Dtypes
PassengerId,0,0.0,8693,"{5034_01, 9069_04, 0733_01, 3441_01, 0379_01, ...",object
HomePlanet,201,2.31221,3,"{Earth, Europa, Mars}",object
CryoSleep,217,2.49626,2,"{False, True}",object
Cabin,199,2.2892,6560,"{G/1253/S, F/14/P, F/968/P, B/121/S, F/1366/P,...",object
Destination,182,2.09364,3,"{TRAPPIST-1e, 55 Cancri e, PSO J318.5-22}",object
Age,179,2.05913,80,79.0,float64
VIP,203,2.33521,2,"{False, True}",object
RoomService,181,2.08214,1273,14327.0,float64
FoodCourt,183,2.10514,1507,29813.0,float64
ShoppingMall,208,2.39273,1115,23492.0,float64


In [4]:
def feature_engineering(df):
    df = {
        "train":df["train"].copy(),
        "test":df["test"].copy()
    }
    for tt in df.keys():
        # Разделить Cabin на три признака
        df[tt].fillna({"Cabin":"nan/-1/nan"}, inplace=True)
        df[tt][["Deck", "CabinNum", "Num"]] = df[tt]["Cabin"].str.split("/", expand=True)
        df[tt]["CabinNum"] = df[tt]["CabinNum"].astype(np.int32)
        bins = [-1, 300, 600, 900, 1200, 1500, 1800, np.inf]
        df[tt]['CabinNumBin'] = pd.cut(df[tt]['CabinNum'], bins=bins, labels=False).astype(object)
        df[tt].loc[df[tt]["Deck"]=='nan', ["Deck", "CabinNum", "Num", "CabinNumBin"]] = np.nan
    
        # Разделить Name на имя и фамилию
        df[tt].fillna({"Name":"nan nan"}, inplace=True)
        df[tt][["FirstName", "SecondName"]] = df[tt]["Name"].str.split(expand=True)
    
        num_feats = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
        # Сумма денежных затрат / тратил ли пассажир деньги
        df[tt]['TotalSpent']=df[tt][num_feats].sum(axis=1)
        df[tt]['NoSpent']=(df[tt]['TotalSpent']==0).astype(int)
    
        # Извлечь группу пассажира
        df[tt]['Group'] = df[tt]['PassengerId'].apply(lambda x: x.split('_')[0]).astype(np.int32)
        
    for tt in df.keys():
        # Размер семьи
        df[tt]['FamilySize']=df[tt]['SecondName'].map(
            lambda x: pd.concat(
                [df["train"]['SecondName'],df["test"]['SecondName']]
            ).value_counts()[x]
        )

        # Вернуть значения nan
        df[tt].loc[df[tt]['SecondName']=='nan', ['FirstName', 'SecondName', 'FamilySize']] = np.nan        
    
        # Размер группы
        df[tt]['GroupSize']=df[tt]['Group'].map(
            lambda x: pd.concat(
                [df["train"]['Group'], df["test"]['Group']]
            ).value_counts()[x]
        )
        
        df[tt].drop("PassengerId", axis=1, inplace=True)
    
    return df

def missing_values(df):
    df = {
        "train":df["train"].copy(),
        "test":df["test"].copy()
    }
    concat_df = pd.concat([df["train"].drop("Transported", axis=1), df["test"]])
    cat_cols = [col for col in df["test"].columns if df["test"][col].dtype in ['object', 'category']] + ["CabinNum"]
    num_cols = [col for col in df["test"].columns if df["test"][col].dtype in [int, float]]
    for tt in df.keys():
        # Категориальные признаки заполним модой
        for col in cat_cols:
            mode_value = concat_df[col].mode()[0]
            df[tt].fillna({col:mode_value}, inplace=True)
        # Остальные - средним
        for col in num_cols:
            median_value = concat_df[col].median()
            df[tt].fillna({col:median_value}, inplace=True)
            
        df[tt]["CabinNum"] = df[tt]["CabinNum"].astype(np.int32) 
            
    return df

def type_assignment(df):
    cat_cols = ["CryoSleep", "VIP", "NoSpent"]
    for tt in df.keys():
        for col in cat_cols:
            df[tt][col] = df[tt][col].astype(object)
            
        df[tt]["CabinNumBin"] = df[tt]["CabinNumBin"].astype(str).astype(object)
            
    return df

In [5]:
%%time
df = feature_engineering(df_raw)
df = missing_values(df)
df = type_assignment(df)

CPU times: user 1min 12s, sys: 28 ms, total: 1min 12s
Wall time: 1min 12s


  df[tt].fillna({col:mode_value}, inplace=True)
  df[tt].fillna({col:mode_value}, inplace=True)


In [6]:
X = df["train"].drop('Transported', axis=1).copy()
y = df["train"]['Transported'].copy().astype(int)

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)

cat_features = [col for col in X.columns if X[col].dtype in [object, 'category']]
print(cat_features)

['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP', 'Name', 'Deck', 'Num', 'CabinNumBin', 'FirstName', 'SecondName', 'NoSpent']


### CatBoost

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

catboost_params = {
    'learning_rate': 0.14542015362291874,
    'depth': 9,
    'l2_leaf_reg': 7.8389363230528195,
    'min_data_in_leaf': 20,
    'random_strength': 0.79416319211319,
    'n_estimators': 1000,
    'early_stopping_rounds': 500,
    'eval_metric':'Accuracy',
    'objective': 'Logloss',
    'cat_features': cat_features
}

cat_clf = CatBoostClassifier(**catboost_params, verbose=200, random_state=42)
cat_clf.fit(X_train, y_train, eval_set=(X_test, y_test))
preds_cat = cat_clf.predict(X_test)
acc_cat = accuracy_score(y_test, preds_cat)

0:	learn: 0.7654587	test: 0.7418056	best: 0.7418056 (0)	total: 55.5ms	remaining: 55.4s
200:	learn: 0.9466494	test: 0.8056354	best: 0.8108108 (195)	total: 10.4s	remaining: 41.5s
400:	learn: 0.9879206	test: 0.8096607	best: 0.8108108 (195)	total: 22s	remaining: 32.9s
600:	learn: 0.9978430	test: 0.8073606	best: 0.8142611 (486)	total: 32.8s	remaining: 21.8s
800:	learn: 0.9992810	test: 0.8010351	best: 0.8142611 (486)	total: 43.4s	remaining: 10.8s
Stopped by overfitting detector  (500 iterations wait)

bestTest = 0.8142610696
bestIteration = 486

Shrink model to first 487 iterations.


In [24]:
print(acc_cat)

0.8142610695802185


### LGBM

In [25]:
X_lgbm = X.copy()
X_lgbm[cat_features] = X_lgbm[cat_features].astype('category')
X_train, X_test, y_train, y_test = train_test_split(X_lgbm, y, random_state=42, test_size=0.2)

lgbm_params = {
    'learning_rate': 0.031570630548169665,
    'max_depth': 35,
    'num_leaves': 48,
    'lambda_l1': 1.5087638666381897,
    'bagging_fraction': 0.7264272503098088,
    'feature_fraction': 0.9139516903089234,
    'min_child_weight': 0.8894170152575246,
    'max_bin': 195,
    'min_data_in_leaf': 156,
    'metric': 'accuracy',
    'boosting_type': 'gbdt'
}

lgbm_clf = LGBMClassifier(**lgbm_params, seed=42)
lgbm_clf.fit(X_train, y_train, eval_set=[(X_test, y_test)])
preds_lgbm = lgbm_clf.predict(X_test)
acc_lgbm = accuracy_score(y_test, preds_lgbm)

[LightGBM] [Info] Number of positive: 3500, number of negative: 3454
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002170 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4316
[LightGBM] [Info] Number of data points in the train set: 6954, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503307 -> initscore=0.013230
[LightGBM] [Info] Start training from score 0.013230


In [26]:
print(acc_lgbm)

0.7993099482461185


### XGBoost

In [28]:
X_xg = X.copy()
X_xg[cat_features] = X_xg[cat_features].astype('category')
X_train, X_test, y_train, y_test = train_test_split(X_xg, y, random_state=42, test_size=0.2)

xg_params = {
    'learning_rate': 0.07067649919693347,
    'max_depth': 8,
    'reg_alpha': 0.25003776852432485,
    'reg_lambda': 1.2555604193398808,
    'min_child_weight': 6,
    'subsample': 0.8686596531037193,
    'colsample_bytree': 0.9479778423456572,
    'n_estimators': 172,
    'verbosity': 1,
    'enable_categorical': True,
    'early_stopping_rounds': 100
}

clf_xg = XGBClassifier(**xg_params, random_state=42)
clf_xg.fit(X_train, y_train, eval_set=[(X_test, y_test)])
preds_xg = clf_xg.predict(X_test)
acc_xg = accuracy_score(y_test, preds_xg)

print("Accuracy: ", acc_xg)

[0]	validation_0-logloss:0.66233
[1]	validation_0-logloss:0.63595
[2]	validation_0-logloss:0.61351
[3]	validation_0-logloss:0.59271
[4]	validation_0-logloss:0.57407
[5]	validation_0-logloss:0.55762
[6]	validation_0-logloss:0.54329
[7]	validation_0-logloss:0.53012
[8]	validation_0-logloss:0.51828
[9]	validation_0-logloss:0.50769
[10]	validation_0-logloss:0.49828
[11]	validation_0-logloss:0.48961
[12]	validation_0-logloss:0.48183
[13]	validation_0-logloss:0.47473
[14]	validation_0-logloss:0.46867
[15]	validation_0-logloss:0.46312
[16]	validation_0-logloss:0.45796
[17]	validation_0-logloss:0.45382
[18]	validation_0-logloss:0.44991
[19]	validation_0-logloss:0.44546
[20]	validation_0-logloss:0.44151
[21]	validation_0-logloss:0.43722
[22]	validation_0-logloss:0.43399
[23]	validation_0-logloss:0.43087
[24]	validation_0-logloss:0.42873
[25]	validation_0-logloss:0.42621
[26]	validation_0-logloss:0.42359
[27]	validation_0-logloss:0.42130
[28]	validation_0-logloss:0.41895
[29]	validation_0-loglos

In [29]:
from sklearn.linear_model import LogisticRegression

X_ = X.copy()
X_[cat_features] = X_[cat_features].astype('category')
X_train, X_test, y_train, y_test = train_test_split(X_, y, random_state=42, test_size=0.05)

preds_cat_train = cat_clf.predict(X_train)
preds_lgbm_train = lgbm_clf.predict(X_train)
preds_xg_train = clf_xg.predict(X_train)

stacked_predictions_train = np.column_stack(
    (
        cat_clf.predict(X_train),
        lgbm_clf.predict(X_train),
        clf_xg.predict(X_train)
    )
)

stacked_predictions_test = np.column_stack(
    (
        cat_clf.predict(X_test),
        lgbm_clf.predict(X_test),
        clf_xg.predict(X_test)
    )
)




In [30]:
meta_model = LogisticRegression()
meta_model.fit(stacked_predictions_train, y_train)
preds_meta = meta_model.predict(stacked_predictions_test)
acc_meta = accuracy_score(y_test, preds_meta)
print(acc_meta)

0.7954022988505747


In [31]:
print(meta_model.coef_)
print(meta_model.intercept_)

[[ 2.65998232 -0.54629274  2.49548875]]
[-2.29475225]


In [32]:
X_TEST = df["test"].copy()
X_TEST[cat_features] = X_TEST[cat_features].astype('category')

stacked_predictions = np.column_stack(
    (
        cat_clf.predict(X_TEST),
        lgbm_clf.predict(X_TEST),
        clf_xg.predict(X_TEST)
    )
)



In [33]:
y_predicted = meta_model.predict(stacked_predictions)

In [34]:
sample_sub = pd.concat([df_raw["test"].PassengerId, pd.Series(y_predicted).map(bool)], axis=1)
sample_sub = sample_sub.rename(columns={0:'Transported'})
sample_sub.to_csv("submission.csv", index=False)
sample_sub.head()

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True
