In [10]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

for df in [train, test]:
    df[['Deck', _, 'Side']] = df['Cabin'].str.split('/', expand=True)

spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
for df in [train, test]:
    df['TotalSpending'] = np.log1p(df[spend_cols].sum(axis=1))
    df['SpendingWhileAwake'] = (~df['CryoSleep'].fillna(False)) * df[spend_cols].sum(axis=1)

for df in [train, test]:
    df['VIP*Spend'] = df['VIP'].fillna(False).astype(int) * df['TotalSpending']
    df['Age*Spend'] = df['Age'].fillna(df['Age'].median()) * df['TotalSpending']

drop_cols = ['PassengerId', 'Name', 'Cabin']
X = train.drop(columns=drop_cols + ['Transported'])
y = train['Transported'].map({True: 1, False: 0})
X_test = test.drop(columns=drop_cols)

num_cols = X.select_dtypes(include='number').columns
cat_cols = X.select_dtypes(exclude='number').columns

for col in num_cols:
    X[col].fillna(X[col].median(), inplace=True)
    X_test[col].fillna(X[col].median(), inplace=True)

for col in cat_cols:
    X[col].fillna(X[col].mode()[0], inplace=True)
    X_test[col].fillna(X[col].mode()[0], inplace=True)

X = pd.get_dummies(X)
X_test = pd.get_dummies(X_test)
X_test, X = X_test.align(X, join='left', axis=1, fill_value=0)

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

model = lgb.LGBMClassifier(
    n_estimators=500,
    learning_rate=0.03,
    num_leaves=64,
    max_depth=10,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

model.fit(X_train, y_train)

val_preds = model.predict(X_val)
print("Validation Accuracy:", accuracy_score(y_val, val_preds))

test_preds = model.predict(X_test)
submission = pd.DataFrame({
    "PassengerId": test["PassengerId"],
    "Transported": test_preds.astype(bool)
})
submission.to_csv("submission5.csv", index=False)

[LightGBM] [Info] Number of positive: 3500, number of negative: 3454
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000468 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2199
[LightGBM] [Info] Number of data points in the train set: 6954, number of used features: 28
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503307 -> initscore=0.013230
[LightGBM] [Info] Start training from score 0.013230
Validation Accuracy: 0.7912593444508338


ImportError: cannot import name 'SequenceNotStr' from 'pandas._typing' (/Users/chaseungjun/anaconda3/lib/python3.11/site-packages/pandas/_typing.py)

In [1]:
0.80032

0.80032