The 9 new features (total, mean, std, max, min, median, ptp, q25, q75) are borrowed from PS4E5| OpenFE + Blending + Explain https://www.kaggle.com/code/trupologhelper/ps4e5-openfe-blending-explain. As discussed in this post https://www.kaggle.com/competitions/playground-series-s4e5/discussion/500032, the dataset is large, so a single train/validation split is used in place of k-fold cross-validation. I learned how to use Optuna for hyperparameter tuning from PS4E4 🏆| XGBoost+LIGHTGBM+CatBoost😊😊😊 https://www.kaggle.com/code/aaachen/ps4e4-xgboost-lightgbm-catboost in the previous playground competition, PS4E4 Regression with an Abalone Dataset.

The LightGBM parameters and their search ranges were chosen with reference to https://www.kaggle.com/code/aspillai/flood-prediction-regression-lightgbm-0-86931#Training-CV and https://www.kaggle.com/code/aspillai/flood-prediction-regression-lgb-xgb-cat-0-86933#Model-Training.

# 1. Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from warnings import filterwarnings;
filterwarnings('ignore');

from scipy import stats

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingRegressor

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

import optuna
from optuna.samplers import TPESampler

In [None]:
import logging
import lightgbm as lgb

class CustomLogger:
    """Logger adapter that forwards LightGBM messages to `logging`, minus warnings.

    The object exposes the `info` / `warning` / `error` methods that
    `lgb.register_logger` is given in this notebook. Warnings are discarded;
    info and error messages are forwarded to the 'lightgbm_custom' logger,
    whose level is set to ERROR.
    """

    def __init__(self):
        # Configure on construction so an instance is usable even when the
        # caller never invokes `init()` explicitly.
        self.init()

    def init(self):
        """(Re)create the underlying logger and set its level to ERROR.

        Kept as a public method because existing code calls `init()` after
        construction; calling it again simply rebinds the same named logger.
        """
        self.logger = logging.getLogger('lightgbm_custom')
        self.logger.setLevel(logging.ERROR)

    def info(self, message):
        """Forward *message* at INFO level (filtered out while the level is ERROR)."""
        self.logger.info(message)

    def warning(self, message):
        """Discard *message*; warnings are intentionally suppressed."""
        # Suppress warnings by not doing anything
        pass

    def error(self, message):
        """Forward *message* at ERROR level."""
        self.logger.error(message)


# Route LightGBM's log output through the custom logger so that warnings are dropped.
l = CustomLogger()
l.init()  # binds the 'lightgbm_custom' logger and sets its level to ERROR
lgb.register_logger(l)

# 2. Load data

In [None]:
# Competition train/test splits, plus an additional 'original' flood dataset
# that is appended to the training data in the next cell.
train = pd.read_csv('/kaggle/input/playground-series-s4e5/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s4e5/test.csv')
original = pd.read_csv('/kaggle/input/original/flood.csv')

In [None]:
# Drop the row identifier from the competition training data, then stack the
# original dataset underneath it with a fresh 0..n-1 index.
# NOTE(review): assumes `original` has no 'id' column and otherwise shares
# the training columns - confirm against the CSV.
train = pd.concat(
    [train.drop(columns='id'), original],
    axis=0,
    ignore_index=True,
)

# Keep the test ids for the submission file; the model must not see them.
test_ID = test['id']
test = test.drop(columns='id')

# 3. Feature Engineering

In [None]:
# Column labels of `test` at this point (id already dropped), captured before
# add_features appends derived columns; used as the inputs of the row statistics.
BASE_FEATURES = test.columns
def add_features(df, features=None):
    """Append nine row-wise summary statistics of the base features to *df*.

    The statistics are computed across the selected feature columns for each
    row and stored in the columns 'total', 'mean', 'std', 'max', 'min',
    'median', 'ptp' (max - min), 'q25' and 'q75'.

    Args:
        df: DataFrame containing the feature columns. It is modified in place.
        features: Column labels to aggregate over. Defaults to the module-level
            ``BASE_FEATURES`` when omitted.

    Returns:
        The same DataFrame object, with the new columns added.
    """
    cols = list(BASE_FEATURES if features is None else features)

    # Select the base columns once; the derived columns assigned below never
    # feed back into the statistics because this selection does not include them.
    base = df[cols]

    df['total'] = base.sum(axis=1)
    df['mean'] = base.mean(axis=1)
    df['std'] = base.std(axis=1)
    df['max'] = base.max(axis=1)
    df['min'] = base.min(axis=1)
    df['median'] = base.median(axis=1)
    # np.ptp instead of ndarray.ptp: the array method was removed in NumPy 2.0.
    df['ptp'] = np.ptp(base.to_numpy(), axis=1)
    df['q25'] = base.quantile(0.25, axis=1)
    df['q75'] = base.quantile(0.75, axis=1)

    return df

# Add the row-wise statistics to both frames so train and test share the same feature set.
train = add_features(train)
test = add_features(test)

In [None]:
# Standardise every feature column; the target column is left untouched.
train_index = train.index
test_index = test.index

# Target-free view of the training data, computed once and reused below.
train_features = train.drop(columns=['FloodProbability'])

# NOTE(review): the scaler is fitted on train and test together, so test-set
# statistics influence the transformation - confirm this is intended.
scaler = StandardScaler().fit(pd.concat([train_features, test]))

train_scaled = pd.DataFrame(
    scaler.transform(train_features),
    columns=train_features.columns,
    index=train_index,
)
test_scaled = pd.DataFrame(
    scaler.transform(test),
    columns=test.columns,
    index=test_index,
)

# Re-attach the unscaled target to the scaled training features.
train = pd.concat([train_scaled, train['FloodProbability']], axis=1)
test = test_scaled

# 4. Split data

In [None]:
# Separate the regression target from the feature matrix.
y_train = train['FloodProbability']
X_train = train.drop(columns='FloodProbability')

In [None]:
# The test frame holds features only (no target column), so it is used as-is.
X_test = test

In [None]:
# Hold out 20% of the training rows for validation. Stratifying on the target
# keeps the distribution of FloodProbability values the same in both parts.
# NOTE(review): stratification treats every distinct target value as a class;
# this presumably relies on FloodProbability taking a limited set of values,
# each occurring at least twice - confirm.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
    stratify=y_train,
)

# 5. Fine-tune models

In [None]:
def evaluation_metric(y, y_pred):
    """Return the R² score of predictions *y_pred* against true values *y*."""
    return r2_score(y, y_pred)

LightGBM

1. gbdt. Finding the optimal parameters is painstaking. I started by tuning "learning_rate" (score not recorded), then added "num_leaves" (0.87088), then "min_data_in_leaf" and "max_depth" (0.870948), then "n_estimators" and "min_split_gain" (0.87083), and finally "lambda_l1", "lambda_l2", "colsample_bytree" and "subsample" (0.870706). The first three runs each took about 1-2 hours; the remaining runs took much longer. According to the LightGBM parameter tuning guide https://lightgbm.readthedocs.io/en/latest/Parameters-Tuning.html, the most important parameters are "num_leaves", "min_data_in_leaf" and "max_depth". Together with "learning_rate", these cover the most influential factors here, so tuning this set is sufficient.

Hyperparameter tuning reference:
https://www.kaggle.com/code/zgrdenizelik/lgbm-0-86926-r2-score-explained#Hyperparameters-tuning

In [None]:
def lgbm_objective(trial):
    """Optuna objective: train LightGBM on the training split, return validation R².

    Args:
        trial: Optuna trial used to sample the tree-shape, regularisation and
            subsampling hyperparameters listed below.

    Returns:
        R² of the fitted model on (X_val, y_val); the study maximises this value.
    """
    params = {
        # Fixed settings, not part of the search.
        "objective": "regression",
        "boosting": "gbdt",
        "random_state": 42,
        "n_jobs": -1,
        "device": 'gpu',
        'learning_rate': 0.012,
        'n_estimators': 2000,
        # LightGBM only applies row subsampling when the bagging frequency is
        # non-zero; without this the tuned "subsample" value has no effect.
        "subsample_freq": 1,
        # Searched hyperparameters.
        "max_depth": trial.suggest_int("max_depth", 2, 13),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 20, 140),
        "lambda_l1": trial.suggest_float("lambda_l1", 0, 1),
        "lambda_l2": trial.suggest_float("lambda_l2", 0, 1),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "min_split_gain": trial.suggest_float("min_split_gain", 0, 1),
    }

    lgbm_reg = LGBMRegressor(**params)

    # No eval_set: without early stopping it only added per-iteration
    # evaluation cost and did not influence the returned score.
    lgbm_reg.fit(X_tr, y_tr)

    return evaluation_metric(y_val, lgbm_reg.predict(X_val))
    
# Seeded Tree-structured Parzen Estimator sampler drives the search.
sampler = TPESampler(seed=42)

# The objective returns R², so the study maximises it.
lgbm_study = optuna.create_study(
    study_name="LGBMRegressor",
    direction="maximize",
    sampler=sampler,
)

In [None]:
# Set to True to re-run the (slow) hyperparameter search.
TUNE = False
if TUNE:
    # 100 trials of the LightGBM objective.
    lgbm_study.optimize(lgbm_objective, n_trials=100)

    # Best hyperparameters found by the study.
    lgbm_best_params = lgbm_study.best_params

    print('=' * 50)
    print(lgbm_best_params)

In [None]:
# cv 0.870940 parameters
# Hard-coded parameter set used for the final model below: the same fixed
# settings as in lgbm_objective, plus one value for each hyperparameter that
# lgbm_objective searches over.
# NOTE(review): per the LightGBM docs, bagging is only applied when
# bagging_freq / subsample_freq is non-zero, which is not set here - confirm
# whether 'subsample' has any effect.
lgbm_params_1 = {
    "objective": "regression",
    "boosting": "gbdt",
    "random_state": 42,
    "n_jobs": -1,
    "device": 'gpu',
    'learning_rate' :  0.012, 
    'n_estimators': 2000,
    # Values for the hyperparameters searched in lgbm_objective
    'max_depth': 12, 
    'num_leaves': 109, 
    'min_data_in_leaf': 88, 
    'lambda_l1': 0.5296506093279638, 
    'lambda_l2': 0.5884899855294714, 
    'colsample_bytree': 0.6804587201173778, 
    'subsample': 0.7119148975720027, 
    'min_split_gain': 0.00023805431519668746
}

In [None]:
# Train on the training split and report R² on the held-out validation split.
lgbm_reg_1 = LGBMRegressor(**lgbm_params_1)
lgbm_reg_1.fit(X_tr, y_tr)

val_pred = lgbm_reg_1.predict(X_val)
evaluation_metric(y_val, val_pred)

In [None]:
# Importance of each feature as reported by the fitted model, ordered ascending.
feature_importance = lgbm_reg_1.feature_importances_
feature_names = X_tr.columns

sorted_indices = feature_importance.argsort()
sorted_importance = feature_importance[sorted_indices]
sorted_features = feature_names[sorted_indices]

# Horizontal bar chart, one bar per feature.
fig, ax = plt.subplots(figsize=(12, 8))
colors = plt.cm.Paired.colors[:len(sorted_features)]
ax.barh(sorted_features, sorted_importance, color=colors)
ax.set_xlabel('Importance', fontsize=12)
ax.set_ylabel('Feature', fontsize=12)
ax.set_title('LightGBM Feature Importance', fontsize=14)
# With ascending order plus an inverted axis, the smallest bar is drawn at the top.
ax.invert_yaxis()

# Print the numeric importance just to the right of each bar.
for row, importance in enumerate(sorted_importance):
    ax.text(importance + 0.02, row, f'{importance:.2f}', color='black', va='center', fontsize=10)

fig.tight_layout()
plt.show()

Refit the model on the full training data (training and validation splits combined)

In [None]:
# Refit the same estimator on all labelled rows (X_train includes the validation split).
lgbm_reg_1.fit(X_train, y_train)

# 6. Prediction

In [None]:
# Predict FloodProbability for the test rows with the refitted model.
pred = lgbm_reg_1.predict(X_test)

In [None]:
# Submission frame: one row per test id with its predicted probability.
sub = pd.DataFrame({'id': test_ID, 'FloodProbability': pred})

In [None]:
# Write the submission file without the DataFrame index, then preview the first rows.
sub.to_csv('/kaggle/working/submission.csv',index=False)
sub.head()