In [1]:
import xgboost as xgb
import pandas as pd
import numpy as np
import warnings
import gc
from sklearn.model_selection import KFold, RepeatedStratifiedKFold
from pandas.errors import PerformanceWarning
from sklearn.metrics import mean_squared_error
from itertools import combinations
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from tqdm import tqdm
import optuna
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor

In [2]:
train = pd.read_csv('/kaggle/input/playground-series-s5e9/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s5e9/test.csv')
target = 'BeatsPerMinute'
cat_cols = []
num_cols = ['RhythmScore', 'AudioLoudness', 'VocalContent', 'AcousticQuality', 'InstrumentalScore', 
            'LivePerformanceLikelihood', 'MoodScore', 'TrackDurationMs', 'Energy']

for i in range(len(num_cols)):
    for j in range(i, len(num_cols)):
        col1 = num_cols[i]
        col2 = num_cols[j]
        # Create interaction features
        train[f'{col1}_x_{col2}'] = train[col1] * train[col2]
        test[f'{col1}_x_{col2}'] = test[col1] * test[col2]
        # Create ratio features, handle division by zero
        if col1 != col2:
            train[f'{col1}_div_{col2}'] = train[col1] / (train[col2] + 1e-6)
            test[f'{col1}_div_{col2}'] = test[col1] / (test[col2] + 1e-6)



def add_bins(df, column, labels, new_column=None):
    if len(labels) == 4 and new_column is None:
        new_column = f"{column}_quartile"
    if len(labels) == 5 and new_column is None:
        new_column = f"{column}_quintile"
    if len(labels) == 10 and new_column is None:
        new_column = f"{column}_decile"
    

    df[new_column] = pd.cut(
        df[column],
        bins= len(labels),
        labels=False,
        include_lowest=True
    )
    return df[new_column]

for col in num_cols:
    train[f"{col}_quartile"] = add_bins(train, col, [1, 2, 3, 4])
    
for col in num_cols:
    train[f"{col}_decile"] = add_bins(train, col, [1, 2, 3, 4, 5,
                                                   6, 7, 8, 9, 10])
for col in num_cols:
    test[f"{col}_quartile"] = add_bins(test, col,  [1, 2, 3, 4])
    
for col in num_cols:
    test[f"{col}_decile"] = add_bins(test, col, [1, 2, 3, 4, 5,
                                                   6, 7, 8, 9, 10])
Features = train.columns.tolist()
Features.remove(target)
Features.remove('id')

print(Features)
X=train[Features]
y=train[target]
X_test = test[Features]
display(train.shape)

xgb_params = {
    'n_estimators': 620,         
    'max_leaves': 211,            
    'min_child_weight': 1.5,     
    'max_depth': 6,               
    'grow_policy': 'lossguide',   
    'learning_rate': 0.0021858703356597603,      
    'tree_method': 'hist',        
    'subsample': 0.85,            
    'colsample_bylevel': 0.6787051322531533,     
    'colsample_bytree': 0.6843905004927857,       
    'colsample_bynode': 0.442116057736592,     
    'sampling_method': 'uniform',  
    'reg_alpha': 2.5,             
    'reg_lambda': 0.8,            
    'enable_categorical': True,    
    'max_cat_to_onehot': 1,       
    'device': 'cuda',            
    'n_jobs': -1,                 
    'random_state': 0,     
    'verbosity': 0,               
}
lgbm_params = {
    'learning_rate': 0.001502328415098844,
    'num_leaves': 79, 
    'max_depth': 14,
    'feature_fraction': 0.8933016300882094,
    'bagging_fraction': 0.9754103048412501,
    'bagging_freq': 7, 
    'min_child_samples': 40,
    'lambda_l1': 7.10897934678165e-07,
    'lambda_l2': 7.81564014894075e-08,
    'random_state' : 0,
    'n_jobs' : -1,
    'verbosity': -1,
    'n_estimators': 643
}
model = XGBRegressor(**xgb_params)

xgb_models, xgb_scores=[],[]
kf = KFold(n_splits=5, shuffle=True, random_state=0)
for train_idx, val_idx in kf.split(X, y):
        print('Fold:', len(xgb_models) + 1)
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        model.fit(X_train, y_train)
        acc = mean_squared_error(y_val, model.predict(X_val), squared=False)
        xgb_scores.append(acc), xgb_models.append(model)
        print('Accuracy:', acc)
print('XGB ACCURACY: ', np.mean(xgb_scores))
xgb_oof_preds = sum(model.predict(X_val) for model in xgb_models) / len(xgb_models)


def xgb_objective(trial):
    # Suggest weights for the ensemble
    learning_rate=trial.suggest_float("learning_rate", 0.001, 0.12, log=True)
    max_depth = trial.suggest_int("max_depth", 1, 4)
    n_estimators = trial.suggest_int("n_estimators", 200, 1000)

    xgb=XGBRegressor(learning_rate=learning_rate,max_depth=max_depth,n_estimators=n_estimators,random_state=0,n_jobs=-1)
    X_train, X_val, y_train,y_val=train_test_split(X,y,test_size=0.25,random_state=0)
    xgb.fit(X_train, y_train)
    preds=xgb.predict(X_val)
    score = mean_squared_error(y_val, preds, squared=False)
    return score

#xgb_study = optuna.create_study(direction="minimize")
#xgb_study.optimize(xgb_objective, n_trials=100)
#print("Best score:", xgb_study.best_value)
#print("Best params:", xgb_study.best_params)

  df[new_column] = pd.cut(
  df[new_column] = pd.cut(


['RhythmScore', 'AudioLoudness', 'VocalContent', 'AcousticQuality', 'InstrumentalScore', 'LivePerformanceLikelihood', 'MoodScore', 'TrackDurationMs', 'Energy', 'RhythmScore_x_RhythmScore', 'RhythmScore_x_AudioLoudness', 'RhythmScore_div_AudioLoudness', 'RhythmScore_x_VocalContent', 'RhythmScore_div_VocalContent', 'RhythmScore_x_AcousticQuality', 'RhythmScore_div_AcousticQuality', 'RhythmScore_x_InstrumentalScore', 'RhythmScore_div_InstrumentalScore', 'RhythmScore_x_LivePerformanceLikelihood', 'RhythmScore_div_LivePerformanceLikelihood', 'RhythmScore_x_MoodScore', 'RhythmScore_div_MoodScore', 'RhythmScore_x_TrackDurationMs', 'RhythmScore_div_TrackDurationMs', 'RhythmScore_x_Energy', 'RhythmScore_div_Energy', 'AudioLoudness_x_AudioLoudness', 'AudioLoudness_x_VocalContent', 'AudioLoudness_div_VocalContent', 'AudioLoudness_x_AcousticQuality', 'AudioLoudness_div_AcousticQuality', 'AudioLoudness_x_InstrumentalScore', 'AudioLoudness_div_InstrumentalScore', 'AudioLoudness_x_LivePerformanceLike

(524164, 110)

Fold: 1
Accuracy: 26.433417073117017
Fold: 2
Accuracy: 26.513381761371082
Fold: 3
Accuracy: 26.425568573428425
Fold: 4
Accuracy: 26.46788828249542
Fold: 5
Accuracy: 26.46163211293748
XGB ACCURACY:  26.460377560669883


In [3]:
lgbm_model = LGBMRegressor(**lgbm_params)
lgbm_models, lgbm_scores=[],[]
kf = KFold(n_splits=5, shuffle=True, random_state=0)
for train_idx, val_idx in kf.split(X, y):
        print('Fold:', len(lgbm_models) + 1)
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        lgbm_model.fit(X_train, y_train)
        acc = mean_squared_error(y_val, lgbm_model.predict(X_val), squared=False)
        lgbm_scores.append(acc), lgbm_models.append(lgbm_model)
        print('Accuracy:', acc)
print('LGBM ACCURACY: ', np.mean(lgbm_scores))
lgbm_oof_preds = sum(lgbm_model.predict(X_val) for lgbm_model in lgbm_models) / len(lgbm_models)
y_val = y_val

def lgbm_objective(trial):
    # Suggest weights for the ensemble
    learning_rate=trial.suggest_float("learning_rate", 0.001, 0.1, log=True)
    max_depth = trial.suggest_int("max_depth", 1, 10)
    n_estimators = trial.suggest_int("n_estimators", 200, 1000)

    lgbm=LGBMRegressor(learning_rate=learning_rate,max_depth=max_depth,n_estimators=n_estimators,random_state=0,n_jobs=-1)
    X_train, X_val, y_train,y_val=train_test_split(X,y,test_size=0.25,random_state=0)
    lgbm.fit(X_train, y_train)
    preds=lgbm.predict(X_val)
    score = mean_squared_error(y_val, preds, squared=False)
    return score

#lgbm_study = optuna.create_study(direction="minimize")
#lgbm_study.optimize(lgbm_objective, n_trials=100)
#print("Best score:", lgbm_study.best_value)
#print("Best params:", lgbm_study.best_params)

Fold: 1
Accuracy: 26.432145269322692
Fold: 2
Accuracy: 26.514004061495577
Fold: 3
Accuracy: 26.425103096677724
Fold: 4
Accuracy: 26.467946833088693
Fold: 5
Accuracy: 26.46114657758192
LGBM ACCURACY:  26.46006916763332


In [4]:
X_stack = pd.DataFrame({
    'xgb' : xgb_oof_preds,
    'lgbm' : lgbm_oof_preds
})
xgb_test_preds = sum(model.predict(test[Features]) for model in xgb_models) / len(xgb_models)
lgbm_test_preds = sum(lgbm_model.predict(test[Features]) for lgbm_model in lgbm_models) / len(lgbm_models)

def objective(trial):
    # Suggest weights for the ensemble
    w_lgbm = trial.suggest_float("w_lgbm", 0, 1)
    w_xgb = 1 - w_lgbm   # ensures sum to 1

    final_preds = (w_lgbm * X_stack['lgbm'] +w_xgb*X_stack['xgb'] )
    score = mean_squared_error(y_val, final_preds, squared = False) 
    return score

#study = optuna.create_study(direction="minimize")
#study.optimize(objective, n_trials=500)
#print("Best score:", study.best_value)
#print("Best params:", study.best_params)

In [5]:
w_lgbm = 0.5948591144361411
w_xgb = 1-w_lgbm
preds = w_lgbm * lgbm_test_preds + w_xgb * xgb_test_preds
submission = pd.DataFrame({'id': test['id'], 'BeatsPerMinute': preds})
submission.to_csv('submission.csv', index=False)
display(submission.head())

Unnamed: 0,id,BeatsPerMinute
0,524164,119.16034
1,524165,118.979788
2,524166,119.171792
3,524167,119.176289
4,524168,119.252642
