# **Gradient Boosting Experiments**

# **Set-up**

In [None]:
# Quietly install the extra boosting packages (perpetual, rgf_python, starboost).
# NOTE(review): versions are not pinned — consider pinning them for reproducibility.
%pip install -q perpetual rgf_python starboost

In [1]:
# Import basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
import sys
from sklearn.metrics import root_mean_squared_log_error, root_mean_squared_error
from sklearn.model_selection import KFold
warnings.filterwarnings('ignore')

# Import ML
from sklearn.model_selection import cross_validate, cross_val_predict
from xgboost import XGBRegressor
from perpetual import PerpetualBooster
from lightgbm import LGBMRegressor, early_stopping
from rgf.sklearn import RGFRegressor
from catboost import CatBoostRegressor
import starboost as sb
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from autogluon.features.generators import AutoMLPipelineFeatureGenerator
import optuna


In [2]:
# Importing functions
from Ensembler import *

In [19]:
# Root folder of all data artefacts; override it with the DATA_FOLDER_PATH env var.
base_path = os.getenv('DATA_FOLDER_PATH', 'Data/')
#base_path = os.getenv('DATA_FOLDER_PATH', '/content/drive/MyDrive/DS_Projects/Playground_Series/Ps4e12_Regression_Insuranse_Premium_Prediction/Data/')

# Resolve each input file against the data folder first, then load it.
train_file = os.path.join(base_path, 'train_transformed_fe.parquet')
test_file = os.path.join(base_path, 'test_transformed_fe.parquet')
submission_file = os.path.join(base_path, 'sample_submission.csv')
original_file = os.path.join(base_path, 'original_transformed.parquet')

train = pd.read_parquet(train_file)
test = pd.read_parquet(test_file)
submission = pd.read_csv(submission_file)
original = pd.read_parquet(original_file)

# **Training function**

In [20]:
def train_bagged_model(
    X,
    y,
    model_name,
    model_params,
    n_folds=5,
    random_state=42
):
    """
    Train one model per fold of a shuffled K-fold split and collect
    out-of-fold (OOF) predictions.

    Args:
        X: Training features (pandas DataFrame; rows are selected with `.iloc`)
        y: Target variable (pandas Series; rows are selected with `.iloc`)
        model_name: Boosting algorithm class (XGBRegressor, LGBMRegressor, etc)
        model_params: Keyword arguments passed to the model constructor
        n_folds: Number of folds for cross-validation
        random_state: Random seed used for the K-fold shuffle

    Returns:
        oof_predictions: Out-of-fold predictions for training data
        models: List of trained models, one per fold

    Notes:
        When early stopping is detected (or the model is a LightGBM regressor)
        the validation fold is passed to `fit` as `eval_set`; the same fold is
        then used for the OOF predictions and the reported RMSE.
    """
    # Containers for the OOF predictions, fitted models and per-fold scores
    oof_predictions = np.zeros(len(X))
    models = []
    fold_rmse_scores = []

    # Create K-fold splits
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=random_state)

    # LightGBM always receives an eval_set but takes no `verbose` fit argument
    is_lgbm = 'LGBMRegressor' in model_name.__name__

    # Check if model supports early stopping
    has_early_stopping = hasattr(model_name, 'early_stopping_rounds') or \
                        any(param in model_params for param in ['early_stopping_rounds', 'early_stopping'])
    if has_early_stopping:
        print("Early stopping enabled")

    # Train K models; `total` lets tqdm render an actual progress bar
    for train_idx, val_idx in tqdm(kf.split(X, y), total=n_folds):
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        # Initialize model
        model = model_name(**model_params)

        # Assemble the fit arguments once instead of repeating the fit call
        fit_kwargs = {}
        if is_lgbm or has_early_stopping:
            fit_kwargs['eval_set'] = [(X_val, y_val)]
        if has_early_stopping and not is_lgbm:
            fit_kwargs['verbose'] = 0
        model.fit(X_train, y_train, **fit_kwargs)

        # Generate OOF predictions for this fold
        fold_preds = model.predict(X_val)
        oof_predictions[val_idx] = fold_preds

        # Calculate and store RMSE for this fold
        fold_rmse_scores.append(root_mean_squared_error(y_val, fold_preds))

        # Save model
        models.append(model)

    # Calculate mean and std of fold scores
    mean_rmse = np.mean(fold_rmse_scores)
    std_rmse = np.std(fold_rmse_scores)

    print("Cross-validation Scores (Mean ± Std):")
    print(f"RMSE Score: {mean_rmse:.4f} ± {std_rmse:.4f}")

    return oof_predictions, models

# **Feature generation**

## **Training and test**

In [21]:
# Read the `premium_amount_log` target before it is dropped from the feature frame.
target_log = train['premium_amount_log']
feature_generator = AutoMLPipelineFeatureGenerator()
# NOTE: the in-place drop mutates `train`, so re-running this cell raises a KeyError
# on the first line (the target column is already gone) — reload the data first.
train.drop(columns = ['policy_start_date', 'premium_amount_log'], inplace=True)
# Fit the feature generator on train only, then apply the fitted generator to test.
train_transformed = feature_generator.fit_transform(train, target_log)
test_transformed = feature_generator.transform(test)
features = train_transformed.columns

print("Features: ", features)
print("Target: ", target_log.head().values)



Features:  Index(['age', 'gender', 'annual_income', 'number_of_dependents',
       'health_score', 'previous_claims', 'vehicle_age', 'credit_score',
       'insurance_duration', 'smoking_status', 'year', 'month', 'day',
       'week_of_year', 'quarter', 'year_sin', 'month_sin', 'month_cos',
       'day_sin', 'day_cos', 'is_weekend', 'is_month_end', 'is_month_start',
       'is_quarter_end', 'is_quarter_start', 'policy_age_days',
       'week_of_month', 'days_in_month', 'days_remaining_in_month',
       'income_per_dependent', 'total_risk_score', 'claims_to_duration_ratio',
       'vehicle_to_driver_age_ratio', 'is_young_driver', 'lifestyle_score',
       'location_risk', 'location_avg_credit', 'responsibility_score',
       'family_risk_factor', 'asset_risk', 'dependent_income_ratio',
       '4log_WeightedEnsemble_L4', '12nonlog_WeightedEnsemble_L3',
       'xgb_complicated_probs', 'lgb_complicated_probs',
       'xgb_complicated_probs_rank', 'lgb_complicated_probs_rank',
       'marit

## **Original**

In [22]:
# Log-transform (log1p) the raw target of the original dataset.
target_original = np.log1p(original['premium_amount'])
# NOTE: in-place drop — re-running this cell fails because the columns are already gone.
original.drop(columns = ['policy_start_date', 'premium_amount'], inplace=True)
# A fresh generator is fitted on the original data. This rebinds `feature_generator`,
# so the generator fitted on the training set is no longer reachable by name.
feature_generator = AutoMLPipelineFeatureGenerator()
original_transformed = feature_generator.fit_transform(original, target_original)
features_original = original_transformed.columns

# Report the columns of the original dataset (previously printed the train `features`).
print("Features: ", features_original)
print("Target: ", target_original.head().values)



Features:  Index(['age', 'gender', 'annual_income', 'number_of_dependents',
       'health_score', 'previous_claims', 'vehicle_age', 'credit_score',
       'insurance_duration', 'smoking_status', 'year', 'month', 'day',
       'week_of_year', 'quarter', 'year_sin', 'month_sin', 'month_cos',
       'day_sin', 'day_cos', 'is_weekend', 'is_month_end', 'is_month_start',
       'is_quarter_end', 'is_quarter_start', 'policy_age_days',
       'week_of_month', 'days_in_month', 'days_remaining_in_month',
       'income_per_dependent', 'total_risk_score', 'claims_to_duration_ratio',
       'vehicle_to_driver_age_ratio', 'is_young_driver', 'lifestyle_score',
       'location_risk', 'location_avg_credit', 'responsibility_score',
       'family_risk_factor', 'asset_risk', 'dependent_income_ratio',
       '4log_WeightedEnsemble_L4', '12nonlog_WeightedEnsemble_L3',
       'xgb_complicated_probs', 'lgb_complicated_probs',
       'xgb_complicated_probs_rank', 'lgb_complicated_probs_rank',
       'marit

In [23]:
# Add a 'Missing' category and fill NaN values for train, test and original.
# The categorical column list is taken from the transformed training frame.
categorical_features = [col for col in train_transformed.columns if train_transformed[col].dtype == 'category']
for frame in (train_transformed, test_transformed, original_transformed):
    for col in categorical_features:
        # Guard the category registration so the cell can be re-run safely:
        # add_categories raises if 'Missing' is already one of the categories.
        if 'Missing' not in frame[col].cat.categories:
            frame[col] = frame[col].cat.add_categories('Missing')

        # Now fill NaN values
        frame[col] = frame[col].fillna('Missing')

# **Optuna Optimization**

In [26]:
def objective_lgbm(trial):
    """Optuna objective: mean 3-fold validation RMSE of a LightGBM regressor.

    Hyper-parameters are sampled from `trial`; one `LGBMRegressor` is fitted per
    fold of `train_transformed[features]` / `target_log`, with the held-out fold
    passed as `eval_set` (early stopping after 100 rounds).

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        Average RMSE across the three validation folds (lower is better).
    """
    params = {
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'rmse',
        'n_estimators': trial.suggest_int('n_estimators', 300, 3000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'num_leaves': trial.suggest_int('num_leaves', 20, 1024),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 500),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0),
        'verbose': -1,
        'early_stopping_rounds': 100,
        'n_jobs':-1
    }

    # Fixed seed: every trial is scored on the same folds, so trial values are
    # comparable and the search is reproducible.
    kf = KFold(n_splits=3, shuffle=True, random_state=42)
    scores = []

    # Select the feature columns once instead of twice per fold
    X_all = train_transformed[features]

    # Perform k-fold cross validation
    for train_idx, valid_idx in kf.split(X_all):
        X_train_fold = X_all.iloc[train_idx]
        y_train_fold = target_log.iloc[train_idx]
        X_valid_fold = X_all.iloc[valid_idx]
        y_valid_fold = target_log.iloc[valid_idx]

        model = LGBMRegressor(**params)
        model.fit(X_train_fold, y_train_fold,
                eval_set=[(X_valid_fold, y_valid_fold)],
                eval_metric='rmse')

        preds_fold = model.predict(X_valid_fold)
        fold_score = root_mean_squared_error(y_valid_fold, preds_fold)
        scores.append(fold_score)

    # Return average score across all folds
    return np.mean(scores)

# Persist the study in a SQLite file inside the data folder; an existing study
# with the same name is resumed instead of being recreated.
lgbm_storage_url = "sqlite:///" + os.path.join(base_path, "optuna_lgb_tuning.db")
study_lgbm = optuna.create_study(
    direction='minimize',
    study_name="lgbm_tunning",
    storage=lgbm_storage_url,
    load_if_exists=True,
)
study_lgbm.optimize(objective_lgbm, n_trials=50)
print("Best parameters for LightGBM:", study_lgbm.best_params)

[I 2024-12-31 13:35:47,500] Using an existing study with name 'lgbm_tunning' instead of creating a new one.
[I 2024-12-31 13:36:00,397] Trial 2 finished with value: 1.0446194751755853 and parameters: {'n_estimators': 916, 'learning_rate': 0.06265863073509052, 'num_leaves': 546, 'max_depth': 5, 'min_child_samples': 367, 'subsample': 0.9002722723060412, 'colsample_bytree': 0.8383653344073876, 'reg_alpha': 3.2877744185521007, 'reg_lambda': 8.31373515608923}. Best is trial 2 with value: 1.0446194751755853.
[I 2024-12-31 13:36:13,069] Trial 3 finished with value: 1.0446661099120897 and parameters: {'n_estimators': 959, 'learning_rate': 0.15217068150918125, 'num_leaves': 382, 'max_depth': 6, 'min_child_samples': 363, 'subsample': 0.8887117734071955, 'colsample_bytree': 0.7912968310301876, 'reg_alpha': 9.410271550390382, 'reg_lambda': 8.182802298486214}. Best is trial 2 with value: 1.0446194751755853.
[I 2024-12-31 13:36:50,889] Trial 4 finished with value: 1.0452072253158355 and parameters: 

Best parameters for LightGBM: {'n_estimators': 2816, 'learning_rate': 0.021471826254699997, 'num_leaves': 105, 'max_depth': 3, 'min_child_samples': 233, 'subsample': 0.8737928593252876, 'colsample_bytree': 0.9365249766889353, 'reg_alpha': 0.9650452151673026, 'reg_lambda': 6.661640637785808}


In [33]:
def objective_xgb(trial):
    """Optuna objective: mean 3-fold validation RMSE of an XGBoost regressor.

    Hyper-parameters are sampled from `trial`; one `XGBRegressor` is fitted per
    fold of `train_transformed[features]` / `target_log`, with the held-out fold
    passed as `eval_set` (early stopping after 100 rounds).

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        Average RMSE across the three validation folds (lower is better).
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 300, 3000),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 500),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0),
        'tree_method': 'hist',
        'eval_metric': 'rmse',
        'enable_categorical': True,
        'early_stopping_rounds': 100,
        'n_jobs':-1
    }

    # Fixed seed: every trial is scored on the same folds, so trial values are
    # comparable and the search is reproducible.
    kf = KFold(n_splits=3, shuffle=True, random_state=42)
    scores = []

    # Select the feature columns once instead of twice per fold
    X_all = train_transformed[features]

    # Perform k-fold cross validation
    for train_idx, valid_idx in kf.split(X_all):
        X_train_fold = X_all.iloc[train_idx]
        y_train_fold = target_log.iloc[train_idx]
        X_valid_fold = X_all.iloc[valid_idx]
        y_valid_fold = target_log.iloc[valid_idx]

        model = XGBRegressor(**params)
        model.fit(X_train_fold, y_train_fold,
                eval_set=[(X_valid_fold, y_valid_fold)],
                verbose=False)

        preds_fold = model.predict(X_valid_fold)
        fold_score = root_mean_squared_error(y_valid_fold, preds_fold)
        scores.append(fold_score)

    # Return average score across all folds
    return np.mean(scores)

# Persist the study in a SQLite file inside the data folder; an existing study
# with the same name is resumed instead of being recreated.
xgb_storage_url = "sqlite:///" + os.path.join(base_path, "optuna_xgb_tuning.db")
study_xgb = optuna.create_study(
    direction='minimize',
    study_name="xgb_tunning",
    storage=xgb_storage_url,
    load_if_exists=True,
)
study_xgb.optimize(objective_xgb, n_trials=50)
print("Best parameters for XGBoost:", study_xgb.best_params)

[I 2024-12-31 13:52:04,392] Using an existing study with name 'xgb_tunning' instead of creating a new one.
[I 2024-12-31 13:52:46,188] Trial 3 finished with value: 1.0450581038544389 and parameters: {'n_estimators': 2935, 'learning_rate': 0.08041260849258856, 'max_depth': 8, 'min_child_weight': 363, 'subsample': 0.7085111235451627, 'colsample_bytree': 0.5274600205234209, 'reg_alpha': 8.294206725590694, 'reg_lambda': 4.748755232667342}. Best is trial 3 with value: 1.0450581038544389.
[I 2024-12-31 13:53:29,566] Trial 4 finished with value: 1.0449855922375808 and parameters: {'n_estimators': 2715, 'learning_rate': 0.06552044914480966, 'max_depth': 8, 'min_child_weight': 389, 'subsample': 0.5250815173846461, 'colsample_bytree': 0.7439968703794873, 'reg_alpha': 2.0545222720593026, 'reg_lambda': 2.946861715008846}. Best is trial 4 with value: 1.0449855922375808.
[I 2024-12-31 13:54:10,344] Trial 5 finished with value: 1.044890852040283 and parameters: {'n_estimators': 2390, 'learning_rate':

Best parameters for XGBoost: {'n_estimators': 1013, 'learning_rate': 0.024735031012492463, 'max_depth': 4, 'min_child_weight': 337, 'subsample': 0.9528201038182489, 'colsample_bytree': 0.7070703378979604, 'reg_alpha': 5.8869061255702295, 'reg_lambda': 1.9492056530064512}


In [None]:
def objective_catboost(trial):
    """Optuna objective: mean 3-fold validation RMSE of a CatBoost regressor.

    Hyper-parameters are sampled from `trial`; one `CatBoostRegressor` is fitted
    per fold of `train_transformed[features]` / `target_log`, with the held-out
    fold passed as `eval_set` (early stopping after 100 rounds). The module-level
    `categorical_features` list is passed as `cat_features`.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        Average RMSE across the three validation folds (lower is better).
    """
    params = {
        'iterations': trial.suggest_int('iterations', 300, 3000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'depth': trial.suggest_int('depth', 3, 12),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 500),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.5, 1.0),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-8, 10.0),
        'random_strength': trial.suggest_float('random_strength', 1e-8, 10.0),
        'task_type': 'CPU',
        'eval_metric': 'RMSE',
        'early_stopping_rounds': 100,
        'cat_features': categorical_features,
        'thread_count': -1
    }

    # Fixed seed: every trial is scored on the same folds, so trial values are
    # comparable and the search is reproducible.
    kf = KFold(n_splits=3, shuffle=True, random_state=42)
    scores = []

    # Select the feature columns once instead of twice per fold
    X_all = train_transformed[features]

    # Perform k-fold cross validation
    for train_idx, valid_idx in kf.split(X_all):
        X_train_fold = X_all.iloc[train_idx]
        y_train_fold = target_log.iloc[train_idx]
        X_valid_fold = X_all.iloc[valid_idx]
        y_valid_fold = target_log.iloc[valid_idx]

        model = CatBoostRegressor(**params, verbose=False)
        model.fit(X_train_fold, y_train_fold,
                eval_set=[(X_valid_fold, y_valid_fold)],
                verbose=False)

        preds_fold = model.predict(X_valid_fold)
        fold_score = root_mean_squared_error(y_valid_fold, preds_fold)
        scores.append(fold_score)

    # Return average score across all folds
    return np.mean(scores)

# Persist the study in a SQLite file inside the data folder; an existing study
# with the same name is resumed instead of being recreated.
catboost_storage_url = "sqlite:///" + os.path.join(base_path, "optuna_catboost_tuning.db")
study_catboost = optuna.create_study(
    direction='minimize',
    study_name="catboost_tunning",
    storage=catboost_storage_url,
    load_if_exists=True,
)
study_catboost.optimize(objective_catboost, n_trials=50)
print("Best parameters for CatBoost:", study_catboost.best_params)


In [36]:
def prepare_rgf_data(df):
    """Return an all-float copy of `df` suitable for RGF.

    Steps:
        1. Drop the `policy_start_date` column if present. This happens first
           because a date column cannot be cast to float.
        2. Replace every categorical column with its integer category codes
           (missing categorical values become -1, the pandas code for NaN).
        3. Cast the whole frame to float and fill the remaining NaN with -999.

    Args:
        df: Input DataFrame; it is not modified.

    Returns:
        A new DataFrame in which every column has dtype float64 and no NaN.
    """
    # Remove date column if exists (before the float cast, which would reject it)
    df_rgf = df.drop(columns=['policy_start_date'], errors='ignore').copy()

    # Handle categorical features and convert to numeric
    categorical_features = df_rgf.select_dtypes(include='category').columns
    for col in categorical_features:
        df_rgf[col] = df_rgf[col].cat.codes

    # Cast first, then fill: this covers NaN in every numeric dtype
    # (float32 included), not only int64/float64 columns.
    df_rgf = df_rgf.astype(float)
    return df_rgf.fillna(-999)

# Build the RGF-ready (all-float) versions of the train and test frames.
train_transformed_rgf, test_transformed_rgf = (
    prepare_rgf_data(frame) for frame in (train_transformed, test_transformed)
)
features_rgf = train_transformed_rgf.columns

# **Training with train and test**

## **LightGBM**

In [43]:
# Start from the tuned hyper-parameters and layer the fixed run settings on top.
best_lgb_params = {
    **study_lgbm.best_params,
    'verbose': -1,
    'early_stopping_rounds': 100,
    'n_jobs': -1,
}

lgb_oofs, lgb_models = train_bagged_model(
    train_transformed[features],
    target_log,
    model_name=LGBMRegressor,
    model_params=best_lgb_params,
    n_folds=5,
    random_state=42,
)

Early stopping enabled


5it [00:32,  6.58s/it]

Cross-validation Scores (Mean ± Std):
RMSE Score: 1.0446 ± 0.0008





## **XGBoost**

In [44]:
# Start from the tuned hyper-parameters and layer the fixed run settings on top.
best_xgb_params = {
    **study_xgb.best_params,
    'tree_method': 'hist',
    'eval_metric': 'rmse',
    'enable_categorical': True,
    'early_stopping_rounds': 100,
    'n_jobs': -1,
}

xgb_oofs, xgb_models = train_bagged_model(
    train_transformed[features],
    target_log,
    model_name=XGBRegressor,
    model_params=best_xgb_params,
    n_folds=5,
    random_state=42,
)


Early stopping enabled


5it [01:32, 18.58s/it]

Cross-validation Scores (Mean ± Std):
RMSE Score: 1.0446 ± 0.0008





## **CatBoost**

In [45]:
# Start from the tuned hyper-parameters and layer the fixed run settings on top.
# Requires `study_catboost` and `categorical_features` from the earlier cells.
best_cgb_params = {
    **study_catboost.best_params,
    'task_type': 'CPU',
    'eval_metric': 'RMSE',
    'early_stopping_rounds': 100,
    'cat_features': categorical_features,
    'thread_count': -1,
}

cgb_oofs, cgb_models = train_bagged_model(
    train_transformed[features],
    target_log,
    model_name=CatBoostRegressor,
    model_params=best_cgb_params,
    n_folds=5,
    random_state=42,
)


Early stopping enabled


5it [07:15, 87.15s/it] 

Cross-validation Scores (Mean ± Std):
RMSE Score: 1.0445 ± 0.0008





## **Regularized greedy forest**

In [None]:
# Hand-picked (not tuned) settings for the regularized greedy forest.
rgf_params = dict(
    max_leaf=1000,          # Controls tree size
    algorithm="RGF_Sib",
    loss="LS",              # Least squares loss
    learning_rate=0.01,
    reg_depth=1.0,          # Regularization depth
    l2=0.1,                 # L2 regularization
)

rgf_oofs, rgf_models = train_bagged_model(
    train_transformed_rgf[features_rgf],
    target_log,
    model_name=RGFRegressor,
    model_params=rgf_params,
    n_folds=3,
    random_state=42,
)


# **Training fg2**

In [42]:
# Load the second feature set (fg2) for train and test from the data folder.
train_fg2, test_fg2 = (
    pd.read_csv(os.path.join(base_path, file_name))
    for file_name in ('train_transformed_fg2.csv', 'test_transformed_fg2.csv')
)

In [50]:
# Index both frames by `id`, then split the target column off the training frame.
# NOTE: not idempotent — re-running fails because `id` is already the index.
train_fg2, test_fg2 = train_fg2.set_index("id"), test_fg2.set_index("id")
# `pop` returns the column and removes it from train_fg2 in one step.
# NOTE(review): the fold RMSE of ~1.05 suggests this column is already log-scaled — confirm.
target_fg2 = train_fg2.pop('premium_amount')
features_fg2 = train_fg2.columns

## **LightGBM**

In [59]:
# Fixed LightGBM settings for the fg2 feature set (no Optuna search here).
lgb_params = {
    "n_estimators": 3000,
    "learning_rate": 0.001,
    "max_depth": 8,
    "num_leaves": 256,
    "colsample_bytree": 0.5,
    "early_stopping_rounds": 100,
    "eval_metric": "rmse",
    "verbose": -1,
    'n_jobs': -1,
}

# Integer-encode every object/category column on a copy of the training frame.
categorical_columns = train_fg2.select_dtypes(['object', 'category']).columns
lgb_train_fg2 = train_fg2.copy()
for col in categorical_columns:
    as_category = lgb_train_fg2[col].astype('category')
    lgb_train_fg2[col] = as_category.cat.codes.astype('int32')

lgb_fg2_oofs, lgb_fg2_models = train_bagged_model(
    lgb_train_fg2[features_fg2],
    target_fg2,
    model_name=LGBMRegressor,
    model_params=lgb_params,
    n_folds=3,
    random_state=40,
)

Early stopping enabled


3it [11:42, 234.24s/it]

Cross-validation Scores (Mean ± Std):
RMSE Score: 1.0456 ± 0.0011





## **XGBoost**

In [None]:
# Fixed XGBoost settings for the fg2 feature set (no Optuna search here).
xgb_params = {
    'n_estimators': 3000,
    'max_depth': 8,
    'colsample_bytree': 0.5,
    'learning_rate': 0.001,
    'tree_method': 'hist',
    'eval_metric': 'rmse',
    'enable_categorical': True,
    'early_stopping_rounds':100,
    'n_jobs':-1,
    # XGBoost's sklearn API has no `verbose` constructor parameter; silent
    # logging is requested with `verbosity` (0 = silent).
    'verbosity': 0
}

# XGBoost consumes pandas `category` columns directly (enable_categorical=True),
# so object columns are only cast, not integer-encoded.
categorical_columns = train_fg2.select_dtypes(['object']).columns
xgb_train_fg2 = train_fg2.copy()
for col in categorical_columns:
    xgb_train_fg2[col] = xgb_train_fg2[col].astype('category')

xgb_fg2_oofs, xgb_fg2_models = train_bagged_model(xgb_train_fg2[features_fg2], target_fg2,
                   model_name=XGBRegressor,
                   model_params=xgb_params,
                   n_folds=3,
                   random_state=40)


In [62]:
# CatBoost on fg2: label missing categorical values as 'Missing' and cast the
# object/category columns to `category` on a copy of the training frame.
categorical_columns = train_fg2.select_dtypes(['object', 'category']).columns
cgb_train_fg2 = train_fg2.copy()
for col in categorical_columns:
    filled = cgb_train_fg2[col].fillna('Missing')
    cgb_train_fg2[col] = filled
    cgb_train_fg2[col] = cgb_train_fg2[col].astype('category')

# CatBoost receives the categorical columns by position.
cat_features = [cgb_train_fg2.columns.get_loc(col) for col in categorical_columns]

# No early stopping is configured here, so all 3000 iterations run in each fold.
cat_params = dict(
    depth=10,
    learning_rate=0.01,
    l2_leaf_reg=3.5,
    random_strength=4,
    bagging_temperature=0.25,
    eval_metric='RMSE',
    loss_function='RMSE',
    cat_features=cat_features,
    iterations=3000,
    verbose=False,
)

cgb_fg2_oofs, cgb_fg2_models = train_bagged_model(
    cgb_train_fg2[features_fg2],
    target_fg2,
    model_name=CatBoostRegressor,
    model_params=cat_params,
    n_folds=3,
    random_state=40,
)

3it [44:21, 887.04s/it]

Cross-validation Scores (Mean ± Std):
RMSE Score: 1.0331 ± 0.0011





## **Collecting out-of-fold predictions on the main train set**

In [65]:
# Start from the target so the frame takes its index, then attach each model's
# out-of-fold predictions (plain arrays, aligned by row position).
# NOTE(review): assumes the fg2 rows are in the same order as `target_log` — confirm.
oofs_df = pd.DataFrame()
oofs_df['premium_amount_log'] = target_log
_oof_columns = {
    'lgb_oofs': lgb_oofs,
    'xgb_oofs': xgb_oofs,
    'cgb_oofs': cgb_oofs,
    'lgb_fg2_oofs': lgb_fg2_oofs,
    # 'xgb_fg2_oofs': xgb_fg2_oofs,
    'cgb_fg2_oofs': cgb_fg2_oofs,
}
for _oof_name, _oof_values in _oof_columns.items():
    oofs_df[_oof_name] = _oof_values

# Convert every column — the target included — back to the original scale.
# After this step 'premium_amount_log' holds original-scale values despite its name.
pred_cols = oofs_df.columns
oofs_df[pred_cols] = np.expm1(oofs_df[pred_cols])

In [139]:
# for i, model in enumerate(lgb_fg2_models, 1):
#     oofs_df[f'lgb_fg2_{i}'] = model.predict(train_fg2[features])

# for i, model in enumerate(xgb_fg2_models, 1):
#     oofs_df[f'xgb_fg2_{i}'] = model.predict(train_fg2[features])

# for i, model in enumerate(cgb_fg2_models, 1):
#     oofs_df[f'cgb_fg2_{i}'] = model.predict(train_fg2[features])

# **Greedy Weighted Ensemble**

In [84]:
# Initialize and fit
model_cols = [col for col in oofs_df.columns if col != 'premium_amount_log']
ensembler = BaggedEnsembleSelection(n_init=3, max_iter=50, corr_threshold=0.7, bag_fraction=0.4, warm_start=30)
ensembler.fit(oofs_df[model_cols], oofs_df['premium_amount_log'], performance_func=lambda y, pred: -root_mean_squared_log_error(y, pred))


Best single model performance: -1.03312 | Model: cgb_fg2_oofs
Initial performance: -1.03312 | Models: ['cgb_fg2_oofs']
Iteration 1: Added cgb_fg2_oofs, Bag Score: -1.03308, Mean Bag Score: -1.03308, Full Score: -1.03312
Iteration 2: Added cgb_fg2_oofs, Bag Score: -1.03367, Mean Bag Score: -1.03337, Full Score: -1.03312
Iteration 3: Added cgb_fg2_oofs, Bag Score: -1.03282, Mean Bag Score: -1.03319, Full Score: -1.03312
Iteration 4: Added cgb_fg2_oofs, Bag Score: -1.03413, Mean Bag Score: -1.03343, Full Score: -1.03312
Iteration 5: Added cgb_fg2_oofs, Bag Score: -1.03307, Mean Bag Score: -1.03335, Full Score: -1.03312
Iteration 6: Added cgb_oofs, Bag Score: -1.03464, Mean Bag Score: -1.03357, Full Score: -1.03308
Iteration 7: Added cgb_fg2_oofs, Bag Score: -1.03269, Mean Bag Score: -1.03344, Full Score: -1.03305
Iteration 8: Added cgb_fg2_oofs, Bag Score: -1.03082, Mean Bag Score: -1.03311, Full Score: -1.03303
Iteration 9: Added cgb_fg2_oofs, Bag Score: -1.03234, Mean Bag Score: -1.0330

<Ensembler.BaggedEnsembleSelection at 0x2951160e0>

# **Predicting on test set**

In [72]:
def get_bagged_probs(models, data):
    """Average the `predict` output of every fitted model on `data`.

    Args:
        models: Iterable of fitted models exposing `predict(data)`.
        data: Feature data forwarded unchanged to each model.

    Returns:
        Element-wise mean of the per-model predictions.
    """
    per_model_preds = []
    for fitted_model in models:
        per_model_preds.append(fitted_model.predict(data))
    # axis=0 averages across models, keeping one value per row of `data`
    return np.mean(per_model_preds, axis=0)

# Predict with the training-time feature columns only, so prediction columns
# added to test_transformed below are never fed back into the models.
features = train_transformed.columns
features_rgf = train_transformed_rgf.columns
for _pred_col, _fold_models in (
    ('lgb_oofs', lgb_models),
    ('xgb_oofs', xgb_models),
    ('cgb_oofs', cgb_models),
):
    test_transformed[_pred_col] = get_bagged_probs(_fold_models, test_transformed[features])
# test_transformed['rgf_oofs'] = get_bagged_probs(rgf_models, test_transformed_rgf[features_rgf])

In [73]:
# Integer-encode the categorical columns of the fg2 test frame with the SAME
# category -> code mapping as the training frame. Encoding the test frame on its
# own would shift the codes whenever its set of values differs from train's.
categorical_columns = train_fg2[features_fg2].select_dtypes(['object', 'category']).columns
lgb_test_fg2 = test_fg2.copy()
for col in categorical_columns:
    train_categories = train_fg2[col].astype('category').cat.categories
    # Values unseen in training (and missing values) get code -1
    test_codes = pd.Categorical(lgb_test_fg2[col], categories=train_categories).codes
    lgb_test_fg2[col] = test_codes.astype('int32')
test_transformed['lgb_fg2_oofs'] = get_bagged_probs(lgb_fg2_models, lgb_test_fg2[features_fg2])


In [None]:
# categorical_columns = test_fg2.select_dtypes(['object']).columns
# xgb_test_fg2 = test_fg2.copy()
# for col in categorical_columns:
#     xgb_test_fg2[col] = xgb_test_fg2[col].astype('category')
# test_transformed['xgb_fg2_oofs'] = get_bagged_probs(xgb_fg2_models, xgb_test_fg2[features_fg2])


In [74]:
# Mirror the CatBoost training preprocessing on the fg2 test frame: label missing
# categorical values as 'Missing' and cast the columns to `category`.
categorical_columns = test_fg2.select_dtypes(['object', 'category']).columns
cgb_test_fg2 = test_fg2.copy()
for col in categorical_columns:
    filled = cgb_test_fg2[col].fillna('Missing')
    cgb_test_fg2[col] = filled
    cgb_test_fg2[col] = cgb_test_fg2[col].astype('category')
# Positions of the categorical columns; recomputed here but not read again in this cell.
cat_features = [cgb_test_fg2.columns.get_loc(col) for col in categorical_columns]

cgb_fg2_test_preds = get_bagged_probs(cgb_fg2_models, cgb_test_fg2[features_fg2])
test_transformed['cgb_fg2_oofs'] = cgb_fg2_test_preds


In [141]:
# features = original_transformed.columns
# features_rgf = original_transformed_rgf.columns
# for i, model in enumerate(lgb_original_models, 1):
#     test_transformed[f'lgb_ori_{i}'] = model.predict(test_transformed[features])
    
# for i, model in enumerate(xgb_original_models, 1):
#     test_transformed[f'xgb_ori_{i}'] = model.predict(test_transformed[features])

# for i, model in enumerate(cgb_original_models, 1):
#     test_transformed[f'cgb_ori_{i}'] = model.predict(test_transformed[features])

# for i, model in enumerate(rgf_original_models, 1):
#     test_transformed[f'rgf_ori_{i}'] = model.predict(test_transformed_rgf[features_rgf])

In [85]:
# The ensembler was fitted on original-scale (expm1) OOF predictions, so the
# log-scale test predictions must go through the same transform before blending.
# The conversion goes into a separate frame rather than overwriting
# test_transformed, which keeps this cell safe to re-run (no double expm1).
test_preds_original_scale = np.expm1(test_transformed[model_cols])
test_transformed['ensemble_preds'] = ensembler.predict(test_preds_original_scale)

# **Submission**

In [86]:
# Make sure the output folder exists; to_csv does not create missing directories.
submission_dir = os.path.join(base_path, 'submissions')
os.makedirs(submission_dir, exist_ok=True)

submission['Premium Amount'] = test_transformed['ensemble_preds'].values
submission.to_csv(os.path.join(submission_dir, 'submission_ensemble_fg1_fg2_lgb_xgb_cgb2.csv'), index=False)

In [87]:
# Make sure the output folder exists; to_csv does not create missing directories.
submission_dir = os.path.join(base_path, 'submissions')
os.makedirs(submission_dir, exist_ok=True)

# The model predictions stored in test_transformed are on the log1p scale, so they
# are converted back to the original scale for the submission file.
# NOTE(review): drop the expm1 if 'cgb_fg2_oofs' is ever converted in place upstream.
submission['Premium Amount'] = np.expm1(test_transformed['cgb_fg2_oofs'].values)
submission.to_csv(os.path.join(submission_dir, 'submission_cgb_fg2.csv'), index=False)
