# **Ensembling AutoGluon OOFs**

In [1]:
# Import basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
import plotly.io as pio
import plotly.graph_objects as go
from autogluon.core.metrics import make_scorer
from plotly.subplots import make_subplots
from sklearn.metrics import root_mean_squared_log_error, root_mean_squared_error
from sklearn.model_selection import KFold
pd.options.plotting.backend = "plotly"
pio.templates.default = "simple_white"
warnings.filterwarnings('ignore')

# Import specific libraries
from autogluon.tabular import TabularDataset, TabularPredictor

In [2]:
def calculate_cv_rmsle(y_true, y_pred, n_folds=10, random_state=42):
    """Score existing predictions with RMSLE on shuffled K-fold partitions of the rows.

    No model is trained here: the rows are split into ``n_folds`` shuffled
    folds and the RMSLE between ``y_true`` and ``y_pred`` is computed on each
    fold's validation rows, giving a mean and a spread for the score.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True target values.
    y_pred : array-like of shape (n_samples,)
        Predictions aligned row-by-row (positionally) with ``y_true``.
    n_folds : int, default=10
        Number of folds the rows are partitioned into.
    random_state : int or None, default=42
        Seed forwarded to ``KFold`` so repeated runs give the same folds and
        therefore the same scores. Pass ``None`` for an unseeded shuffle.

    Returns
    -------
    dict
        ``'cv_scores'`` (list of per-fold RMSLE), ``'mean_cv'`` and ``'std_cv'``.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` do not have the same length.
    """
    # Work on plain arrays so Series, Index-misaligned Series and ndarrays all
    # behave the same; rows are matched by position, never by label.
    y_true_arr = np.asarray(y_true)
    y_pred_arr = np.asarray(y_pred)
    if len(y_true_arr) != len(y_pred_arr):
        raise ValueError(
            f"y_true and y_pred must have the same length, "
            f"got {len(y_true_arr)} and {len(y_pred_arr)}"
        )

    kf = KFold(n_splits=n_folds, shuffle=True, random_state=random_state)

    # Only the validation indices matter: nothing is fitted on the train part.
    fold_scores = [
        root_mean_squared_log_error(y_true_arr[val_idx], y_pred_arr[val_idx])
        for _, val_idx in kf.split(y_true_arr)
    ]

    return {
        'cv_scores': fold_scores,
        'mean_cv': np.mean(fold_scores),
        'std_cv': np.std(fold_scores)
    }

In [3]:
# Root folder for every input file; override it through the DATA_FOLDER_PATH environment variable.
base_path = os.getenv('DATA_FOLDER_PATH', 'Data/')
# Alternative default used when running from Google Drive:
#base_path = os.getenv('DATA_FOLDER_PATH', '/content/drive/MyDrive/DS_Projects/Playground_Series/Ps4e12_Regression_Insuranse_Premium_Prediction/Data/')

# Transformed competition splits (parquet)
train, test = (
    pd.read_parquet(os.path.join(base_path, parquet_name))
    for parquet_name in ('train_transformed.parquet', 'test_transformed.parquet')
)

# Submission template and the original source dataset (csv)
submission, original = (
    pd.read_csv(os.path.join(base_path, csv_name))
    for csv_name in ('sample_submission.csv', 'Insurance Premium Prediction Dataset.csv')
)

In [4]:
# Reading OOFs
# Paths are resolved against base_path so that DATA_FOLDER_PATH is honoured
# here too (with the default base_path this is still Data/oofs/...).
oofs_dir = os.path.join(base_path, 'oofs')
oof_12hr_nonlog_local = (
    pd.read_parquet(os.path.join(oofs_dir, 'oof_preds_12nonlog.parquet'))
    .set_index('id')  # this file carries `id` as a column; the merge below joins on the index
)
oof_4hr_log_tpu = pd.read_parquet(os.path.join(oofs_dir, 'oof_preds_4log.parquet'))

# Fusing
# Both merges are inner joins on the index: target first, then one column per model.
train_oofs = (
    train[['premium_amount']]
    .merge(oof_12hr_nonlog_local, left_index=True, right_index=True)
    .merge(oof_4hr_log_tpu, left_index=True, right_index=True)
)

# An inner join drops unmatched rows silently; fail loudly instead of scoring a subset.
assert len(train_oofs) == len(train), (
    f"OOF merge changed the row count: {len(train)} train rows -> {len(train_oofs)} merged rows"
)

# **Performance**

In [11]:
# Fold-wise RMSLE for every OOF column (everything after the leading premium_amount column)
oof_columns = train_oofs.columns[1:]
cv_results = {
    oof_col: calculate_cv_rmsle(
        train_oofs['premium_amount'],
        train_oofs[oof_col],
        n_folds=5,
    )
    for oof_col in oof_columns
}

In [12]:
# One row per model, best (lowest) mean RMSLE first
summary_rows = []
for model_name, model_scores in cv_results.items():
    summary_rows.append({
        'Model': model_name,
        'Mean RMSLE': model_scores['mean_cv'],
        'Std RMSLE': model_scores['std_cv'],
        'n_folds': len(model_scores['cv_scores']),
    })

cv_summary = pd.DataFrame(summary_rows).sort_values('Mean RMSLE')
cv_summary

Unnamed: 0,Model,Mean RMSLE,Std RMSLE,n_folds
26,4log_WeightedEnsemble_L4,1.044443,0.002046,5
27,4log_WeightedEnsemble_L3,1.044464,0.002612,5
28,4log_LightGBM_r131_BAG_L3,1.044553,0.001178,5
29,4log_CatBoost_BAG_L3,1.044553,0.001622,5
30,4log_LightGBM_BAG_L3,1.044560,0.000830,5
...,...,...,...,...
23,12nonlog_XGBoost_r33_BAG_L1,1.170174,0.002048,5
64,4log_KNeighborsDist_BAG_L1,1.201113,0.002137,5
63,4log_KNeighborsUnif_BAG_L1,1.201113,0.001864,5
24,12nonlog_KNeighborsUnif_BAG_L1,1.207578,0.003399,5


In [42]:
from collections import Counter
import numpy as np
import pandas as pd

class BaggedEnsembleSelection:
    """Greedy forward ensemble selection where each step is scored on a random bag of rows.

    Starting from a small set of initial models, every iteration adds (with
    replacement) the model whose inclusion gives the best score on a random
    subset of the rows. The final weight of a model is the number of times it
    was picked divided by the total number of picks.

    Parameters
    ----------
    n_init : int, default=5
        Maximum number of models in the initial ensemble. With ``n_init > 1``
        the initial models are chosen to be mutually weakly correlated.
    max_iter : int, default=30
        Maximum number of greedy additions.
    decimals : int, default=5
        Stored on the instance; not used by the current implementation.
    corr_threshold : float, default=0.7
        A candidate is only eligible for the initial ensemble if its absolute
        correlation with every already selected initial model is at most this.
    bag_fraction : float, default=0.25
        Fraction of rows sampled (without replacement) to score each iteration.
    epsilon : float, default=0.00001
        Stored on the instance; not used by the current implementation.
    warm_start : int, default=10
        Window length of the rolling mean of bag scores, and the number of
        iterations that must pass before early stopping and best-ensemble
        tracking begin. NOTE: if ``max_iter <= warm_start`` no iteration is
        ever recorded as best, so ``fit`` returns the initial ensemble.
    random_state : int, object with a ``choice`` method, or None, default=None
        Seed (or ready-made generator) for the bag sampling. ``None`` keeps
        drawing from NumPy's global random state.

    Attributes
    ----------
    best_weights : pd.Series or None
        Weight per selected model (sums to 1), set by ``fit``.
    best_models : pd.Index or None
        Names of the selected models, set by ``fit``.
    best_mean_performance : float or None
        Best rolling mean of bag scores seen after the warm-up, set by ``fit``
        (``-inf`` when no post-warm-up iteration was recorded).
    best_performance : float or None
        Score of the returned ensemble on all rows passed to ``fit``.
    """

    def __init__(self, n_init=5, max_iter=30, decimals=5, corr_threshold=0.7,
                 bag_fraction=0.25, epsilon=0.00001, warm_start=10, random_state=None):
        self.n_init = n_init
        self.max_iter = max_iter
        self.decimals = decimals
        self.corr_threshold = corr_threshold
        self.bag_fraction = bag_fraction
        self.epsilon = epsilon
        self.warm_start = warm_start
        self.random_state = random_state
        self.best_models = None
        self.best_weights = None
        self.best_performance = None
        self.best_mean_performance = None

    def _make_rng(self):
        """Return the object whose ``choice`` method draws the bag indices."""
        if self.random_state is None:
            # Global NumPy state: behaves exactly like a bare np.random.choice call.
            return np.random
        if hasattr(self.random_state, 'choice'):
            return self.random_state
        return np.random.RandomState(self.random_state)

    def get_diverse_init_models(self, model_performances, model_cols, corr_matrix):
        """Pick up to ``n_init`` initial models that are weakly correlated with each other.

        Parameters
        ----------
        model_performances : pd.Series
            Score per model (higher is better), indexed by model name.
        model_cols : iterable
            All candidate model names.
        corr_matrix : pd.DataFrame
            Absolute pairwise correlation between model predictions.

        Returns
        -------
        list
            Selected model names, best single model first.
        """
        # Start with best model
        init_models = [model_performances.idxmax()]
        available_models = list(model_cols)
        available_models.remove(init_models[0])

        while len(init_models) < self.n_init and available_models:
            # Drop every candidate that is too correlated with ANY selected model
            corr_with_selected = corr_matrix.loc[available_models, init_models].max(axis=1)
            available_models = [m for m, c in zip(available_models, corr_with_selected)
                                if c <= self.corr_threshold]

            if not available_models:
                break

            # Add best remaining model
            best_model = model_performances[available_models].idxmax()
            init_models.append(best_model)
            available_models.remove(best_model)

        return init_models

    def get_ensemble_preds(self, X, ensemble):
        """Return the count-weighted average of the ensemble members' columns of ``X``.

        ``ensemble`` maps model name -> number of times it was selected. An
        empty ensemble yields an array of zeros.
        """
        if not ensemble:
            return np.zeros(len(X))

        ensemble_weights = pd.Series(ensemble) / sum(ensemble.values())
        return X[ensemble_weights.index].multiply(ensemble_weights).sum(axis=1)

    def fit(self, X, y, performance_func):
        """
        Fit ensemble using forward selection with bagging

        Parameters:
        -----------
        X : pd.DataFrame
            DataFrame containing model predictions as columns
        y : pd.Series
            True target values
        performance_func : callable
            Function that takes (y_true, y_pred) and returns a score to maximize

        Returns:
        --------
        self

        Raises:
        -------
        ValueError
            If X and y have different lengths or contain no rows.
        """
        model_cols = X.columns
        n_samples = len(y)
        if len(X) != n_samples:
            raise ValueError(f"X and y must have the same number of rows, got {len(X)} and {n_samples}")
        if n_samples == 0:
            raise ValueError("Cannot fit an ensemble on empty data")

        rng = self._make_rng()

        # Calculate initial model performances
        model_performances = pd.Series({
            col: performance_func(y, X[col]) for col in model_cols
        })

        best_single = model_performances.max()
        print(f"Best single model performance: {best_single:.5f} | Model: {model_performances.idxmax()}")

        # Get diverse initial models
        if self.n_init > 1:
            corr_matrix = X.corr().abs()
            init_models = self.get_diverse_init_models(model_performances, model_cols, corr_matrix)
        else:
            init_models = [model_performances.idxmax()]

        # Initialize ensemble
        ensemble = Counter(init_models)
        current_preds = self.get_ensemble_preds(X, ensemble)
        current_performance = performance_func(y, current_preds)
        print(f"Initial performance: {current_performance:.5f} | Models: {init_models}")

        # Track best ensemble
        best_ensemble = ensemble.copy()
        best_mean_performance = float('-inf')

        # Early stopping variables
        bag_scores = []
        consecutive_decreases = 0
        previous_mean_score = float('-inf')

        # Greedy forward selection with bagging.
        # At least one row per bag (int() truncation could give 0 on tiny inputs),
        # and never more rows than exist.
        bag_size = min(n_samples, max(1, int(n_samples * self.bag_fraction)))

        # The prediction matrix never changes, so convert it once.
        X_np = X.values

        for i in range(self.max_iter):
            # Sample indices for this iteration
            bag_indices = rng.choice(n_samples, size=bag_size, replace=False)

            # Get current ensemble size
            n_models = sum(ensemble.values())

            current_preds_np = current_preds.to_numpy() if isinstance(current_preds, pd.Series) else current_preds

            # Candidate j = current ensemble with one more vote for model j.
            # Only the bag rows are scored, so only those rows are computed.
            bag_current = current_preds_np[bag_indices][:, np.newaxis]
            bag_candidates = (n_models * bag_current + X_np[bag_indices]) / (n_models + 1)
            y_bag = y.iloc[bag_indices]

            candidate_scores = {
                model: performance_func(y_bag, bag_candidates[:, j])
                for j, model in enumerate(model_cols)
            }

            # First model with the highest bag score wins ties
            best_model = max(candidate_scores, key=candidate_scores.get)
            best_score = candidate_scores[best_model]

            # Update ensemble
            ensemble.update({best_model: 1})
            current_preds = self.get_ensemble_preds(X, ensemble)
            full_performance = performance_func(y, current_preds)

            # Early stopping check on the rolling mean of the last `warm_start` bag scores
            bag_scores.append(best_score)
            current_mean_score = np.mean(bag_scores[-self.warm_start:])

            if i >= self.warm_start:
                if current_mean_score < previous_mean_score:
                    consecutive_decreases += 1
                    if consecutive_decreases >= 3:
                        print(f"\nEarly stopping triggered at iteration {i+1}")
                        break
                else:
                    consecutive_decreases = 0

                if current_mean_score > best_mean_performance:
                    best_mean_performance = current_mean_score
                    best_ensemble = ensemble.copy()

            previous_mean_score = current_mean_score

            print(f"Iteration {i+1}: Added {best_model}, Bag Score: {best_score:.5f}, "
                  f"Mean Bag Score: {current_mean_score:.5f}, Full Score: {full_performance:.5f}")

        # Convert counter to weights
        total_count = sum(best_ensemble.values())
        self.best_weights = pd.Series(best_ensemble) / total_count
        self.best_models = self.best_weights.index
        self.best_mean_performance = best_mean_performance
        # Score of the ensemble actually returned, on every row (not just a bag)
        self.best_performance = performance_func(y, self.get_ensemble_preds(X, best_ensemble))

        print("\nFinal Ensemble Weights:")
        for model, weight in self.best_weights.sort_values(ascending=False).items():
            print(f"{model}: {weight:.4f}")

        return self

    def get_best_ensemble(self):
        """Return DataFrame with models and their weights, largest weight first"""
        if self.best_weights is None:
            raise ValueError("Model must be fitted before requesting the ensemble")
        return pd.DataFrame({
            'model': self.best_weights.index,
            'weight': self.best_weights.values
        }).sort_values('weight', ascending=False)

    def predict(self, X):
        """Generate ensemble predictions for new data

        ``X`` must contain a column for every selected model; the result is
        the weighted sum of those columns.
        """
        if self.best_weights is None:
            raise ValueError("Model must be fitted before making predictions")
        return (X[self.best_models] * self.best_weights.values).sum(axis=1)

In [None]:
# Fit the ensemble selector on the OOF columns, scoring candidate blends by negative RMSLE
model_cols = train_oofs.columns[1:]  # every column after premium_amount


def neg_rmsle(y, pred):
    """Negated RMSLE, so that a larger value means a better blend."""
    return -root_mean_squared_log_error(y, pred)


ensembler = BaggedEnsembleSelection(
    n_init=2,
    max_iter=50,
    corr_threshold=0.7,
    bag_fraction=0.25,
    warm_start=10,
)
ensembler.fit(train_oofs[model_cols], train_oofs['premium_amount'], performance_func=neg_rmsle)


In [70]:
# Move every column of train_oofs (the target as well as the model predictions) to log10 scale.
# Values are floored at 1e-10 first so that log10 never sees zero or a negative number.
# NOTE: this overwrites train_oofs in place, so running the cell twice applies the log twice.
all_columns = train_oofs.columns
train_oofs[all_columns] = np.log10(train_oofs[all_columns].clip(lower=1e-10))


In [75]:
# Re-fit the selector on the log10-scaled columns, scoring candidate blends by negative RMSE
def neg_rmse(y, pred):
    """Negated RMSE, so that a larger value means a better blend."""
    return -root_mean_squared_error(y, pred)


ensembler = BaggedEnsembleSelection(
    n_init=2,
    max_iter=50,
    corr_threshold=0.7,
    bag_fraction=0.2,
    warm_start=20,
)
ensembler.fit(train_oofs[model_cols], train_oofs['premium_amount'], performance_func=neg_rmse)


Best single model performance: -0.45617 | Model: 4log_WeightedEnsemble_L4
Initial performance: -0.46257 | Models: ['4log_WeightedEnsemble_L4', '12nonlog_NeuralNetTorch_BAG_L2']
Iteration 1: Added 4log_NeuralNetFastAI_BAG_L3, Bag Score: -0.45747, Mean Bag Score: -0.45747, Full Score: -0.45901
Iteration 2: Added 4log_WeightedEnsemble_L3, Bag Score: -0.45808, Mean Bag Score: -0.45777, Full Score: -0.45777
Iteration 3: Added 4log_RandomForestMSE_BAG_L2, Bag Score: -0.45661, Mean Bag Score: -0.45738, Full Score: -0.45721
Iteration 4: Added 4log_NeuralNetFastAI_r191_BAG_L2, Bag Score: -0.45518, Mean Bag Score: -0.45683, Full Score: -0.45689
Iteration 5: Added 4log_ExtraTreesMSE_BAG_L3, Bag Score: -0.45688, Mean Bag Score: -0.45684, Full Score: -0.45670
Iteration 6: Added 4log_NeuralNetFastAI_BAG_L3, Bag Score: -0.45670, Mean Bag Score: -0.45682, Full Score: -0.45658
Iteration 7: Added 4log_LightGBMLarge_BAG_L3, Bag Score: -0.45755, Mean Bag Score: -0.45692, Full Score: -0.45649
Iteration 8: 

<__main__.BaggedEnsembleSelection at 0x2d4c47c40>

In [77]:
# Blend the model columns with the fitted weights and store the result next to them
blended = ensembler.predict(train_oofs[model_cols])
train_oofs['ensemble_preds'] = blended

# Undo the log10 scaling on every column, including the newly added blend
every_column = train_oofs.columns
train_oofs[every_column] = 10 ** train_oofs[every_column]


In [79]:
# RMSLE of the blended predictions against the target over all rows of train_oofs
root_mean_squared_log_error(train_oofs['premium_amount'], train_oofs['ensemble_preds'])

1.0444705356658068

# **Understanding model errors**



In [7]:
# Per-row log10 residuals of the 4log_WeightedEnsemble_L4 predictions, largest squared error first
errors = (
    train_oofs[['premium_amount', '4log_WeightedEnsemble_L4']]
    .assign(
        premium_amount_log=lambda df: np.log10(df['premium_amount']),
        preds_log=lambda df: np.log10(df['4log_WeightedEnsemble_L4']),
        error=lambda df: df['premium_amount_log'] - df['preds_log'],
        error_squared=lambda df: df['error'] ** 2,
    )
    .sort_values('error_squared', ascending=False)
)




In [8]:
# Bucket the squared errors into four bands
bin_edges = [0, 1, 1.5, 2, float('inf')]
bin_labels = ['< 1', '1-1.5', '1.5-2', '> 2']
error_bins = pd.cut(
    errors['error_squared'],
    bins=bin_edges,
    labels=bin_labels,
    include_lowest=True,
)

# Absolute and relative size of each band, listed in band order
band_counts = error_bins.value_counts()
band_shares = error_bins.value_counts(normalize=True)
error_distribution = pd.DataFrame({
    'count': band_counts,
    'percentage': band_shares * 100,
}).sort_index()

print("\nDistribution of squared errors:")
print(error_distribution)



Distribution of squared errors:
                 count  percentage
error_squared                     
< 1            1135263   94.605250
1-1.5            14054    1.171167
1.5-2            25638    2.136500
> 2              25045    2.087083


In [10]:
# RMSLE restricted to the rows whose squared error is below 1
is_small_error = errors['error_squared'].lt(1)
errors_small = errors.loc[is_small_error]
root_mean_squared_log_error(
    errors_small['premium_amount'],
    errors_small['4log_WeightedEnsemble_L4'],
)

0.7782893142875139

In [19]:
# Test-set predictions of the two AutoGluon runs.
# Resolved against base_path so that DATA_FOLDER_PATH is honoured
# (with the default base_path this is still Data/submissions/...).
submissions_dir = os.path.join(base_path, 'submissions')
submission_4log = pd.read_csv(
    os.path.join(submissions_dir, 'submission_WeightedEnsemble_L4_4hr_log_tpu.csv')
)
submission_12nonlog = pd.read_csv(
    os.path.join(submissions_dir, 'submission_WeightedEnsemble_L3_12hr_nonlog_local.csv')
)

In [20]:
# Name each prediction column after the model that produced it and index both frames by id
submission_4log = (
    submission_4log
    .rename(columns={'Premium Amount': '4log_WeightedEnsemble_L4'})
    .set_index('id')
)
submission_12nonlog = (
    submission_12nonlog
    .rename(columns={'Premium Amount': '12nonlog_WeightedEnsemble_L3'})
    .set_index('id')
)


In [23]:
# Put both models' test predictions side by side (inner join on the id index) and persist them.
# The output path is resolved against base_path so that DATA_FOLDER_PATH is honoured
# (with the default base_path this is still Data/oofs/test_oofs.csv).
submission_oofs = submission_4log.merge(submission_12nonlog, left_index=True, right_index=True)
submission_oofs.to_csv(os.path.join(base_path, 'oofs', 'test_oofs.csv'))
submission_oofs

Unnamed: 0_level_0,4log_WeightedEnsemble_L4,12nonlog_WeightedEnsemble_L3
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1200000,825.73157,929.62250
1200001,813.55566,912.69617
1200002,801.74110,854.73520
1200003,812.36896,878.03625
1200004,753.27423,848.68390
...,...,...
1999995,966.78390,957.26044
1999996,567.34860,1003.26294
1999997,816.53296,885.10284
1999998,824.32794,925.06720
