# **AutoGluon: 12-hour training run, non-log target, local machine**

In [None]:
%%capture
# Install AutoGluon's tabular package (with the `all` and `skex` extras), build
# helpers (setuptools, wheel) and dask's dataframe extra into the kernel's environment.
# `%%capture` at the top of the cell captures the installer output instead of displaying it.
%pip install setuptools wheel autogluon.tabular[all,skex] dask[dataframe]
# Upgrade ipywidgets quietly (-q).
%pip install -U -q ipywidgets
# Upgrade scikit-learn to the newest available release.
# NOTE(review): presumably required for `sklearn.metrics.root_mean_squared_log_error`
# used by the custom scorer further down - confirm. None of these installs pin a version.
%pip install -U scikit-learn

In [1]:
# Import basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
import cloudpickle
import plotly.io as pio
import plotly.graph_objects as go
from autogluon.core.metrics import make_scorer
import sklearn
from plotly.subplots import make_subplots
pd.options.plotting.backend = "plotly"
pio.templates.default = "simple_white"
warnings.filterwarnings('ignore')

# Import specific libraries
from autogluon.tabular import TabularDataset, TabularPredictor

In [2]:
# Mount Google Drive only when the notebook runs inside Colab. On a local
# kernel the `google.colab` package is not installed, so the import is guarded
# to keep "Restart & Run All" working; data is then read from `base_path`.
try:
    from google.colab import drive
except ImportError:
    print("google.colab not available - skipping Google Drive mount.")
else:
    drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Folder that holds the competition files. Override it with the
# DATA_FOLDER_PATH environment variable, e.g. on Colab:
#   /content/drive/MyDrive/DS_Projects/Playground_Series/Ps4e12_Regression_Insuranse_Premium_Prediction/Data/
base_path = os.getenv('DATA_FOLDER_PATH', 'Data/')

# Read the four CSV files in a fixed order and bind them to their names.
train, test, submission, original = (
    pd.read_csv(os.path.join(base_path, file_name))
    for file_name in (
        'train.csv',
        'test.csv',
        'sample_submission.csv',
        'Insurance Premium Prediction Dataset.csv',
    )
)

In [3]:
# Use `id` as the row index. The guard makes the cell safe to re-run: once
# `id` has become the index it is no longer a column, and an unconditional
# `set_index('id')` would raise KeyError on the second execution.
if 'id' in train.columns:
    train = train.set_index('id')
if 'id' in test.columns:
    test = test.set_index('id')

# Renaming columns for consistency: lower-case, spaces replaced by underscores
train.columns = train.columns.str.lower().str.replace(" ", "_", regex=False)
test.columns = test.columns.str.lower().str.replace(" ", "_", regex=False)
original.columns = original.columns.str.lower().str.replace(" ", "_", regex=False)

# Keep only the columns present in `train` (same order) and drop rows of the
# original dataset that have no target value.
original = original[train.columns]
original = original.dropna(subset=['premium_amount'])

# **Feature Engineering**

In [4]:
def create_date_features(df, reference_date=None):
    """
    Add calendar features derived from the ``policy_start_date`` column.

    The frame is modified in place (new columns are assigned on ``df``) and
    the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``policy_start_date`` column that ``pd.to_datetime``
        can parse.
    reference_date : datetime-like, optional
        Date from which ``policy_age_days`` is measured. When omitted, the
        latest ``policy_start`` found in ``df`` is used. Pass the value
        obtained on the training data when transforming other data so that
        both are measured from the same point in time.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the date feature columns added.
    """
    # Basic date features
    df['policy_start'] = pd.to_datetime(df['policy_start_date'])
    start = df['policy_start']
    df['year'] = start.dt.year
    df['month'] = start.dt.month
    df['day'] = start.dt.day
    df['week_of_year'] = start.dt.isocalendar().week.astype('int')
    df['day_of_week'] = start.dt.day_name()
    df['month_name'] = start.dt.month_name()
    df['quarter'] = start.dt.quarter

    # Cyclical encoding
    # NOTE(review): a period of 1 for `year` makes year_sin ~ 0 and
    # year_cos ~ 1 for every integer year (only floating-point noise remains).
    # The columns are kept so the output schema stays unchanged.
    for col, max_val in [('year', 1), ('month', 12), ('day', 31)]:
        angle = 2 * np.pi * df[col] / max_val
        df[f'{col}_sin'] = np.sin(angle)
        df[f'{col}_cos'] = np.cos(angle)

    # Binary flags
    df['is_weekend'] = start.dt.dayofweek.isin([5, 6]).astype(int)
    df['is_month_end'] = start.dt.is_month_end.astype(int)
    df['is_month_start'] = start.dt.is_month_start.astype(int)
    df['is_quarter_end'] = start.dt.is_quarter_end.astype(int)
    df['is_quarter_start'] = start.dt.is_quarter_start.astype(int)

    # Time-based calculations
    if reference_date is None:
        reference = start.max()
    else:
        reference = pd.Timestamp(reference_date)
    df['policy_age_days'] = (reference - start).dt.days
    # Days 1-7 -> week 1, 8-14 -> week 2, ... (vectorised, no per-row lambda)
    df['week_of_month'] = (df['day'] - 1) // 7 + 1
    df['days_in_month'] = start.dt.days_in_month
    df['days_remaining_in_month'] = df['days_in_month'] - df['day']

    # Seasonal mapping
    season_map = {12: 'winter', 1: 'winter', 2: 'winter',
                  3: 'spring', 4: 'spring', 5: 'spring',
                  6: 'summer', 7: 'summer', 8: 'summer',
                  9: 'fall', 10: 'fall', 11: 'fall'}
    df['season'] = df['month'].map(season_map)

    return df

In [5]:
def create_advanced_features(df, is_training=True):
    """
    Create advanced features for insurance premium prediction with proper scaling.

    Every statistic that is learned from data (score means/stds, income
    quintile edges, vehicle-age maximum, policy-age reference date and the
    per-location averages) is computed only when ``is_training=True`` and
    stored in the module-level ``scale_params`` dict. A later call with
    ``is_training=False`` reuses those stored values, so both frames are
    transformed with identical parameters.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied, the caller's object is not modified.
    is_training : bool, default True
        Fit and store the parameters (True) or reuse the stored ones (False).

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the engineered columns added and the intermediate
        helper columns removed.

    Raises
    ------
    RuntimeError
        If called with ``is_training=False`` before any training call.
    """
    global scale_params
    df = df.copy()

    # Store scaling factors during training
    if is_training:
        scale_params = {
            'health_score_mean': df['health_score'].mean(),
            'health_score_std': df['health_score'].std(),
            'credit_score_mean': df['credit_score'].mean(),
            'credit_score_std': df['credit_score'].std(),
            'customer_feedback_map': {
                'Poor': 0.0,    # Higher risk
                'Average': 0.5, # Medium risk
                'Good': 1.0     # Lower risk
            },
            'exercise_frequency_map': {
                'Rarely': 0.0,   # Highest risk
                'Monthly': 0.33, # High risk
                'Weekly': 0.66,  # Low risk
                'Daily': 1.0     # Lowest risk
            },
            'smoking_map': {
                'Yes': 1.0,  # Higher risk
                'No': 0.0    # Lower risk
            },
            'marital_risk_map': {
                'Single': 1.0,    # Base risk
                'Married': 0.8,   # Lower risk (shared responsibility)
                'Divorced': 1.2   # Higher risk (potentially more financial stress)
            },
            'property_risk_map': {
                'Apartment': 1.0,  # Base risk
                'House': 1.5,     # Higher risk (more value/larger space)
                'Condo': 1.2      # Medium risk
            }
        }
    elif 'scale_params' not in globals():
        raise RuntimeError(
            "create_advanced_features must be called with is_training=True "
            "before it can be used with is_training=False."
        )

    # 1. Date-based features
    df = create_date_features(df)
    # Measure policy age from the training reference date for every frame;
    # on the training frame this equals the frame's own maximum.
    if is_training:
        scale_params['policy_reference_date'] = df['policy_start'].max()
    df['policy_age_days'] = (scale_params['policy_reference_date'] - df['policy_start']).dt.days

    # 2. Income-based features with proper scaling
    df['income_per_dependent'] = df['annual_income'] / (df['number_of_dependents'] + 1)
    if is_training:
        # Learn the quintile edges once; open the outer edges so incomes
        # outside the training range still fall into the first/last bracket.
        _, income_edges = pd.qcut(df['annual_income'], q=5, retbins=True)
        income_edges = np.asarray(income_edges, dtype=float)
        income_edges[0], income_edges[-1] = -np.inf, np.inf
        scale_params['income_bracket_edges'] = income_edges
    df['income_bracket'] = pd.cut(df['annual_income'],
                                  bins=scale_params['income_bracket_edges'],
                                  labels=['Very Low', 'Low', 'Medium', 'High', 'Very High'])

    # 3. Risk Score Combinations with standardization
    # Standardize health and credit scores
    df['health_score_std'] = (df['health_score'] - scale_params['health_score_mean']) / scale_params['health_score_std']
    df['credit_score_std'] = (df['credit_score'] - scale_params['credit_score_mean']) / scale_params['credit_score_std']

    # Combined risk score (now both features are on same scale)
    df['total_risk_score'] = df['health_score_std'] + df['credit_score_std']

    # Claims ratio with insurance duration
    df['claims_to_duration_ratio'] = df['previous_claims'] / (df['insurance_duration'] + 1)

    # 4. Age-related interactions
    df['vehicle_to_driver_age_ratio'] = df['vehicle_age'] / df['age']
    df['is_young_driver'] = (df['age'] < 25).astype(int)
    df['is_senior_driver'] = (df['age'] > 65).astype(int)

    # 5. Lifestyle Score (normalized to 0-1 range)
    df['exercise_score'] = df['exercise_frequency'].map(scale_params['exercise_frequency_map'])
    df['smoking_risk'] = df['smoking_status'].map(scale_params['smoking_map'])
    df['lifestyle_score'] = (
        df['exercise_score'] * 0.4 +    # Exercise has significant impact
        (1 - df['smoking_risk']) * 0.4 + # Non-smoking is positive
        (df['health_score_std'] > 0) * 0.2  # Above average health is positive
    )

    # 6. Location-based features
    if is_training:
        scale_params['location_risk_map'] = df.groupby('location')['previous_claims'].mean()
        scale_params['location_credit_map'] = df.groupby('location')['credit_score'].mean()

    df['location_risk'] = df['location'].map(scale_params['location_risk_map'])
    df['location_avg_credit'] = df['location'].map(scale_params['location_credit_map'])

    # 7. Complex Interaction Features
    df['customer_feedback_score'] = df['customer_feedback'].map(scale_params['customer_feedback_map'])

    # Weighted responsibility score. The feedback and claims terms lie in
    # [0, 1]; the credit term is a z-score clipped to [-3, 3], not 0-1 scaled.
    df['responsibility_score'] = (
        df['credit_score_std'].clip(-3, 3) * 0.4 +  # Limit outlier effect
        df['customer_feedback_score'] * 0.3 +
        (1 - df['claims_to_duration_ratio'].clip(0, 1)) * 0.3  # Lower claims is better
    )

    # 8. Family and Property Risk
    df['marital_risk'] = df['marital_status'].map(scale_params['marital_risk_map'])
    df['property_risk'] = df['property_type'].map(scale_params['property_risk_map'])

    # Combined risk factors
    df['family_risk_factor'] = df['marital_risk'] * (df['number_of_dependents'] + 1)
    if is_training:
        scale_params['vehicle_age_max'] = df['vehicle_age'].max()
    df['asset_risk'] = (
        df['property_risk'] * 0.6 +
        (df['vehicle_age'] / scale_params['vehicle_age_max']) * 0.4  # Normalized by the training maximum
    )

    # 9. Customer Segment Features
    df['premium_segment'] = 'Standard'
    mask_premium = (
        (df['credit_score_std'] > 1) &  # Above 1 std in credit
        (df['previous_claims'] == 0) &   # No claims
        (df['health_score_std'] > 1)     # Above 1 std in health
    )
    mask_high_risk = (
        (df['credit_score_std'] < -1) |  # Below 1 std in credit
        (df['previous_claims'] > 3)       # Multiple claims
    )

    # 'High Risk' is assigned last, so it wins where both masks are true
    df.loc[mask_premium, 'premium_segment'] = 'Premium'
    df.loc[mask_high_risk, 'premium_segment'] = 'High Risk'

    # 10. Additional Ratio Features
    # Same formula as `claims_to_duration_ratio`; kept as a separate column so
    # the output schema is unchanged.
    df['claims_per_year'] = df['previous_claims'] / (df['insurance_duration'] + 1)
    df['dependent_income_ratio'] = df['number_of_dependents'] / df['annual_income']

    # Drop intermediate columns
    intermediate_cols = ['health_score_std', 'credit_score_std', 'exercise_score',
                        'smoking_risk', 'customer_feedback_score', 'marital_risk',
                        'property_risk']
    df = df.drop(columns=[col for col in intermediate_cols if col in df.columns])

    return df

In [6]:
try:
    # Transform training data (fits and stores the scaling parameters)
    train_transformed = create_advanced_features(train, is_training=True)
    print("Training data transformation successful!")

    # Transform test data (reuses the parameters fitted on the training data)
    test_transformed = create_advanced_features(test, is_training=False)
    print("Test data transformation successful!")

except Exception as e:
    print(f"An error occurred: {str(e)}")
    print("Please check your data types and column names.")
    # Re-raise so execution stops here; otherwise later cells fail with a
    # misleading NameError on `train_transformed` / `test_transformed`.
    raise

Training data transformation successful!
Test data transformation successful!


In [7]:
# Persist both engineered frames as parquet files inside the data folder.
for frame, file_name in (
    (train_transformed, "train_transformed.parquet"),
    (test_transformed, "test_transformed.parquet"),
):
    frame.to_parquet(os.path.join(base_path, file_name))

# **AutoGluon training (12 hours)**

In [9]:
# Wrap scikit-learn's RMSLE as an AutoGluon scorer: it is an error metric,
# so lower is better and the optimum is 0.
rmsle_scorer = make_scorer(
    name='rmsle',
    score_func=sklearn.metrics.root_mean_squared_log_error,
    needs_pred=True,
    greater_is_better=False,
    optimum=0,
)

In [11]:
# Training configuration
label = 'premium_amount'
problem_type = 'regression'
hours = 12

# Build the predictor, scored with the custom RMSLE metric
predictor = TabularPredictor(
    label=label,
    problem_type=problem_type,
    eval_metric=rmsle_scorer,
    path="Autogluon/202412_ps4s12_12hr_training",
)

# Train with the best-quality preset; the time budget is given in seconds
predictor.fit(
    train_data=train_transformed,
    presets="best_quality",
    time_limit=3600 * hours,
)

Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.1.1
Python Version:     3.10.14
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 23.0.0: Fri Sep 15 14:43:05 PDT 2023; root:xnu-10002.1.13~1/RELEASE_ARM64_T6020
CPU Count:          10
Memory Avail:       4.12 GB / 16.00 GB (25.7%)
Disk Space Avail:   86.75 GB / 460.43 GB (18.8%)
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of the data. Then holdout validation data is used to detect stacked

[36m(_ray_fit pid=10865)[0m [1000]	valid_set's l2: 699160	valid_set's rmsle: -1.13518
[36m(_ray_fit pid=10865)[0m [2000]	valid_set's l2: 698759	valid_set's rmsle: -1.13448
[36m(_ray_fit pid=11097)[0m [1000]	valid_set's l2: 700323	valid_set's rmsle: -1.13651
[36m(_ray_fit pid=11097)[0m [2000]	valid_set's l2: 700119	valid_set's rmsle: -1.13573
[36m(_ray_fit pid=11097)[0m [3000]	valid_set's l2: 700402	valid_set's rmsle: -1.13565


[36m(_dystack pid=10791)[0m 		[36mray::_ray_fit()[39m (pid=11097, ip=127.0.0.1)
[36m(_dystack pid=10791)[0m   File "/opt/homebrew/lib/python3.10/site-packages/autogluon/core/models/ensemble/fold_fitting_strategy.py", line 402, in _ray_fit
[36m(_dystack pid=10791)[0m     fold_model.fit(X=X_fold, y=y_fold, X_val=X_val_fold, y_val=y_val_fold, time_limit=time_limit_fold, **resources, **kwargs_fold)
[36m(_dystack pid=10791)[0m   File "/opt/homebrew/lib/python3.10/site-packages/autogluon/core/models/abstract/abstract_model.py", line 856, in fit
[36m(_dystack pid=10791)[0m     out = self._fit(**kwargs)
[36m(_dystack pid=10791)[0m   File "/opt/homebrew/lib/python3.10/site-packages/autogluon/tabular/models/lgb/lgb_model.py", line 218, in _fit
[36m(_dystack pid=10791)[0m     self.model = train_lgb_model(early_stopping_callback_kwargs=early_stopping_callback_kwargs, **train_params)
[36m(_dystack pid=10791)[0m   File "/opt/homebrew/lib/python3.10/site-packages/autogluon/tabular/m

[36m(_ray_fit pid=11403)[0m [1000]	valid_set's l2: 695227	valid_set's rmsle: -1.13135
[36m(_ray_fit pid=11763)[0m [1000]	valid_set's l2: 697560	valid_set's rmsle: -1.1286


[36m(_dystack pid=10791)[0m 	-1.1314	 = Validation score   (-rmsle)
[36m(_dystack pid=10791)[0m 	137.12s	 = Training   runtime
[36m(_dystack pid=10791)[0m 	2.58s	 = Validation runtime
[36m(_dystack pid=10791)[0m Fitting model: RandomForestMSE_BAG_L1 ... Training model for up to 6918.52s of the 10516.58s of remaining time.
[36m(_dystack pid=10791)[0m 	-1.1286	 = Validation score   (-rmsle)
[36m(_dystack pid=10791)[0m 	774.81s	 = Training   runtime
[36m(_dystack pid=10791)[0m 	22.28s	 = Validation runtime
[36m(_dystack pid=10791)[0m Fitting model: CatBoost_BAG_L1 ... Training model for up to 6120.63s of the 9718.69s of remaining time.
[36m(_dystack pid=10791)[0m 	Memory not enough to fit 8 folds in parallel. Will train 1 folds in parallel instead (Estimated 52.33% memory usage per fold, 52.33%/80.00% total).
[36m(_dystack pid=10791)[0m 	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy (1 workers, per: cpus=10, gpus=0, memory=52.33%)

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x296abe5c0>

# **Best Submission**

In [None]:
# Reload the predictor trained above, from the same `path` that was passed to
# TabularPredictor(...). The previous value pointed at a different experiment
# directory ("202411_ps4s11_8hr_logloss_gpu") and would have replaced the
# model trained in this notebook.
predictor = TabularPredictor.load("Autogluon/202412_ps4s12_12hr_training")

In [12]:
# Leaderboard requested without passing a dataset; the resulting table reports
# each model's `score_val` under the predictor's eval metric.
leaderboard_test = predictor.leaderboard(silent=True)
# Bare last expression so the notebook renders the DataFrame.
leaderboard_test

Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L3,-1.073038,rmsle,1301.780608,19654.457292,0.015892,6.631069,3,True,26
1,WeightedEnsemble_L2,-1.073907,rmsle,12.751272,1674.268837,0.017444,3.736816,2,True,14
2,NeuralNetTorch_BAG_L1,-1.075394,rmsle,5.477499,606.360768,5.477499,606.360768,1,True,6
3,NeuralNetTorch_r79_BAG_L1,-1.081999,rmsle,7.256329,1064.171253,7.256329,1064.171253,1,True,9
4,NeuralNetTorch_BAG_L2,-1.101687,rmsle,1301.764716,19647.826223,6.468448,847.574484,2,True,22
5,RandomForestMSE_BAG_L1,-1.128515,rmsle,26.52796,951.525635,26.52796,951.525635,1,True,3
6,RandomForestMSE_BAG_L2,-1.128615,rmsle,1325.387813,21195.407501,30.091545,2395.155762,2,True,17
7,ExtraTreesMSE_BAG_L2,-1.128645,rmsle,1321.108152,19193.180725,25.811884,392.928986,2,True,19
8,LightGBMXT_BAG_L2,-1.128671,rmsle,1297.640968,18867.985766,2.3447,67.734027,2,True,15
9,XGBoost_BAG_L2,-1.128677,rmsle,1298.952531,21353.308266,3.656263,2553.056527,2,True,21


In [15]:
# The leaderboard is ordered best-first, so the first of the top five is the best model
models = leaderboard_test.head(5)['model'].to_list()
best_model = models[0]
# Positional `.iloc[0]` instead of the label lookup `[0]`, which only works
# when the selected row happens to carry index label 0.
best_score = leaderboard_test.loc[leaderboard_test['model'] == best_model, 'score_val'].iloc[0]
print("Best model by autogluon is", best_model)
# score_val is stored negated (higher is better), hence the absolute value
print("With a score of RMSLE", np.abs(best_score))

# Predict on the test set and write the submission file
training = "12hr_nonlog_local"
sub_autogluon = submission.copy()
sub_autogluon['Premium Amount'] = predictor.predict(test_transformed, as_pandas=False, model=best_model)
sub_autogluon.to_csv(os.path.join(base_path, f"submission_{best_model}_{training}.csv"), index=False)

Best model by autogluon is WeightedEnsemble_L3
With a score of RMSLE 1.073038009875013


In [18]:
def save_experiment_oofs(predictor, models, experiment_name, path, islog=True):
    """
    Save OOF predictions as a single DataFrame with experiment identifier in column names.

    Parameters
    ----------
    predictor : object
        Must provide ``predict_oof(model=...)`` returning index-aligned
        predictions (e.g. a fitted ``TabularPredictor``).
    models : list of str
        Model names to collect out-of-fold predictions for.
    experiment_name : str
        Prefix for each column (``f"{experiment_name}_{model}"``) and part of
        the output file name.
    path : str
        Directory for the CSV file; created if it does not exist.
    islog : bool, default True
        When True the predictions are treated as log10 values and converted
        back with ``10 ** prediction``.

    Returns
    -------
    pandas.DataFrame
        One column per model, indexed like ``predictor.predict_oof()``.
    """
    # Create DataFrame with index from training data
    oof_df = pd.DataFrame(index=predictor.predict_oof().index)

    # Add OOF predictions for each model with experiment identifier
    total = len(models)
    for position, model in enumerate(models, start=1):
        # 1-based counter so the last line reads total/total
        print(f"Generating OOF predictions for {model} - {position}/{total}")
        oof_preds = predictor.predict_oof(model=model)
        if islog:
            oof_preds = np.power(10, oof_preds)
        oof_df[f"{experiment_name}_{model}"] = oof_preds

    # Save DataFrame; make sure the target directory exists first
    if path:
        os.makedirs(path, exist_ok=True)
    filepath = os.path.join(path, f"oof_preds_{experiment_name}.csv")

    oof_df.to_csv(filepath)
    print(f"Saved {total} model predictions for experiment {experiment_name}")
    return oof_df

# Saving OOFs for later use
# Derive the folder from `base_path` so it follows DATA_FOLDER_PATH like every
# other file in this notebook (with the default it is still "Data/oofs/").
oofs_path = os.path.join(base_path, "oofs/")
os.makedirs(oofs_path, exist_ok=True)
models = leaderboard_test['model'].to_list()
experiment_name = "12nonlog"
oof_df = save_experiment_oofs(predictor, models, experiment_name, oofs_path, islog=False)

Generating OOF predictions for WeightedEnsemble_L3 - 0/26
Generating OOF predictions for WeightedEnsemble_L2 - 1/26
Generating OOF predictions for NeuralNetTorch_BAG_L1 - 2/26
Generating OOF predictions for NeuralNetTorch_r79_BAG_L1 - 3/26
Generating OOF predictions for NeuralNetTorch_BAG_L2 - 4/26
Generating OOF predictions for RandomForestMSE_BAG_L1 - 5/26
Generating OOF predictions for RandomForestMSE_BAG_L2 - 6/26
Generating OOF predictions for ExtraTreesMSE_BAG_L2 - 7/26
Generating OOF predictions for LightGBMXT_BAG_L2 - 8/26
Generating OOF predictions for XGBoost_BAG_L2 - 9/26
Generating OOF predictions for CatBoost_BAG_L2 - 10/26
Generating OOF predictions for CatBoost_r177_BAG_L2 - 11/26
Generating OOF predictions for LightGBM_BAG_L2 - 12/26
Generating OOF predictions for LightGBMLarge_BAG_L2 - 13/26
Generating OOF predictions for NeuralNetFastAI_BAG_L2 - 14/26
Generating OOF predictions for ExtraTreesMSE_BAG_L1 - 15/26
Generating OOF predictions for LightGBMLarge_BAG_L1 - 16/2