In [1]:
%%capture
%pip install setuptools wheel autogluon.tabular[all] dask[dataframe]
%pip install -U -q ipywidgets
%pip install -U scikit-learn

In [2]:
# Import basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
import cloudpickle
import plotly.io as pio
import plotly.graph_objects as go
from autogluon.core.metrics import make_scorer
import sklearn
from plotly.subplots import make_subplots
pd.options.plotting.backend = "plotly"
pio.templates.default = "simple_white"
warnings.filterwarnings('ignore')

# Import specific libraries
from autogluon.tabular import TabularDataset, TabularPredictor

In [3]:
# Colab only: mount Google Drive at /content/drive, where the default data
# folder used by the loading cell lives.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [16]:
# Data folder: taken from the DATA_FOLDER_PATH environment variable when set,
# otherwise the project folder on the mounted Google Drive.
base_path = os.getenv(
    'DATA_FOLDER_PATH',
    '/content/drive/MyDrive/DS_Projects/Playground_Series/Ps4e12_Regression_Insuranse_Premium_Prediction/Data/',
)


def _read_data_csv(relative_name):
    """Read one CSV file located under ``base_path``."""
    return pd.read_csv(os.path.join(base_path, relative_name))


# Competition files plus the original dataset.
train = _read_data_csv('train.csv')
test = _read_data_csv('test.csv')
submission = _read_data_csv('sample_submission.csv')
original = _read_data_csv('Insurance Premium Prediction Dataset.csv')

# Out-of-fold / test predictions from earlier experiments, used for stacking.
train_oofs = _read_data_csv('oofs/top_oofs_models.csv')
test_oofs = _read_data_csv('oofs/test_oofs.csv')

Let's prepare the datasets.

In [17]:
# Use the competition `id` column as the row index of both competition frames.
for frame in (train, test):
    frame.set_index('id', inplace=True)

# Normalise column names to lower snake_case so all three frames agree.
for frame in (train, test, original):
    frame.columns = [name.lower().replace(" ", "_") for name in frame.columns]

# Keep only the columns train has (in train's order) and drop rows of the
# original dataset that have no target value.
original = original[train.columns].dropna(subset=['premium_amount'])

train

Unnamed: 0_level_0,age,gender,annual_income,marital_status,number_of_dependents,education_level,occupation,health_score,location,policy_type,previous_claims,vehicle_age,credit_score,insurance_duration,policy_start_date,customer_feedback,smoking_status,exercise_frequency,property_type,premium_amount
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,Premium,2.0,17.0,372.0,5.0,2023-12-23 15:21:39.134960,Poor,No,Weekly,House,2869.0
1,39.0,Female,31678.0,Divorced,3.0,Master's,,15.569731,Rural,Comprehensive,1.0,12.0,694.0,2.0,2023-06-12 15:21:39.111551,Average,Yes,Monthly,House,1483.0
2,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177549,Suburban,Premium,1.0,14.0,,3.0,2023-09-30 15:21:39.221386,Good,Yes,Weekly,House,567.0
3,21.0,Male,141855.0,Married,2.0,Bachelor's,,10.938144,Rural,Basic,1.0,0.0,367.0,1.0,2024-06-12 15:21:39.226954,Poor,Yes,Daily,Apartment,765.0
4,21.0,Male,39651.0,Single,1.0,Bachelor's,Self-Employed,20.376094,Rural,Premium,0.0,8.0,598.0,4.0,2021-12-01 15:21:39.252145,Poor,Yes,Weekly,House,2022.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1199995,36.0,Female,27316.0,Married,0.0,Master's,Unemployed,13.772907,Urban,Premium,,5.0,372.0,3.0,2023-05-03 15:21:39.257696,Poor,No,Daily,Apartment,1303.0
1199996,54.0,Male,35786.0,Divorced,,Master's,Self-Employed,11.483482,Rural,Comprehensive,,10.0,597.0,4.0,2022-09-10 15:21:39.134960,Poor,No,Weekly,Apartment,821.0
1199997,19.0,Male,51884.0,Divorced,0.0,Master's,,14.724469,Suburban,Basic,0.0,19.0,,6.0,2021-05-25 15:21:39.106582,Good,No,Monthly,Condo,371.0
1199998,55.0,Male,,Single,1.0,PhD,,18.547381,Suburban,Premium,1.0,7.0,407.0,4.0,2021-09-19 15:21:39.190215,Poor,No,Daily,Apartment,596.0


# **Feature Engineering**

Let's do some feature engineering with help from `claude.ai`.


In [18]:
def create_date_features(df, reference_date=None):
    """
    Derive calendar features from the ``policy_start_date`` column.

    The frame is modified in place (new columns are added to ``df``) and the
    same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``policy_start_date`` column that ``pd.to_datetime``
        can parse.
    reference_date : datetime-like, optional
        Date that ``policy_age_days`` is counted back from. Defaults to the
        latest ``policy_start`` found in ``df``. Pass the training set's
        reference date when transforming test data so both frames measure
        policy age from the same origin.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with the date feature columns added.
    """
    # Basic date features
    start = pd.to_datetime(df['policy_start_date'])
    df['policy_start'] = start
    df['year'] = start.dt.year
    df['month'] = start.dt.month
    df['day'] = start.dt.day
    df['week_of_year'] = start.dt.isocalendar().week.astype('int')
    df['day_of_week'] = start.dt.day_name()
    df['month_name'] = start.dt.month_name()
    df['quarter'] = start.dt.quarter

    # Cyclical encoding.
    # NOTE(review): a period of 1 for 'year' makes year_sin ~ 0 and
    # year_cos ~ 1 for every row (whole multiples of 2*pi), so those two
    # columns carry no signal; they are kept so the output column set is
    # unchanged for downstream code.
    for col, max_val in [('year', 1), ('month', 12), ('day', 31)]:
        angle = 2 * np.pi * df[col] / max_val
        df[f'{col}_sin'] = np.sin(angle)
        df[f'{col}_cos'] = np.cos(angle)

    # Binary flags
    df['is_weekend'] = start.dt.dayofweek.isin([5, 6]).astype(int)
    df['is_month_end'] = start.dt.is_month_end.astype(int)
    df['is_month_start'] = start.dt.is_month_start.astype(int)
    df['is_quarter_end'] = start.dt.is_quarter_end.astype(int)
    df['is_quarter_start'] = start.dt.is_quarter_start.astype(int)

    # Time-based calculations
    if reference_date is None:
        reference_date = start.max()
    else:
        reference_date = pd.Timestamp(reference_date)
    df['policy_age_days'] = (reference_date - start).dt.days
    # Vectorised equivalent of the former row-wise apply: days 1-7 -> week 1, ...
    df['week_of_month'] = (df['day'] - 1) // 7 + 1
    df['days_in_month'] = start.dt.days_in_month
    df['days_remaining_in_month'] = df['days_in_month'] - df['day']

    # Seasonal mapping (meteorological seasons, northern hemisphere)
    season_map = {12: 'winter', 1: 'winter', 2: 'winter',
                  3: 'spring', 4: 'spring', 5: 'spring',
                  6: 'summer', 7: 'summer', 8: 'summer',
                  9: 'fall', 10: 'fall', 11: 'fall'}
    df['season'] = df['month'].map(season_map)

    return df

In [19]:
def create_advanced_features(df, is_training=True):
    """
    Create advanced features for insurance premium prediction.

    Every statistic that has to be shared between train and test (score
    means/stds, income-bracket edges, vehicle-age maximum, policy-age
    reference date, per-location averages) is fitted when ``is_training=True``
    and stored in the module-level ``scale_params`` dict. With
    ``is_training=False`` the stored values are reused, so the test frame is
    encoded exactly like the training frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame with the lower snake_case columns used below
        (``annual_income``, ``health_score``, ``policy_start_date``, ...).
        It is copied; the caller's frame is not modified.
    is_training : bool, default True
        Fit and store the shared statistics from ``df`` when True; reuse the
        stored ones when False.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the engineered feature columns appended.

    Raises
    ------
    RuntimeError
        If called with ``is_training=False`` before any training call has
        populated ``scale_params``.
    """
    global scale_params
    df = df.copy()

    # Store scaling factors during training
    if is_training:
        scale_params = {
            'health_score_mean': df['health_score'].mean(),
            'health_score_std': df['health_score'].std(),
            'credit_score_mean': df['credit_score'].mean(),
            'credit_score_std': df['credit_score'].std(),
            'customer_feedback_map': {
                'Poor': 0.0,    # Higher risk
                'Average': 0.5, # Medium risk
                'Good': 1.0     # Lower risk
            },
            'exercise_frequency_map': {
                'Rarely': 0.0,   # Highest risk
                'Monthly': 0.33, # High risk
                'Weekly': 0.66,  # Low risk
                'Daily': 1.0     # Lowest risk
            },
            'smoking_map': {
                'Yes': 1.0,  # Higher risk
                'No': 0.0    # Lower risk
            },
            'marital_risk_map': {
                'Single': 1.0,    # Base risk
                'Married': 0.8,   # Lower risk (shared responsibility)
                'Divorced': 1.2   # Higher risk (potentially more financial stress)
            },
            'property_risk_map': {
                'Apartment': 1.0,  # Base risk
                'House': 1.5,     # Higher risk (more value/larger space)
                'Condo': 1.2      # Medium risk
            }
        }
    elif 'scale_params' not in globals():
        raise RuntimeError(
            "create_advanced_features(is_training=False) was called before the "
            "training data was transformed; run it with is_training=True first."
        )

    # 1. Date-based features
    df = create_date_features(df)
    # Measure policy age from the training set's latest start date for both
    # frames, instead of from each frame's own maximum.
    if is_training:
        scale_params['policy_reference_date'] = df['policy_start'].max()
    df['policy_age_days'] = (scale_params['policy_reference_date'] - df['policy_start']).dt.days

    # 2. Income-based features
    df['income_per_dependent'] = df['annual_income'] / (df['number_of_dependents'] + 1)
    income_labels = ['Very Low', 'Low', 'Medium', 'High', 'Very High']
    if is_training:
        # Fit the quintile edges once; the outer edges are opened to +/-inf so
        # test incomes outside the training range still land in a bracket.
        _, income_edges = pd.qcut(df['annual_income'], q=5, retbins=True)
        income_edges = np.array(income_edges, dtype=float)
        income_edges[0], income_edges[-1] = -np.inf, np.inf
        scale_params['income_bracket_edges'] = income_edges
    df['income_bracket'] = pd.cut(df['annual_income'],
                                  bins=scale_params['income_bracket_edges'],
                                  labels=income_labels)

    # 3. Risk Score Combinations with standardization
    # Standardize health and credit scores with the training mean/std
    df['health_score_std'] = (df['health_score'] - scale_params['health_score_mean']) / scale_params['health_score_std']
    df['credit_score_std'] = (df['credit_score'] - scale_params['credit_score_mean']) / scale_params['credit_score_std']

    # Combined risk score (both features are z-scores, i.e. on the same scale)
    df['total_risk_score'] = df['health_score_std'] + df['credit_score_std']

    # Claims ratio with insurance duration (+1 avoids division by zero)
    df['claims_to_duration_ratio'] = df['previous_claims'] / (df['insurance_duration'] + 1)

    # 4. Age-related interactions
    df['vehicle_to_driver_age_ratio'] = df['vehicle_age'] / df['age']
    df['is_young_driver'] = (df['age'] < 25).astype(int)
    df['is_senior_driver'] = (df['age'] > 65).astype(int)

    # 5. Lifestyle Score (weights sum to 1, so the score lies in 0-1)
    df['exercise_score'] = df['exercise_frequency'].map(scale_params['exercise_frequency_map'])
    df['smoking_risk'] = df['smoking_status'].map(scale_params['smoking_map'])
    df['lifestyle_score'] = (
        df['exercise_score'] * 0.4 +    # Exercise has significant impact
        (1 - df['smoking_risk']) * 0.4 + # Non-smoking is positive
        (df['health_score_std'] > 0) * 0.2  # Above average health is positive
    )

    # 6. Location-based features (per-location averages fitted on training data)
    if is_training:
        scale_params['location_risk_map'] = df.groupby('location')['previous_claims'].mean()
        scale_params['location_credit_map'] = df.groupby('location')['credit_score'].mean()

    df['location_risk'] = df['location'].map(scale_params['location_risk_map'])
    df['location_avg_credit'] = df['location'].map(scale_params['location_credit_map'])

    # 7. Complex Interaction Features
    df['customer_feedback_score'] = df['customer_feedback'].map(scale_params['customer_feedback_map'])

    # Weighted responsibility score. The feedback and claims terms are 0-1
    # scaled; the credit term is a z-score clipped to [-3, 3].
    df['responsibility_score'] = (
        df['credit_score_std'].clip(-3, 3) * 0.4 +  # Limit outlier effect
        df['customer_feedback_score'] * 0.3 +
        (1 - df['claims_to_duration_ratio'].clip(0, 1)) * 0.3  # Lower claims is better
    )

    # 8. Family and Property Risk
    df['marital_risk'] = df['marital_status'].map(scale_params['marital_risk_map'])
    df['property_risk'] = df['property_type'].map(scale_params['property_risk_map'])

    # Combined risk factors
    df['family_risk_factor'] = df['marital_risk'] * (df['number_of_dependents'] + 1)
    if is_training:
        scale_params['vehicle_age_max'] = df['vehicle_age'].max()
    df['asset_risk'] = (
        df['property_risk'] * 0.6 +
        (df['vehicle_age'] / scale_params['vehicle_age_max']) * 0.4  # Normalized by training max
    )

    # 9. Customer Segment Features
    df['premium_segment'] = 'Standard'
    mask_premium = (
        (df['credit_score_std'] > 1) &  # Above 1 std in credit
        (df['previous_claims'] == 0) &   # No claims
        (df['health_score_std'] > 1)     # Above 1 std in health
    )
    mask_high_risk = (
        (df['credit_score_std'] < -1) |  # Below 1 std in credit
        (df['previous_claims'] > 3)       # Multiple claims
    )

    # High Risk is assigned last, so it wins when both masks match a row.
    df.loc[mask_premium, 'premium_segment'] = 'Premium'
    df.loc[mask_high_risk, 'premium_segment'] = 'High Risk'

    # 10. Additional Ratio Features
    # claims_per_year has the same definition as claims_to_duration_ratio; the
    # column is kept so the output schema does not change.
    df['claims_per_year'] = df['claims_to_duration_ratio']
    df['dependent_income_ratio'] = df['number_of_dependents'] / df['annual_income']

    # Drop intermediate columns
    intermediate_cols = ['health_score_std', 'credit_score_std', 'exercise_score',
                        'smoking_risk', 'customer_feedback_score', 'marital_risk',
                        'property_risk']
    df = df.drop(columns=[col for col in intermediate_cols if col in df.columns])

    return df

In [20]:
# Build the engineered frames. Train must run first: it fits the shared
# statistics that the test transformation reuses.
try:
    # Transform training data
    train_transformed = create_advanced_features(train, is_training=True)
    print("Training data transformation successful!")

    # Transform test data
    test_transformed = create_advanced_features(test, is_training=False)
    print("Test data transformation successful!")

except Exception as e:
    print(f"An error occurred: {str(e)}")
    print("Please check your data types and column names.")
    # Re-raise so the notebook stops here instead of failing later on an
    # undefined train_transformed / test_transformed.
    raise

Training data transformation successful!
Test data transformation successful!


In [21]:
# Replace the raw target with its log1p transform, then attach the predictions
# of earlier experiments as extra stacking features (aligned on the id index).
oof_columns = ['4log_WeightedEnsemble_L4', '12nonlog_WeightedEnsemble_L3']

train_transformed['premium_amount_log'] = np.log1p(train_transformed.pop('premium_amount'))

for frame, oof_source in ((train_transformed, train_oofs), (test_transformed, test_oofs)):
    frame[oof_columns] = oof_source.set_index("id")[oof_columns]

# **Autogluon training**

In [None]:
# Create the AutoGluon scorer using sklearn's implementation
#rmsle_scorer = make_scorer(
#    name='rmsle',
#    score_func=sklearn.metrics.root_mean_squared_log_error,
#    optimum=0,
#    greater_is_better=False,
#    needs_pred=True
#)

In [None]:
# Training configuration. The label is the log1p-transformed premium, so RMSE
# on it corresponds to RMSLE on the raw premium.
eval_metric = 'rmse'
label = 'premium_amount_log'
problem_type = 'regression'
excluded_model_types = ['KNN', 'RF', 'FASTAI']
hours = 12

# Predictor that stores its models under the given output folder.
predictor = TabularPredictor(
    label=label,
    problem_type=problem_type,
    eval_metric=eval_metric,
    path="/kaggle/working/Autogluon/202412_ps4s12_12hr_training_oofs",
)

# Bagged training within the time budget (converted from hours to seconds).
fit_settings = dict(
    time_limit=hours * 3600,
    presets="best_quality",
    excluded_model_types=excluded_model_types,
    num_bag_folds=5,
    num_bag_sets=2,
    ag_args_fit={'num_gpus': 1},
)
predictor.fit(train_data=train_transformed, **fit_settings)

Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.10.12
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP PREEMPT_DYNAMIC Thu Jun 27 21:05:47 UTC 2024
CPU Count:          8
Memory Avail:       46.03 GB / 50.99 GB (90.3%)
Disk Space Avail:   201.58 GB / 235.68 GB (85.5%)
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=5, num_bag_sets=2
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of the data. Then holdout validation data is used to detect stacked overfitting.
	Running DyStack for up to 10800s of 

# **Leaderboard and Submission**

In [None]:
# Leaderboard of the trained models. No dataset is passed, so `score_val` is
# AutoGluon's internal validation score, not a test-set score (despite the
# variable name).
leaderboard_test = predictor.leaderboard(silent=True)
leaderboard_test

Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,-1.044502,root_mean_squared_error,142.252649,2697.468294,0.015908,3.100807,2,True,22
1,LightGBM_BAG_L1,-1.044563,root_mean_squared_error,8.727121,258.000325,8.727121,258.000325,1,True,2
2,LightGBM_r131_BAG_L1,-1.044581,root_mean_squared_error,68.917408,831.18048,68.917408,831.18048,1,True,7
3,CatBoost_r50_BAG_L1,-1.044589,root_mean_squared_error,3.437553,264.182947,3.437553,264.182947,1,True,15
4,CatBoost_r70_BAG_L1,-1.044595,root_mean_squared_error,5.083509,276.157853,5.083509,276.157853,1,True,20
5,CatBoost_r9_BAG_L1,-1.044651,root_mean_squared_error,6.575855,289.730436,6.575855,289.730436,1,True,8
6,CatBoost_BAG_L1,-1.044652,root_mean_squared_error,2.274065,548.236788,2.274065,548.236788,1,True,3
7,CatBoost_r13_BAG_L1,-1.044652,root_mean_squared_error,2.260915,369.959164,2.260915,369.959164,1,True,12
8,CatBoost_r177_BAG_L1,-1.044654,root_mean_squared_error,1.612325,214.641087,1.612325,214.641087,1,True,6
9,CatBoost_r69_BAG_L1,-1.04466,root_mean_squared_error,2.096287,450.950129,2.096287,450.950129,1,True,18


In [None]:
# Pick the i-th model of the leaderboard (0 = best validation score).
i = 0
models = leaderboard_test['model'].to_list()
best_model = models[i]
# Look the score up by model name; the former chained `[...]['score_val'][i]`
# only worked while the row label happened to equal `i`.
best_score = leaderboard_test.loc[leaderboard_test['model'] == best_model, 'score_val'].iloc[0]
print("Best model by autogluon is", best_model)
print("With a score of RMSLE", np.abs(best_score))

training = "8hr_log_gpu"
sub_autogluon = submission.copy()
# The model predicts log1p(premium); expm1 maps it back to the premium scale.
log_predictions = predictor.predict(test_transformed, as_pandas=False, model=best_model)
sub_autogluon['Premium Amount'] = np.expm1(log_predictions)
sub_autogluon.to_csv('/kaggle/working/submission.csv', index=False)

Best model by autogluon is WeightedEnsemble_L2
With a score of RMSLE 1.0445016491259669


AttributeError: 'super' object has no attribute '__sklearn_tags__'

# **Saving OOFs**

In [None]:
def save_experiment_oofs(predictor, models, experiment_name, path, islog=True):
    """
    Save out-of-fold (OOF) predictions of several models as one parquet file.

    Parameters
    ----------
    predictor : fitted predictor
        Object exposing ``predict_oof(model=...)`` that returns a pandas
        Series indexed like the training data.
    models : list of str
        Model names to export; each becomes one column named
        ``"{experiment_name}_{model}"``.
    experiment_name : str
        Identifier used as column prefix and in the file name.
    path : str
        Folder the file ``oof_preds_{experiment_name}.parquet`` is written to.
    islog : bool, default True
        True when the predictor was trained on a ``log1p``-transformed label;
        predictions are then mapped back to the original scale with ``expm1``.

    Returns
    -------
    pandas.DataFrame
        The frame that was written, one column per model.
    """
    # Create DataFrame with index from training data
    oof_df = pd.DataFrame(index=predictor.predict_oof().index)

    # Add OOF predictions for each model with experiment identifier
    for i, model in enumerate(models, 1):
        print(f"Generating OOF predictions for {model} - {i}/{len(models)}")
        oof_preds = predictor.predict_oof(model=model)
        if islog:
            # The label is log1p(target), so the inverse is expm1 (not 10**x).
            oof_preds = np.expm1(oof_preds)
        oof_df[f"{experiment_name}_{model}"] = oof_preds

    # Save DataFrame
    filepath = os.path.join(path, f"oof_preds_{experiment_name}.parquet")
    oof_df.to_parquet(filepath)
    print(f"Saved {len(models)} model predictions for experiment {experiment_name}")
    return oof_df

# Persist the OOF predictions of every leaderboard model so later stacking
# experiments can load them.
oofs_path = "/kaggle/working/"
experiment_name = "8log"
models = leaderboard_test['model'].to_list()
oof_df = save_experiment_oofs(
    predictor=predictor,
    models=models,
    experiment_name=experiment_name,
    path=oofs_path,
)

In [None]:
# Write one submission file per top-5 leaderboard model.
training = "8hr_log_gpu"
top_models = leaderboard_test['model'].head(5).to_list()
val_scores = leaderboard_test.set_index('model')['score_val']

for model in top_models:
    print("\nModel by autogluon is", model)
    print("With a score of RMSLE", np.abs(val_scores[model]))

    sub_autogluon = submission.copy()
    # Predict with the model of this iteration (previously `best_model` was
    # passed, so every file contained the same predictions), then map the
    # log1p-scale output back to the premium scale.
    log_predictions = predictor.predict(test_transformed, as_pandas=False, model=model)
    sub_autogluon['Premium Amount'] = np.expm1(log_predictions)
    sub_autogluon.to_csv(f'/kaggle/working/sub_{model}_{training}.csv', index=False)