# **Autogluon 4hr log tpu**

In [None]:
%%capture
%pip install setuptools wheel autogluon.tabular[all,skex] dask[dataframe]
%pip install -U -q ipywidgets
%pip install -U scikit-learn

In [None]:
# Import basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
import cloudpickle
import plotly.io as pio
import plotly.graph_objects as go
from autogluon.core.metrics import make_scorer
import sklearn
from plotly.subplots import make_subplots
pd.options.plotting.backend = "plotly"
pio.templates.default = "simple_white"
warnings.filterwarnings('ignore')

# Import specific libraries
from autogluon.tabular import TabularDataset, TabularPredictor

In [None]:
# Mount Google Drive at /content/drive (Colab only); the default data and
# model folders used further down are located under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
#base_path = os.getenv('DATA_FOLDER_PATH', 'Data/')
# Folder that holds the input CSVs; the DATA_FOLDER_PATH env var overrides the default.
base_path = os.getenv(
    'DATA_FOLDER_PATH',
    '/content/drive/MyDrive/DS_Projects/Playground_Series/Ps4e12_Regression_Insuranse_Premium_Prediction/Data/',
)

# Read the four input files in one pass; the order of the file names matches
# the order of the names being assigned.
train, test, submission, original = (
    pd.read_csv(os.path.join(base_path, csv_name))
    for csv_name in (
        'train.csv',
        'test.csv',
        'sample_submission.csv',
        'Insurance Premium Prediction Dataset.csv',
    )
)

In [None]:
# Use the 'id' column as the row index of the competition frames.
train = train.set_index('id')
test = test.set_index('id')

# Renaming columns for consistency: lower-case, spaces replaced by underscores.
for frame in (train, test, original):
    frame.columns = frame.columns.str.lower().str.replace(" ", "_")

# Keep only the columns train has, and only the rows that have a premium_amount.
original = original[train.columns].dropna(subset=['premium_amount'])

# **Feature Engineering**

In [None]:
def create_date_features(df, reference_date=None):
    """
    Add calendar features derived from ``policy_start_date`` to ``df``.

    The frame passed in is modified in place and is also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``policy_start_date`` column that ``pd.to_datetime``
        can parse.
    reference_date : datetime-like, optional
        Date that ``policy_age_days`` is counted back from. When omitted, the
        latest ``policy_start`` found in ``df`` is used (the original
        behaviour). Pass the same value for every frame that must share a
        common origin (e.g. train and test).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the new columns added.
    """
    # Basic date features
    df['policy_start'] = pd.to_datetime(df['policy_start_date'])
    df['year'] = df['policy_start'].dt.year
    df['month'] = df['policy_start'].dt.month
    df['day'] = df['policy_start'].dt.day
    df['week_of_year'] = df['policy_start'].dt.isocalendar().week.astype('int')
    df['day_of_week'] = df['policy_start'].dt.day_name()
    df['month_name'] = df['policy_start'].dt.month_name()
    df['quarter'] = df['policy_start'].dt.quarter

    # Cyclical encoding
    # NOTE(review): with a period of 1 the year terms are sin/cos of a whole
    # number of turns, i.e. ~0 and ~1 plus floating-point noise. They are kept
    # only so the set of output columns stays unchanged.
    for col, max_val in [('year', 1), ('month', 12), ('day', 31)]:
        angle = 2 * np.pi * df[col] / max_val
        df[f'{col}_sin'] = np.sin(angle)
        df[f'{col}_cos'] = np.cos(angle)

    # Binary flags
    df['is_weekend'] = df['policy_start'].dt.dayofweek.isin([5,6]).astype(int)
    df['is_month_end'] = df['policy_start'].dt.is_month_end.astype(int)
    df['is_month_start'] = df['policy_start'].dt.is_month_start.astype(int)
    df['is_quarter_end'] = df['policy_start'].dt.is_quarter_end.astype(int)
    df['is_quarter_start'] = df['policy_start'].dt.is_quarter_start.astype(int)

    # Time-based calculations
    # Anchor the policy age to an explicit date when one is supplied, so that
    # different frames are not each measured against their own latest date.
    if reference_date is None:
        reference_date = df['policy_start'].max()
    else:
        reference_date = pd.Timestamp(reference_date)
    df['policy_age_days'] = (reference_date - df['policy_start']).dt.days
    # Days 1-7 -> week 1, 8-14 -> week 2, ... (vectorised, no per-row lambda)
    df['week_of_month'] = (df['day'] - 1) // 7 + 1
    df['days_in_month'] = df['policy_start'].dt.days_in_month
    df['days_remaining_in_month'] = df['days_in_month'] - df['day']

    # Seasonal mapping
    season_map = {12:'winter', 1:'winter', 2:'winter',
                  3:'spring', 4:'spring', 5:'spring',
                  6:'summer', 7:'summer', 8:'summer',
                  9:'fall', 10:'fall', 11:'fall'}
    df['season'] = df['month'].map(season_map)

    return df

In [None]:
def create_advanced_features(df, is_training=True):
    """
    Create advanced features for insurance premium prediction.

    Every statistic that has to be identical for train and test (score
    means/stds, income bin edges, policy-age reference date, vehicle-age
    maximum, per-location averages) is fitted when ``is_training=True`` and
    stored in the module-level ``scale_params`` dict. A call with
    ``is_training=False`` reuses those stored values instead of recomputing
    them on the frame it is given.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied, the caller's object is not modified.
    is_training : bool, default True
        Fit and store the shared statistics (True) or reuse them (False).

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the engineered columns added and the intermediate
        helper columns removed.

    Raises
    ------
    NameError
        If called with ``is_training=False`` before any training call has
        populated ``scale_params``.
    """
    global scale_params
    df = df.copy()

    # Store scaling factors during training
    if is_training:
        scale_params = {
            'health_score_mean': df['health_score'].mean(),
            'health_score_std': df['health_score'].std(),
            'credit_score_mean': df['credit_score'].mean(),
            'credit_score_std': df['credit_score'].std(),
            'customer_feedback_map': {
                'Poor': 0.0,    # Higher risk
                'Average': 0.5, # Medium risk
                'Good': 1.0     # Lower risk
            },
            'exercise_frequency_map': {
                'Rarely': 0.0,   # Highest risk
                'Monthly': 0.33, # High risk
                'Weekly': 0.66,  # Low risk
                'Daily': 1.0     # Lowest risk
            },
            'smoking_map': {
                'Yes': 1.0,  # Higher risk
                'No': 0.0    # Lower risk
            },
            'marital_risk_map': {
                'Single': 1.0,    # Base risk
                'Married': 0.8,   # Lower risk (shared responsibility)
                'Divorced': 1.2   # Higher risk (potentially more financial stress)
            },
            'property_risk_map': {
                'Apartment': 1.0,  # Base risk
                'House': 1.5,     # Higher risk (more value/larger space)
                'Condo': 1.2      # Medium risk
            }
        }
    elif 'scale_params' not in globals():
        # Same exception type the unguarded lookup would raise, but with an
        # actionable message.
        raise NameError(
            "scale_params is not defined; call "
            "create_advanced_features(train_df, is_training=True) first"
        )

    # 1. Date-based features
    df = create_date_features(df)
    # create_date_features measures policy age against the latest date of the
    # frame it receives; re-anchor it to the training reference date so train
    # and test share the same origin.
    if is_training:
        scale_params['policy_reference_date'] = df['policy_start'].max()
    df['policy_age_days'] = (scale_params['policy_reference_date'] - df['policy_start']).dt.days

    # 2. Income-based features with proper scaling
    df['income_per_dependent'] = df['annual_income'] / (df['number_of_dependents'] + 1)
    income_labels = ['Very Low', 'Low', 'Medium', 'High', 'Very High']
    if is_training:
        # Quintile edges are learned on the training frame only.
        df['income_bracket'], income_edges = pd.qcut(
            df['annual_income'], q=5, labels=income_labels, retbins=True)
        # Open the outer edges so incomes outside the training range still
        # land in the first/last bracket instead of becoming NaN.
        income_edges = income_edges.astype(float)
        income_edges[0], income_edges[-1] = -np.inf, np.inf
        scale_params['income_bin_edges'] = income_edges
    else:
        df['income_bracket'] = pd.cut(
            df['annual_income'], bins=scale_params['income_bin_edges'], labels=income_labels)

    # 3. Risk Score Combinations with standardization
    # Standardize health and credit scores
    df['health_score_std'] = (df['health_score'] - scale_params['health_score_mean']) / scale_params['health_score_std']
    df['credit_score_std'] = (df['credit_score'] - scale_params['credit_score_mean']) / scale_params['credit_score_std']

    # Combined risk score (now both features are on same scale)
    df['total_risk_score'] = df['health_score_std'] + df['credit_score_std']

    # Claims ratio with insurance duration
    df['claims_to_duration_ratio'] = df['previous_claims'] / (df['insurance_duration'] + 1)

    # 4. Age-related interactions
    df['vehicle_to_driver_age_ratio'] = df['vehicle_age'] / df['age']
    df['is_young_driver'] = (df['age'] < 25).astype(int)
    df['is_senior_driver'] = (df['age'] > 65).astype(int)

    # 5. Lifestyle Score (normalized to 0-1 range)
    df['exercise_score'] = df['exercise_frequency'].map(scale_params['exercise_frequency_map'])
    df['smoking_risk'] = df['smoking_status'].map(scale_params['smoking_map'])
    df['lifestyle_score'] = (
        df['exercise_score'] * 0.4 +    # Exercise has significant impact
        (1 - df['smoking_risk']) * 0.4 + # Non-smoking is positive
        (df['health_score_std'] > 0) * 0.2  # Above average health is positive
    )

    # 6. Location-based features
    if is_training:
        scale_params['location_risk_map'] = df.groupby('location')['previous_claims'].mean()
        scale_params['location_credit_map'] = df.groupby('location')['credit_score'].mean()

    df['location_risk'] = df['location'].map(scale_params['location_risk_map'])
    df['location_avg_credit'] = df['location'].map(scale_params['location_credit_map'])

    # 7. Complex Interaction Features
    df['customer_feedback_score'] = df['customer_feedback'].map(scale_params['customer_feedback_map'])

    # Weighted responsibility score: the credit term is a z-score clipped to
    # [-3, 3]; the feedback and claims terms lie in [0, 1].
    df['responsibility_score'] = (
        df['credit_score_std'].clip(-3, 3) * 0.4 +  # Limit outlier effect
        df['customer_feedback_score'] * 0.3 +
        (1 - df['claims_to_duration_ratio'].clip(0, 1)) * 0.3  # Lower claims is better
    )

    # 8. Family and Property Risk
    df['marital_risk'] = df['marital_status'].map(scale_params['marital_risk_map'])
    df['property_risk'] = df['property_type'].map(scale_params['property_risk_map'])

    # Combined risk factors
    df['family_risk_factor'] = df['marital_risk'] * (df['number_of_dependents'] + 1)
    if is_training:
        # Normalising constant is fixed on the training frame.
        scale_params['vehicle_age_max'] = df['vehicle_age'].max()
    df['asset_risk'] = (
        df['property_risk'] * 0.6 +
        (df['vehicle_age'] / scale_params['vehicle_age_max']) * 0.4  # Normalized vehicle age
    )

    # 9. Customer Segment Features
    df['premium_segment'] = 'Standard'
    mask_premium = (
        (df['credit_score_std'] > 1) &  # Above 1 std in credit
        (df['previous_claims'] == 0) &   # No claims
        (df['health_score_std'] > 1)     # Above 1 std in health
    )
    mask_high_risk = (
        (df['credit_score_std'] < -1) |  # Below 1 std in credit
        (df['previous_claims'] > 3)       # Multiple claims
    )

    # High Risk is written last, so it wins when a row matches both masks.
    df.loc[mask_premium, 'premium_segment'] = 'Premium'
    df.loc[mask_high_risk, 'premium_segment'] = 'High Risk'

    # 10. Additional Ratio Features
    # NOTE: claims_per_year uses the same formula as claims_to_duration_ratio;
    # both columns are kept so the output schema is unchanged.
    df['claims_per_year'] = df['previous_claims'] / (df['insurance_duration'] + 1)
    df['dependent_income_ratio'] = df['number_of_dependents'] / df['annual_income']

    # Drop intermediate columns
    intermediate_cols = ['health_score_std', 'credit_score_std', 'exercise_score',
                        'smoking_risk', 'customer_feedback_score', 'marital_risk',
                        'property_risk']
    df = df.drop(columns=[col for col in intermediate_cols if col in df.columns])

    return df

In [None]:
try:
    # Transform training data (this call also fits the shared scale_params)
    train_transformed = create_advanced_features(train, is_training=True)
    print("Training data transformation successful!")

    # Transform test data with the parameters fitted on train
    test_transformed = create_advanced_features(test, is_training=False)
    print("Test data transformation successful!")

except Exception as e:
    print(f"An error occurred: {str(e)}")
    print("Please check your data types and column names.")
    # Re-raise so the notebook stops here; otherwise later cells fail with a
    # confusing NameError on train_transformed / test_transformed.
    raise

In [None]:
# One more transformation: add the log10 of the premium as its own column
# (used as the training label further down).
log_premium = np.log10(train_transformed['premium_amount'])
train_transformed['premium_amount_log'] = log_premium

# **Autogluon Train**

In [None]:
# # Create the AutoGluon scorer using sklearn's implementation
# rmsle_scorer = make_scorer(
#     name='rmsle',
#     score_func=sklearn.metrics.root_mean_squared_log_error,
#     optimum=0,
#     greater_is_better=False,
#     needs_pred=True
# )

In [None]:
# Setting up
eval_metric = 'rmse'
label = 'premium_amount_log'
problem_type='regression'
hours = 4
# Model artifacts get their own override (AUTOGLUON_FOLDER_PATH). Falling back
# to DATA_FOLDER_PATH keeps the previous behaviour for existing setups, where
# setting that variable also redirected the models into the data folder.
autogluon_path = os.getenv(
    'AUTOGLUON_FOLDER_PATH',
    os.getenv('DATA_FOLDER_PATH', '/content/drive/MyDrive/DS_Projects/Playground_Series/Ps4e12_Regression_Insuranse_Premium_Prediction/Autogluon/'),
)

# Initialize the TabularPredictor
predictor = TabularPredictor(label=label,
                             problem_type=problem_type,
                             eval_metric=eval_metric,
                             path = os.path.join(autogluon_path, "202412_ps4s12_4hr_training"))

# Fit the model on the log10 label; the raw premium_amount column is dropped
# so it is not available to the models as a feature.
predictor.fit(train_data=train_transformed.drop(columns="premium_amount"),
              time_limit=3600*hours,
              presets="best_quality",
              num_stack_levels=2,
              dynamic_stacking=False
)

Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.10.12
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP PREEMPT_DYNAMIC Thu Jun 27 21:05:47 UTC 2024
CPU Count:          96
Memory Avail:       326.30 GB / 334.56 GB (97.5%)
Disk Space Avail:   206.24 GB / 225.33 GB (91.5%)
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=2, num_bag_folds=8, num_bag_sets=1
Beginning AutoGluon training ... Time limit = 14400s
AutoGluon will save models to "/content/Autogluon/202412_ps4s12_4hr_training"
Train Data Rows:    1200000
Train Data Columns: 59
Label Column:       premium_amount_log
Problem Type:       regression
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    335051.20 MB
	Train Data (Original)  Memory Usage: 1465.39 MB (0.4% of available memory)
	Inferring data type of each feature based on

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x7884da037820>

# **Best Submission**

In [None]:
# Reload the predictor from the folder the training cell wrote to.
saved_model_dir = os.path.join(autogluon_path, "202412_ps4s12_4hr_training")
predictor = TabularPredictor.load(saved_model_dir)

In [None]:
# No dataset is passed to leaderboard(), so despite the variable name the
# scores shown are the validation scores (the score_val column).
leaderboard_test = predictor.leaderboard(silent=True)
leaderboard_test

Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L4,-0.456175,root_mean_squared_error,592.019967,12168.401034,0.018989,5.205663,4,True,39
1,WeightedEnsemble_L3,-0.456184,root_mean_squared_error,436.102431,8028.720071,0.016997,1.734774,3,True,28
2,LightGBM_r131_BAG_L3,-0.456222,root_mean_squared_error,573.205613,11032.514372,3.781716,40.444908,3,True,37
3,CatBoost_BAG_L3,-0.456222,root_mean_squared_error,570.072577,11290.144512,0.64868,298.075048,3,True,32
4,LightGBM_BAG_L3,-0.456225,root_mean_squared_error,570.893403,11005.059917,1.469506,12.990453,3,True,30
5,CatBoost_r177_BAG_L3,-0.456227,root_mean_squared_error,569.875735,11032.793932,0.451839,40.724468,3,True,36
6,CatBoost_BAG_L2,-0.456238,root_mean_squared_error,350.13126,6992.651333,0.706634,959.211804,2,True,19
7,LightGBM_r131_BAG_L2,-0.456242,root_mean_squared_error,354.036047,6090.532742,4.611421,57.093213,2,True,24
8,CatBoost_r9_BAG_L2,-0.456242,root_mean_squared_error,350.168216,6204.375225,0.74359,170.935696,2,True,26
9,NeuralNetFastAI_BAG_L3,-0.456245,root_mean_squared_error,584.303429,11790.138528,14.879533,798.069064,3,True,34


In [None]:
models = leaderboard_test.head(5)['model'].to_list()
best_model = models[0]
print("Best model by autogluon is", models[0])
# Select the score positionally: a label-based [0] on the filtered Series only
# works when the best model happens to sit at index label 0.
best_score = leaderboard_test.loc[leaderboard_test['model'] == best_model, 'score_val'].iloc[0]
# NOTE(review): the value printed here is the RMSE on the log10-transformed
# target (eval_metric='rmse' on premium_amount_log), which is not the same
# quantity as an RMSLE computed with log1p.
print("With a score of RMSLE", np.abs(best_score))

training = "4hr_log_tpu"
sub_autogluon = submission.copy()
# Predictions come back on the log10 scale used for training; invert with 10**x
# and write them straight into the submission column (no temporary column).
log_predictions = predictor.predict(test_transformed, as_pandas=False, model=best_model)
sub_autogluon['Premium Amount'] = np.power(10, log_predictions)
sub_autogluon.to_csv(os.path.join(base_path, f'submission_{best_model}_{training}.csv'), index=False)

Best model by autogluon is WeightedEnsemble_L4
With a score of RMSLE 0.45617460214772926


# **Saving OOFs**

In [None]:
def save_experiment_oofs(predictor, models, experiment_name, path, islog=True):
    """
    Save OOF predictions as a single DataFrame with experiment identifier in column names

    Parameters
    ----------
    predictor : object
        Fitted predictor exposing ``predict_oof(model=...)``.
    models : list of str
        Model names to collect out-of-fold predictions for.
    experiment_name : str
        Prefix for every column name and part of the output file name.
    path : str
        Directory the parquet file is written to; created if it is missing.
    islog : bool, default True
        When True the predictions are treated as log10 values and converted
        back with ``10 ** prediction`` before being stored.

    Returns
    -------
    pandas.DataFrame
        One column per model, named ``{experiment_name}_{model}``.
    """
    # Create DataFrame with index from training data
    oof_df = pd.DataFrame(index=predictor.predict_oof().index)

    # Add OOF predictions for each model with experiment identifier
    for i, model in enumerate(models, 1):
        print(f"Generating OOF predictions for {model} - {i}/{len(models)}")
        oof_preds = predictor.predict_oof(model=model)
        if islog:
            oof_preds = np.power(10, oof_preds)
        col_name = f"{experiment_name}_{model}"
        oof_df[col_name] = oof_preds

    # Save DataFrame
    filename = f"oof_preds_{experiment_name}.parquet"
    filepath = os.path.join(path, filename)

    # to_parquet does not create parent folders; make sure the target exists.
    if path:
        os.makedirs(path, exist_ok=True)
    oof_df.to_parquet(filepath)
    print(f"Saved {len(models)} model predictions for experiment {experiment_name}")
    return oof_df

# Saving OOFs for later use
# OOF files get their own override (OOFS_FOLDER_PATH). Falling back to
# DATA_FOLDER_PATH keeps the previous behaviour for existing setups, where
# setting that variable also redirected the OOF files into the data folder.
oofs_path = os.getenv(
    'OOFS_FOLDER_PATH',
    os.getenv('DATA_FOLDER_PATH', '/content/drive/MyDrive/DS_Projects/Playground_Series/Ps4e12_Regression_Insuranse_Premium_Prediction/Data/oofs/'),
)
# Every model on the leaderboard, not just the top five used for the submission.
models = leaderboard_test['model'].to_list()
experiment_name = "4log"
oof_df = save_experiment_oofs(predictor, models, experiment_name, oofs_path)

Generating OOF predictions for WeightedEnsemble_L4 - 0/39
Generating OOF predictions for WeightedEnsemble_L3 - 1/39
Generating OOF predictions for LightGBM_r131_BAG_L3 - 2/39
Generating OOF predictions for CatBoost_BAG_L3 - 3/39
Generating OOF predictions for LightGBM_BAG_L3 - 4/39
Generating OOF predictions for CatBoost_r177_BAG_L3 - 5/39
Generating OOF predictions for CatBoost_BAG_L2 - 6/39
Generating OOF predictions for LightGBM_r131_BAG_L2 - 7/39
Generating OOF predictions for CatBoost_r9_BAG_L2 - 8/39
Generating OOF predictions for NeuralNetFastAI_BAG_L3 - 9/39
Generating OOF predictions for LightGBMXT_BAG_L3 - 10/39
Generating OOF predictions for LightGBM_BAG_L2 - 11/39
Generating OOF predictions for CatBoost_r177_BAG_L2 - 12/39
Generating OOF predictions for LightGBMLarge_BAG_L3 - 13/39
Generating OOF predictions for LightGBMLarge_BAG_L2 - 14/39
Generating OOF predictions for LightGBMXT_BAG_L2 - 15/39
Generating OOF predictions for LightGBM_r96_BAG_L2 - 16/39
Generating OOF pred