# 1. Modelling Functions

## 1.1 - Build Model

In [0]:
def build_models(input_data, valuation_date, development_period_end = 24, triangle_groups = ['channel', 'claim_type'], n_periods = 12):
    """
    Fit chain ladder, Bornhuetter-Ferguson and Cape Cod models as at one valuation date
    and return their ultimates joined to the reported-to-date and latest-view figures.

    Parameters:
    - input_data: DataFrame that must contain 'acc_month', 'obs_month', 'product_group',
      every column named in `triangle_groups`, and the measure columns listed in the
      triangle constructors below.
    - valuation_date: Rows whose 'obs_month' or 'acc_month' is later than this are hidden
      from the models; it is also written to the 'valuation_date' output column.
    - development_period_end: Development position from which the fitted LDFs/CDFs are
      overwritten with 1 (i.e. no further development is assumed).
    - triangle_groups: Columns used as the triangle index and as merge keys.
    - n_periods: Forwarded to cl.Development(n_periods=...).

    Returns:
    - DataFrame built by merging, on triangle_groups + ['acc_month'], the latest diagonal
      of the hidden triangle, the latest diagonal of the full triangle, and the
      `ultimate_` frame of each of the six fitted models, plus 'valuation_date' and
      'product_group'.

    Side effects:
    - Calls warnings.filterwarnings("ignore"), which is not scoped to this function.
    """
    # NOTE(review): `triangle_groups` has a mutable list default. It is only read here
    # (concatenation creates new lists), so it is not currently mutated.

    # Suppress all warnings
    # NOTE(review): this changes the warnings filter for the whole process, not just this call.
    warnings.filterwarnings("ignore")

    # 1. Data Processing
    # =================================

    # Keep only what was observable at the valuation date
    data_hidden = input_data[
        (input_data['obs_month'] <= valuation_date) &
        (input_data['acc_month'] <= valuation_date)
    ] 

    # Unfiltered copy, used only to derive the "latest view" actuals
    data_full = input_data.copy()

    # Create triangle on partial data
    triangle_combined = cl.Triangle(
        data_hidden,
        origin="acc_month",
        development="obs_month",
        columns=[
            'claim_count',
            'net_claim_incurred',
            'gross_claim_incurred',
            'net_claim_incurred_indexed',
            'gross_claim_incurred_indexed',
            'recoveries_indexed',
            'earnprem_indexed',
            'exposure',
            'recoveries',
            'earnprem'
        ],
        index=triangle_groups,
        cumulative=False
    ).incr_to_cum()

    # Transformations
    # Derived ratio columns: per-exposure frequency / cost and loss ratios per earned premium
    triangle_combined['frequency'] = triangle_combined['claim_count'] / triangle_combined['exposure']
    triangle_combined['gross_cost_per_policy'] = triangle_combined['gross_claim_incurred'] / triangle_combined['exposure']
    triangle_combined['gross_cost_per_policy_indexed'] = triangle_combined['gross_claim_incurred_indexed'] / triangle_combined['exposure']
    triangle_combined['net_cost_per_policy'] = triangle_combined['net_claim_incurred'] / triangle_combined['exposure']
    triangle_combined['net_cost_per_policy_indexed'] = triangle_combined['net_claim_incurred_indexed'] / triangle_combined['exposure']
    triangle_combined['net_loss_ratio'] = triangle_combined['net_claim_incurred'] / triangle_combined['earnprem']
    triangle_combined['net_loss_ratio_indexed'] = triangle_combined['net_claim_incurred_indexed'] / triangle_combined['earnprem_indexed']

    # ------------------------------------
    # FULL TRIANGLE (FOR ACTUAL RESULTS)
    # ------------------------------------
    triangle_combined_full = cl.Triangle(
        data_full,
        origin='acc_month',
        development='obs_month',
        columns=[
            'claim_count',
            'net_claim_incurred',
            'gross_claim_incurred',
            'net_claim_incurred_indexed',
            'gross_claim_incurred_indexed',
            'recoveries_indexed',
            'earnprem_indexed',
            'exposure',
            'recoveries',
            'earnprem'
        ],
        index=triangle_groups,
        cumulative=False,
    ).incr_to_cum()

    # Latest diagonal of the full triangle, renamed to latest_view_* columns.
    # NOTE(review): 'recoveries' is not among the selected columns, so its rename entry
    # below has no effect and no 'latest_view_recoveries' column is produced.
    output_actual_results = triangle_combined_full[
        [
            "net_claim_incurred",
            "gross_claim_incurred",
            "claim_count",
            "net_claim_incurred_indexed",
            "gross_claim_incurred_indexed",
            "recoveries_indexed",
        ]
    ].latest_diagonal.to_frame().reset_index().drop(
        columns=['valuation'] # Drop valuation and manually append after
    ).rename(
        columns={
            'origin' : 'acc_month',
            'claim_count': 'latest_view_claim_count',
            'net_claim_incurred': 'latest_view_net_claim_incurred',
            'gross_claim_incurred': 'latest_view_gross_claim_incurred',
            'recoveries': 'latest_view_recoveries',
            'net_claim_incurred_indexed': 'latest_view_net_claim_incurred_indexed',
            'gross_claim_incurred_indexed': 'latest_view_gross_claim_incurred_indexed',
            'recoveries_indexed': 'latest_view_recoveries_indexed'
        }
    )

    # 2.1 Model Training - Claim Count
    # =================================

    # Build initial development triangle and replace the development factors to assume fully developed in {development_period_end} periods
    # NOTE(review): the slice below presumably relies on ldf_/cdf_ values being laid out as
    # (index, column, origin, development) with a single origin row — TODO confirm.
    claim_count_development_factors  = cl.Development(n_periods=n_periods).fit_transform(triangle_combined[['claim_count','frequency']])
    for i in range(0, claim_count_development_factors.ldf_.values.shape[0]):
        claim_count_development_factors.ldf_.values[i][:,0,development_period_end:] = 1
        claim_count_development_factors.cdf_.values[i][:,0,development_period_end:] = 1

    # ------------------------------------
    # MODEL 1 - Chainladder
    # ------------------------------------
    claim_count_chainladder = cl.Chainladder().fit(claim_count_development_factors)

    # Exposure on the latest diagonal, normalised along axis 2
    weights = triangle_combined['exposure'].latest_diagonal
    weights /= np.sum(weights, axis=2, keepdims=True)

    # Weighted average of the chain ladder 'frequency' ultimate over the last 12 positions of axis 2.
    # NOTE(review): the window of 12 is hard-coded and independent of `n_periods`.
    apriori_claim_count = np.sum((claim_count_chainladder.ultimate_['frequency'] * weights).iloc[:, :, -12:, :], axis=2, keepdims=True) / np.sum(weights.iloc[:, :, -12:, :], axis=2, keepdims=True)

    # ------------------------------------
    # MODEL 2 - Bornhuetter-Ferguson
    # ------------------------------------
    # NOTE(review): fitted on the raw triangle rather than on claim_count_development_factors,
    # unlike the net incurred BF below — confirm this is intended.
    claim_count_bf = cl.BornhuetterFerguson(
        apriori=  apriori_claim_count
    ).fit(
        triangle_combined[['claim_count','exposure']]
        , sample_weight = triangle_combined['exposure'].latest_diagonal
    )

    # ------------------------------------
    # MODEL 3 - Cape Cod
    # ------------------------------------
    # NOTE(review): also fitted on the raw triangle, unlike the net incurred Cape Cod below.
    claim_count_cc = cl.CapeCod().fit(
        triangle_combined[['claim_count','exposure']],
        sample_weight=triangle_combined['exposure'].latest_diagonal
    )

    # Latest diagonal of the hidden triangle, renamed to reported_to_date_* columns.
    # 'exposure', 'earnprem' and 'earnprem_indexed' keep their original names.
    output_triangle = triangle_combined[
        [
            "exposure",
            "earnprem",
            "earnprem_indexed",
            "claim_count",
            "net_claim_incurred",
            "gross_claim_incurred",
            "recoveries",
            "net_claim_incurred_indexed",
            "gross_claim_incurred_indexed",
            "recoveries_indexed",
        ]
    ].latest_diagonal.to_frame().reset_index().drop(
        columns=['valuation']
    ).rename(
        columns={
            'origin' : 'acc_month',
            'claim_count': 'reported_to_date_claim_count',
            'net_claim_incurred': 'reported_to_date_net_claim_incurred',
            'gross_claim_incurred': 'reported_to_date_gross_claim_incurred',
            'recoveries': 'reported_to_date_recoveries',
            'net_claim_incurred_indexed': 'reported_to_date_net_claim_incurred_indexed',
            'gross_claim_incurred_indexed': 'reported_to_date_gross_claim_incurred_indexed',
            'recoveries_indexed': 'reported_to_date_recoveries_indexed'
        }
    )

    # 2.2 Model Training - Net Incurred
    # =================================

    # Build initial development triangle and replace the development factors to assume fully developed in {development_period_end} periods
    net_incurred_development_factors  = cl.Development(n_periods=n_periods).fit_transform(triangle_combined[['net_claim_incurred','net_cost_per_policy','net_loss_ratio']])
    for i in range(0, net_incurred_development_factors.ldf_.values.shape[0]):
        net_incurred_development_factors.ldf_.values[i][:,0,development_period_end:] = 1
        net_incurred_development_factors.cdf_.values[i][:,0,development_period_end:] = 1


    # ------------------------------------
    # MODEL 1 - Chainladder
    # ------------------------------------
    
    # Fit a chainladder model using the adjusted development factors from dev object
    net_incurred_chainladder = cl.Chainladder().fit(net_incurred_development_factors)

    # Earned premium on the latest diagonal, normalised along axis 2
    weights = triangle_combined['earnprem'].latest_diagonal
    weights /= np.sum(weights, axis=2, keepdims=True)

    # Weighted average of the chain ladder 'net_loss_ratio' ultimate over the last 12 positions of axis 2
    apriori_net_incurred = np.sum((net_incurred_chainladder.ultimate_['net_loss_ratio'] * weights).iloc[:, :, -12:, :], axis=2, keepdims=True) / np.sum(weights.iloc[:, :, -12:, :], axis=2, keepdims=True)

    # ------------------------------------
    # MODEL 2 - Bornhuetter-Ferguson
    # ------------------------------------

    net_incurred_bf = cl.BornhuetterFerguson(
        apriori=apriori_net_incurred
    ).fit(
        net_incurred_development_factors, 
        sample_weight=triangle_combined['earnprem'].latest_diagonal
    )

    # ------------------------------------
    # MODEL 3 - Cape Cod
    # ------------------------------------

    net_incurred_cc = cl.CapeCod().fit(
        net_incurred_development_factors,  # Use the same dev factors!
        sample_weight=triangle_combined['earnprem'].latest_diagonal
    )

    # 3. Joins to Final Dataframe
    # =================================

    # ------------------------------------
    # FULL TRIANGLE - ACTUAL RESULTS
    # ------------------------------------

    # Left join keeps only the origins visible at the valuation date; 'exposure' is
    # dropped from the reported-to-date frame before the model outputs are merged in.
    output_results = pd.merge(
        output_triangle,
        output_actual_results,
        left_on  = triangle_groups + ['acc_month'],
        right_on =  triangle_groups + ['acc_month'],
        how='left'
    ).drop(
        columns=['exposure']
    )

    # ------------------------------------
    # COUNT - CHAIN LADDER
    # ------------------------------------
    # NOTE(review): unlike the net incurred frames, the count frames drop no columns before
    # merging; any column they share with output_results beyond the merge keys would be
    # suffixed _x/_y by pandas — TODO confirm the resulting column set.
    output_claim_count_chainladder = claim_count_chainladder.ultimate_.to_frame().reset_index(
    ).rename(
        columns={
            'claim_count': 'ultimate_claim_count_chainladder'
            ,'origin' : 'acc_month'
        }
    )

    output_results = pd.merge(
        output_results,
        output_claim_count_chainladder,
        left_on  = triangle_groups + ['acc_month'],
        right_on =  triangle_groups + ['acc_month'],
        how='outer'
    )

    # ------------------------------------
    # NET INCURRED - CHAIN LADDER
    # ------------------------------------

    # NOTE(review): 'net_loss_ratio' is not dropped here, whereas the BF and Cape Cod
    # frames below do drop it — confirm the chain ladder loss ratio is meant to be kept.
    output_net_incurred_chainladder = net_incurred_chainladder.ultimate_.to_frame().reset_index().drop(
        columns=['valuation', 'net_cost_per_policy']
    ).rename(
        columns={
            'net_claim_incurred': 'ultimate_net_incurred_chainladder',
            'origin': 'acc_month'
        }
    )

    output_results = pd.merge(
        output_results,
        output_net_incurred_chainladder,
        left_on = triangle_groups + ['acc_month'],
        right_on = triangle_groups + ['acc_month'],
        how='outer'
    )

    # ------------------------------------
    # COUNT - BORNHUETTER-FERGUSON
    # ------------------------------------
    output_claim_count_bf = claim_count_bf.ultimate_.to_frame().reset_index(
    ).rename(
        columns={
            'claim_count': 'ultimate_claim_count_bf'
            ,'origin' : 'acc_month'
        }
    )

    output_results = pd.merge(
        output_results,
        output_claim_count_bf,
        left_on  = triangle_groups + ['acc_month'],
        right_on =  triangle_groups + ['acc_month'],
        how='outer'
    )

    # ------------------------------------
    # NET INCURRED - BORNHUETTER-FERGUSON
    # ------------------------------------

    output_net_incurred_bf = net_incurred_bf.ultimate_.to_frame().reset_index().drop(
        columns=['valuation', 'net_cost_per_policy', 'net_loss_ratio']
    ).rename(
        columns={
            'net_claim_incurred': 'ultimate_net_incurred_bf',
            'origin': 'acc_month'
        }
    )

    output_results = pd.merge(
        output_results,
        output_net_incurred_bf,
        left_on = triangle_groups + ['acc_month'],
        right_on = triangle_groups + ['acc_month'],
        how='outer'
    )

    # ------------------------------------
    # COUNT - CAPE COD
    # ------------------------------------
    output_claim_count_cc = claim_count_cc.ultimate_.to_frame().reset_index().rename(
        columns={
            'claim_count': 'ultimate_claim_count_cc',
            'origin': 'acc_month'
        }
    )

    output_results = pd.merge(
        output_results,
        output_claim_count_cc,
        left_on=triangle_groups + ['acc_month'],
        right_on=triangle_groups + ['acc_month'],
        how='outer'
    )

    # ------------------------------------
    # NET INCURRED - CAPE COD
    # ------------------------------------

    output_net_incurred_cc = net_incurred_cc.ultimate_.to_frame().reset_index().drop(
        columns=['valuation', 'net_cost_per_policy', 'net_loss_ratio']
    ).rename(
        columns={
            'net_claim_incurred': 'ultimate_net_incurred_cc',
            'origin': 'acc_month'
        }
    )

    output_results = pd.merge(
        output_results,
        output_net_incurred_cc,
        left_on=triangle_groups + ['acc_month'],
        right_on=triangle_groups + ['acc_month'],
        how='outer'
    )
    
    # Final Output
    # Stamp every row with the valuation date this run was built for
    output_results['valuation_date'] = pd.to_datetime(valuation_date)

    # Aggregate required columns from the original data
    additional_fields = input_data.groupby(triangle_groups + ['acc_month'], as_index=False).agg({
        'product_group': 'first'  # Assuming 'product_group' is constant within each group
    })

    # Merge into the final output
    output_results = pd.merge(
        output_results,
        additional_fields,
        on=triangle_groups + ['acc_month'],
        how='left'
    )

    return output_results

## 1.2 - Parallel Runs

In [0]:
def parallel_runs(input_data, triangle_groups, valuation_dates, development_period_end=None, n_periods=None):
    """
    Run `build_models` once per valuation date in parallel and stack the results.

    Parameters:
    - input_data: DataFrame forwarded unchanged to `build_models`.
    - triangle_groups: Grouping columns forwarded to `build_models`.
    - valuation_dates: Iterable of valuation dates; one model run is made per date.
    - development_period_end (optional): Forwarded to `build_models`. When omitted, a
      module-level `development_period_end` is used if one is defined, otherwise 24.
    - n_periods (optional): Forwarded to `build_models`. When omitted, a module-level
      `n_periods` is used if one is defined, otherwise 12.

    Returns:
    - pd.DataFrame: Row-wise concatenation of every run (fresh RangeIndex), or an empty
      DataFrame when `valuation_dates` is empty.
    """
    # These two settings used to be read as bare names that are neither parameters nor
    # locals, so the function only worked when same-named globals happened to exist.
    # Keep honouring such globals for existing notebooks, but fall back to the
    # `build_models` defaults instead of raising NameError.
    if development_period_end is None:
        development_period_end = globals().get('development_period_end', 24)
    if n_periods is None:
        n_periods = globals().get('n_periods', 12)

    # Materialise once so emptiness can be checked and tqdm can show a total
    valuation_dates = list(valuation_dates)
    if not valuation_dates:
        # pd.concat raises on an empty list; an empty frame is the natural "no runs" result
        return pd.DataFrame()

    # Run in parallel
    valuation_data = Parallel(n_jobs=-1)(
        delayed(build_models)(
            input_data,
            date,
            development_period_end,
            triangle_groups,
            n_periods
        )
        for date in tqdm(valuation_dates, desc="Processing valuation dates")
    )

    # Combine results into a single DataFrame
    return pd.concat(valuation_data, ignore_index=True)

# 2. Transformation Functions

## 2.1 - Post-shaping

In [0]:
def reshape_forecast_output(
    df: pd.DataFrame,
    response_prefixes: "dict | None" = None,
    id_columns: "list | None" = None
) -> pd.DataFrame:
    """
    Reshapes ultimate model output DataFrame into long format with actual and predicted values.
    Handles both claim count and net incurred amounts.

    Parameters:
    - df: Input DataFrame.
    - response_prefixes: Dictionary mapping response prefixes to their metadata including:
        - response_type: Label for type of response ('count', 'net_incurred', etc.)
        - actual_col: Column name representing actual value
        - reported_col: Column name representing reported-to-date value
      Defaults to the claim count and net incurred configuration defined below.
    - id_columns: Identifying columns to retain (any iterable of names).
      Defaults to ['acc_month', 'valuation_date', 'channel', 'claim_type'].

    Returns:
    - A tidy DataFrame with columns:
      id_columns + ['model', 'actual', 'predicted', 'latest_view', 'reported_to_date', 'response'].
      'actual' and 'predicted' are the latest-view / ultimate values net of the
      reported-to-date value. One block of rows is produced per column starting with a
      response prefix; 'model' is the text after that column's last underscore.
      The frame is empty (with the same columns) when nothing could be reshaped.
    """
    # Defaults are built per call rather than in the signature so they can never be
    # shared (and accidentally mutated) across calls.
    if response_prefixes is None:
        response_prefixes = {
            'ultimate_claim_count': {
                'response_type': 'count',
                'actual_col': 'latest_view_claim_count',
                'reported_col': 'reported_to_date_claim_count'
            },
            'ultimate_net_incurred': {
                'response_type': 'net_incurred',
                'actual_col': 'latest_view_net_claim_incurred',
                'reported_col': 'reported_to_date_net_claim_incurred'
            }
        }
    if id_columns is None:
        id_columns = ['acc_month', 'valuation_date', 'channel', 'claim_type']
    else:
        # Accept tuples and other iterables; list concatenation below needs a list
        id_columns = list(id_columns)

    output_columns = id_columns + ['model', 'actual', 'predicted', 'latest_view', 'reported_to_date', 'response']

    # Every model block needs all id columns, so a missing one means nothing can be reshaped
    if not all(col in df.columns for col in id_columns):
        return pd.DataFrame(columns=output_columns)

    result_dfs = []

    # Process each response prefix type
    for response_prefix, config in response_prefixes.items():
        response_type = config['response_type']
        actual_col = config['actual_col']
        reported_col = config['reported_col']

        # Skip if required columns are not in the DataFrame
        if actual_col not in df.columns or reported_col not in df.columns:
            continue

        # One long-format block per ultimate column carrying this prefix
        for col_name in df.columns:
            if not col_name.startswith(response_prefix):
                continue

            model_df = df[id_columns].copy()
            # Model label is the trailing token, e.g. 'ultimate_claim_count_bf' -> 'bf'
            model_df['model'] = col_name.split('_')[-1]
            model_df['actual'] = df[actual_col] - df[reported_col]
            model_df['predicted'] = df[col_name] - df[reported_col]
            # Standardised names so blocks for different responses stack cleanly
            model_df['latest_view'] = df[actual_col]
            model_df['reported_to_date'] = df[reported_col]
            model_df['response'] = response_type
            result_dfs.append(model_df)

    # If no results were found, return empty DataFrame with correct columns
    if not result_dfs:
        return pd.DataFrame(columns=output_columns)

    # Combine all results and ensure consistent column order
    return pd.concat(result_dfs, ignore_index=True)[output_columns]

## 2.2 - Indexation

In [0]:
def indexation(data, cpi, indexation_date, join_column, indexation_columns, suffix='_indexed', unindex=False):
    """
    Apply indexation (e.g., CPI adjustment) to specified columns in a dataset based on a reference date.

    Parameters:
    - data (pd.DataFrame): Input DataFrame containing the data to be indexed. It is not modified.
    - cpi (pd.DataFrame): Index table with a 'quarter' column (matched against dates) and a
      'cpi' column (the index level).
    - indexation_date (str or pd.Timestamp): Date used as the reference for indexation (e.g., valuation date).
    - join_column (str): Column in `data` representing time (e.g., transaction date) to merge with CPI data.
    - indexation_columns (list): List of column names in `data` to apply indexation to.
    - suffix (str, optional): Suffix to append to indexed column names. Defaults to '_indexed'.
    - unindex (bool, optional): If True, reverses indexation (multiplies by cpi_txn / cpi_valuation);
                                if False, applies it (multiplies by cpi_valuation / cpi_txn).
                                Defaults to False.

    Returns:
    - pd.DataFrame: Copy of `data` with the new suffixed columns added and temporary columns
      removed. Rows are ordered by `join_column` and the index is a fresh RangeIndex
      (a consequence of `pd.merge_asof`).

    Raises:
    - ValueError: If columns with the `suffix` already exist in `data`.
    - Warning: If `unindex=True` and `suffix='_indexed'` (to avoid naming confusion).
    """
    # Create new column names by appending the suffix to the original column names
    indexed_names = [col + suffix for col in indexation_columns]

    # Validate up front so bad arguments fail before any merge work is done
    for col in indexed_names:
        if col in data.columns:
            raise ValueError(f"Column {col} already exists in data.")
    if unindex and suffix == '_indexed':
        # Deliberately raised (not warnings.warn): this is the documented contract
        raise Warning("Unindexing is enabled. Please change the suffix from '_indexed' to avoid confusion")

    # Create a copy of the input DataFrame to avoid modifying the original
    data = data.copy()

    # Add a column for the indexation reference date (converted to datetime)
    data['cpi_valuation_quarter'] = pd.to_datetime(indexation_date)

    cpi_by_quarter = cpi.sort_values('quarter')

    # CPI at each row's own date: nearest CPI quarter to `join_column`.
    # A stable sort keeps rows with equal dates in their input order.
    data = pd.merge_asof(
        data.sort_values(join_column, kind='mergesort'),
        cpi_by_quarter,
        left_on=join_column,
        right_on='quarter',
        direction='nearest'
    ).drop(columns=['quarter']).rename(columns={'cpi': 'cpi_txn'})

    # CPI at the indexation date. The key column is constant, so the sort must be stable;
    # otherwise the row order established above could be scrambled.
    data = pd.merge_asof(
        data.sort_values('cpi_valuation_quarter', kind='mergesort'),
        cpi_by_quarter,
        left_on='cpi_valuation_quarter',
        right_on='quarter',
        direction='nearest'
    ).drop(columns=['quarter']).rename(columns={'cpi': 'cpi_valuation'})

    # Calculate the indexation factor based on whether we're indexing or unindexing
    if unindex:
        indexation_factor = data['cpi_txn'] / data['cpi_valuation']
    else:
        indexation_factor = data['cpi_valuation'] / data['cpi_txn']

    # Drop temporary columns used for calculation
    data = data.drop(columns=['cpi_txn', 'cpi_valuation', 'cpi_valuation_quarter'])

    # Apply the indexation factor to the specified columns and create new columns with the suffix
    data[indexed_names] = data[indexation_columns].multiply(indexation_factor, axis=0)

    return data

# 3. Diagnostic Functions

## 3.1 - Plotting Function

In [0]:
import plotly.express as px
import plotly.graph_objects as go
import plotly.offline as pyo
from plotly.subplots import make_subplots
import pandas as pd

import warnings

def plot_claims_AVE(output, mode='net_incurred', risk_class=None, channel=None, prem_class=None, claim_type=None, models=None):
    """
    Plot claims Actual vs Expected (predicted) by valuation date, with an MAE ranking table.

    Parameters:
    -----------
    output : pandas DataFrame
        Long-format data with at least 'response', 'valuation_date', 'model', 'actual' and
        'predicted' columns, plus any column that is filtered on.
    mode : str, optional
        'count' or 'net_incurred', defaults to 'net_incurred'
    risk_class : str, optional
        Filter by specific risk class
    channel : str, optional
        Filter by specific channel
    prem_class : str, optional
        Filter by specific premium class
    claim_type : str, optional
        Filter by specific claim type
    models : list, optional
        List of models to include

    Returns:
    --------
    plotly Figure with the chart (row 1) and the MAE / filter table (row 2), or None
    (after printing a message) when no rows match the filters.

    Raises:
    -------
    ValueError
        If `mode` is not 'count' or 'net_incurred'.
    """
    if mode not in ['count', 'net_incurred']:
        raise ValueError("Invalid mode. Available modes are: 'count', 'net_incurred'.")

    # Suppress warnings
    # NOTE(review): this alters the process-wide warnings filter, not just this call.
    warnings.filterwarnings("ignore")

    # Filter data
    filtered_data = output[(output['response'] == mode)].copy()

    # (column, selected value, display label) for each optional segment filter
    segment_filters = [
        ('risk_class', risk_class, 'Risk Class'),
        ('channel', channel, 'Channel'),
        ('prem_class', prem_class, 'Premium Class'),
        ('claim_type', claim_type, 'Claim Type'),
    ]
    for column, selected, _ in segment_filters:
        if selected:
            filtered_data = filtered_data[filtered_data[column] == selected]
    if models:
        filtered_data = filtered_data[filtered_data['model'].isin(models)]

    # Convert valuation date to datetime
    filtered_data['valuation_date'] = pd.to_datetime(filtered_data['valuation_date'])

    # Group by valuation date and model, calculate sum of actual and predicted
    grouped_data = filtered_data.groupby(['valuation_date', 'model']).agg({
        'actual': 'sum',
        'predicted': 'sum'
    }).reset_index()

    if grouped_data.empty:
        print("No data available for the selected filters.")
        return

    model_labels = grouped_data['model'].unique()

    # Actuals are plotted once, taken from the first model's rows
    actual_values = grouped_data[grouped_data['model'] == model_labels[0]][['valuation_date', 'actual']]

    # Define cutoff date for MAE calculation (6 months from the latest date)
    latest_date = grouped_data['valuation_date'].max()
    cutoff_date = latest_date - pd.DateOffset(months=6)

    # Calculate MAE excluding last 6 months
    mae_results = {}
    for model in model_labels:
        model_data = grouped_data[grouped_data['model'] == model]
        mae_data = model_data[model_data['valuation_date'] <= cutoff_date]
        if not mae_data.empty:
            mae_results[model] = abs(mae_data['actual'] - mae_data['predicted']).mean()

    # Create a subplot with 2 rows - one for the chart and one for the table
    fig = make_subplots(
        rows=2,
        cols=1,
        row_heights=[0.75, 0.25],
        vertical_spacing=0.15,
        specs=[[{"type": "scatter"}], [{"type": "table"}]]
    )

    # Add actual values trace to the first subplot
    fig.add_trace(
        go.Scatter(
            x=actual_values['valuation_date'],
            y=actual_values['actual'],
            mode='lines+markers',
            name='Actual',
            line=dict(color='black', width=2),
            hovertemplate='%{x}<br>Actual: %{y:.2f}'
        ),
        row=1, col=1
    )

    # Define color mapping for consistent colors across models.
    # 'bf' and 'cc' are the labels reshape_forecast_output derives from the
    # 'ultimate_*_bf' / 'ultimate_*_cc' columns; without them those models never
    # matched the long-form keys and fell back to plotly's default colours.
    model_colors = {
        'chainladder': 'blue',
        'bornhuetter-ferguson': 'red',
        'bf': 'red',
        'cape-cod': 'green',
        'cc': 'green',
        'munich': 'purple'
    }

    # Add predicted values traces for each model to the first subplot
    for model in model_labels:
        model_data = grouped_data[grouped_data['model'] == model]
        color = model_colors.get(model, None)

        fig.add_trace(
            go.Scatter(
                x=model_data['valuation_date'],
                y=model_data['predicted'],
                mode='lines+markers',
                name=f'ibnr - {model}',
                line=dict(color=color) if color else {},  # Unknown models use plotly's default colour
                hovertemplate='%{x}<br>Predicted: %{y:.2f}'
            ),
            row=1, col=1
        )

    # Build title with filtering information
    title_parts = [f'Claim {mode.capitalize()}: Actual vs Predicted']
    title_parts.extend(f'{label}: {selected}' for _, selected, label in segment_filters if selected)
    title = ' | '.join(title_parts)

    # Sort models by MAE to determine ranking (best first)
    ranked_models = sorted(mae_results.items(), key=lambda item: item[1])
    model_names = [model for model, _ in ranked_models]
    mae_values = [f"{mae:.2f}" for _, mae in ranked_models]
    rankings = [f"#{i+1}" for i in range(len(ranked_models))]

    # Filter information shown alongside the ranking
    filter_names = ["Mode"]
    filter_values = [mode]
    for _, selected, label in segment_filters:
        if selected:
            filter_names.append(label)
            filter_values.append(selected)
    if models:
        filter_names.append("Models")
        filter_values.append(", ".join(models) if isinstance(models, list) else models)
    filter_names.append("MAE Cutoff Date")
    filter_values.append(cutoff_date.strftime('%Y-%m-%d'))

    # The ranking columns and the filter columns have different lengths; pad the shorter
    # side so every table column has the same number of cells.
    table_rows = max(len(ranked_models), len(filter_names))

    def _padded(cells, filler):
        return cells + [filler] * (table_rows - len(cells))

    ranking_fill = _padded(['lavender'] * len(ranked_models), 'white')
    filter_fill = _padded(['ghostwhite'] * len(filter_names), 'white')

    # Add MAE table to the second subplot - including ranking column
    fig.add_trace(
        go.Table(
            header=dict(
                values=['Ranking', 'Model', 'MAE (excluding last 6 months)', 'Filter', 'Value'],
                fill_color='paleturquoise',
                align='left',
                font=dict(size=12)
            ),
            cells=dict(
                values=[
                    _padded(rankings, ""),
                    _padded(model_names, ""),
                    _padded(mae_values, ""),
                    _padded(filter_names, ""),
                    _padded(filter_values, "")
                ],
                fill_color=[
                    ranking_fill,
                    list(ranking_fill),
                    list(ranking_fill),
                    filter_fill,
                    list(filter_fill)
                ],
                align='left',
                font=dict(size=11)
            )
        ),
        row=2, col=1
    )

    # Add zero line to the first subplot
    fig.add_shape(
        type="line",
        x0=actual_values['valuation_date'].min(),
        y0=0,
        x1=actual_values['valuation_date'].max(),
        y1=0,
        line=dict(color="gray", width=1, dash="dash"),
        row=1, col=1
    )

    # Update layout with more height and adjusted margins
    fig.update_layout(
        title=title,
        height=800,
        margin=dict(t=100, b=50, l=50, r=50),
        template='plotly_white',
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1
        )
    )

    # Update xaxis and yaxis for the first subplot
    fig.update_xaxes(
        title_text='Valuation Date',
        tickangle=-45,
        row=1, col=1
    )

    fig.update_yaxes(
        title_text=f'Claim {mode.capitalize()}',
        row=1, col=1
    )

    # Annotate the best model only when there is more than one to compare
    if len(mae_results) > 1:
        best_model, best_mae = ranked_models[0]
        fig.add_annotation(
            x=0.07,  # Paper coordinates: bottom left of the chart area
            y=0.45,
            xref="paper",
            yref="paper",
            text=f"Best model: {best_model} (MAE: {best_mae:.2f})",
            showarrow=False,
            font=dict(size=11, color="green"),
            align="left",
            bgcolor="rgba(255, 255, 255, 0.8)",  # Semi-transparent white background
            bordercolor="green",
            borderwidth=1,
            borderpad=4,
            xanchor="left",
            yanchor="bottom"
        )

    # The figure is returned rather than shown; the caller decides how to render it
    return fig

## 3.2 - Method Selection

In [0]:
def calculate_mae_excl_last_6_months(data, actual_col, pred_cols, groupby_cols, date_col='valuation_date'):
    """Pick the prediction column with the lowest MAE per group, ignoring the latest 6 months.

    For every group defined by ``groupby_cols`` the actual and predicted
    columns are summed per date, dates later than (latest date - 6 months)
    are discarded, and the mean absolute error of each prediction column
    against ``actual_col`` is computed over the remaining dates.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``actual_col``, every column in ``pred_cols``, the
        grouping columns and ``date_col``. It is not modified.
    actual_col : str
        Column holding the values the predictions are compared against.
    pred_cols : list of str
        Prediction columns to rank.
    groupby_cols : str or list of str
        Column(s) identifying the groups a model is selected for.
    date_col : str, default 'valuation_date'
        Column holding the dates; parsed with ``pd.to_datetime``.

    Returns
    -------
    pandas.DataFrame
        One row per group with the grouping columns, one ``mae_<col>`` column
        per prediction column and ``best_model`` (name of the prediction
        column with the smallest MAE). When no group is evaluated the frame is
        empty but still carries those columns, so callers can select
        ``best_model`` without a KeyError.

    Notes
    -----
    If a group has no dates at or before the cutoff (less than six months of
    history) every MAE is NaN and ``best_model`` is the first entry of
    ``pred_cols``, because ``min`` never finds a strictly smaller key.
    """
    # Accept a bare column name as well as a list of names.
    group_names = [groupby_cols] if isinstance(groupby_cols, str) else list(groupby_cols)
    pred_cols = list(pred_cols)
    expected_columns = [*group_names, *(f'mae_{col}' for col in pred_cols), 'best_model']

    if not pred_cols:
        # Nothing to rank.
        return pd.DataFrame(columns=expected_columns)

    # De-duplicate while keeping order so the column selection below is unambiguous.
    value_cols = list(dict.fromkeys([actual_col, *pred_cols]))

    results = []
    for keys, group in data.groupby(groupby_cols):
        # Build a fresh frame instead of assigning into the groupby slice,
        # which avoids SettingWithCopyWarning.
        per_date = group[value_cols].assign(valuation_date=pd.to_datetime(group[date_col]))

        # Total actual and predicted values per date
        per_date = per_date.groupby('valuation_date').sum().reset_index()

        # Keep only dates at least six months older than the latest one
        cutoff_date = per_date['valuation_date'].max() - pd.DateOffset(months=6)
        history = per_date[per_date['valuation_date'] <= cutoff_date]

        # MAE per prediction column
        mae_dict = {
            col: np.abs(history[actual_col] - history[col]).mean()
            for col in pred_cols
        }

        # Lowest MAE wins; ties (and all-NaN) resolve to the earliest column in pred_cols
        best_model = min(mae_dict, key=mae_dict.get)

        key_values = keys if isinstance(keys, tuple) else [keys]
        results.append({
            **dict(zip(group_names, key_values)),
            **{f'mae_{col}': val for col, val in mae_dict.items()},
            'best_model': best_model
        })

    if not results:
        return pd.DataFrame(columns=expected_columns)

    return pd.DataFrame(results)


In [0]:
def get_best_method(input_df, metric, groupby_cols):
    """Select the lowest-MAE projection method per group for one metric.

    The chainladder, bf and cc ultimate columns for ``metric`` are scored
    against the matching ``latest_view_*`` column by
    ``calculate_mae_excl_last_6_months``. The result gains a ``response``
    label ('count' for 'claim_count', otherwise 'net_incurred') and a
    ``model`` column holding the text after the last underscore of
    ``best_model``.
    """
    is_count = metric == 'claim_count'

    # Column holding the actuals the candidate methods are scored against
    actual_col = 'latest_view_claim_count' if is_count else 'latest_view_net_claim_incurred'

    # One candidate prediction column per method
    candidate_cols = [
        'ultimate_' + metric + '_' + method
        for method in ('chainladder', 'bf', 'cc')
    ]

    best_method = calculate_mae_excl_last_6_months(
        input_df,
        actual_col=actual_col,
        pred_cols=candidate_cols,
        groupby_cols=groupby_cols,
        date_col='valuation_date'
    )

    best_method['response'] = 'count' if is_count else 'net_incurred'

    # Reduce e.g. 'ultimate_claim_count_bf' to 'bf'
    method_parts = best_method['best_model'].str.split("_")
    best_method['model'] = method_parts.str[-1]

    return best_method

# 4. Model Loop

## 4.1 - Parameter Checks

## 4.2 - Compare Changes

## 4.3 - Loop through products

In [0]:
# Per-product pipeline: index the model data, run the models, pick the best
# method per aggregation group, combine with claims and exposure, apply a CPI
# adjustment and overwrite the product's "<product_short>_ultimates_new" table.
# Relies on names defined elsewhere in the notebook: product_configs,
# cpi_file_path, last_day_previous_month, valuation_dates,
# latest_balance_date_str, cpi_by_quarter, spark, indexation, parallel_runs and
# reshape_forecast_output.
for i, product in enumerate(product_configs, start=1):
    print(f"\n=== [{i}/{len(product_configs)}] Processing Product: {product} ===")

    # 1. Setup
    # =================================
    print("     [1/8] Retrieving Configurations")
    curr_product_short = product_configs[product]["product_short"]
    curr_claim_data = product_configs[product]["claim_data"]
    curr_expo_data = product_configs[product]["expo_data"]
    curr_model_data = product_configs[product]["model_data"]
    # Aggregation levels: the main level first, followed by the sub levels
    curr_aggLevels = [product_configs[product]["main_level"]] + product_configs[product]["sub_levels"]

    # 2. Indexation
    # =================================
    print("     [2/8] Column Indexation")
    cols_to_index = ['earnprem', 'net_claim_incurred', 'gross_claim_incurred', 'recoveries']
    # Work on a copy so the configured model data is left untouched
    temp_df1 = curr_model_data.copy()
    temp_df1[cols_to_index] = temp_df1[cols_to_index].astype(float)
    
    # Read cpi file and format dataframe
    # NOTE(review): the file is re-read on every iteration although neither the
    # path nor the parsing depends on the product - could be hoisted above the loop.
    cpi_df = pd.read_csv(cpi_file_path)
    cpi_df['quarter'] = pd.to_datetime(cpi_df['quarter'], format="%d/%m/%Y")

    # indexation() is defined elsewhere; presumably it adds the '*_indexed'
    # columns used by build_models - TODO confirm
    curr_indexed_model_data = indexation(temp_df1, cpi=cpi_df, indexation_date=last_day_previous_month, join_column='obs_month', indexation_columns=cols_to_index) 

    # 3. Modelling
    # =================================
    print("     [3/8] Modelling")
    combined_df = parallel_runs(input_data=curr_indexed_model_data, triangle_groups=curr_aggLevels, valuation_dates=valuation_dates)

    # 4. Reshape Output
    # =================================
    print("     [4/8] Reshaping Output")
    # The merges below rely on 'model', 'response' and 'predicted' columns in
    # `result`; presumably reshape_forecast_output produces them - TODO confirm
    result = reshape_forecast_output(
        df=combined_df,
        response_prefixes={
            'ultimate_claim_count': {
                'response_type': 'count',
                'actual_col': 'latest_view_claim_count',
                'reported_col': 'reported_to_date_claim_count'
            },
            'ultimate_net_incurred': {
                'response_type': 'net_incurred',
                'actual_col': 'latest_view_net_claim_incurred',
                'reported_col': 'reported_to_date_net_claim_incurred'
            }
        },
        id_columns=['acc_month', 'valuation_date'] + curr_aggLevels
    )

    # 5. Best Method Selection
    # =================================
    print("     [5/8] Best Method Selection")
    # One selection per response, each at the current aggregation levels
    claim_count_best_method = get_best_method(combined_df, 'claim_count', curr_aggLevels)
    net_incurred_best_method = get_best_method(combined_df, 'net_incurred', curr_aggLevels)

    # Union best method dataframes to be used for subsetting
    best_method_union = pd.concat([claim_count_best_method, net_incurred_best_method]).drop_duplicates().reset_index(drop=True)

    # Get best model results
    # (inner join keeps only rows whose model/response pair was selected for the group)
    bm_results = result.merge(
        best_method_union[curr_aggLevels + ['model', 'response']],
        on=curr_aggLevels + ['model', 'response'],
        how='inner'
    )

    # Retrieve latest valuation date for selected models
    bm_results = bm_results[bm_results['valuation_date'] == latest_balance_date_str]

    # 6. Transformations
    # =================================
    print("     [6/8] Transformations")
    # Consolidation
    # Outer join: keeps claim totals without a prediction and predictions without claims
    ultimates_pre = pd.merge(
        curr_claim_data.groupby(['acc_month'] + curr_aggLevels)[['claim_count', 'net_claim_incurred']].sum().reset_index(),
        bm_results,
        on=['acc_month'] + curr_aggLevels,
        how='outer'
    )

    # Merge with expo_data
    # (default inner join, so rows without matching exposure are dropped)
    ultimates_pre = pd.merge(ultimates_pre, curr_expo_data, on=['acc_month'] + ['channel']) # + curr_aggLevels - TEMPORARY, USE CHANNEL ONLY, expo data should match granularity of aggregation

    # Fill missing values and convert to int64 for relevant columns
    # NOTE(review): the int64 cast truncates any fractional part of these columns.
    columns_to_fix = ['net_claim_incurred', 'claim_count', 'predicted',  'exposure']
    ultimates_pre[columns_to_fix] = ultimates_pre[columns_to_fix].fillna(0).astype('int64')
    ultimates_pre['product'] = product

    # Pivoting
    # 'earnprem' is not created in this loop; presumably it comes from
    # curr_expo_data - TODO confirm
    pivot_indexes = ['acc_month', 'product', 'claim_count', 'net_claim_incurred', 'earnprem', 'exposure'] + curr_aggLevels

    # Pivoting predicted values
    # No aggfunc is passed, so pivot_table's default aggregation applies to duplicates
    predicted_pivot = ultimates_pre.pivot_table(index=pivot_indexes, 
                                    columns='response', 
                                    values='predicted').reset_index()
    predicted_pivot.columns.name = None
    predicted_pivot = predicted_pivot.rename(columns={
        'count': 'ibnr_count',
        'net_incurred': 'ibnr_incurred'
    })

    # Pivoting model values
    # aggfunc='first' because 'model' holds names, not numbers
    model_pivot = ultimates_pre.pivot_table(index=pivot_indexes, 
                                columns='response', 
                                values='model', aggfunc='first').reset_index()
    model_pivot.columns.name = None
    model_pivot = model_pivot.rename(columns={
        'count': 'count_model',
        'net_incurred': 'incurred_model'
    })

    # Merge the two pivoted DataFrames
    ultimates_df = pd.merge(predicted_pivot, model_pivot, on=pivot_indexes)

    # Create ultimate count and ultimate incurred columns
    # NOTE(review): 'predicted' is treated as IBNR here although it is sourced
    # from 'ultimate_*' columns in step 4 - confirm reshape_forecast_output
    # returns IBNR rather than the ultimate itself.
    ultimates_df['ultimate_count'] = ultimates_df['claim_count'] + ultimates_df['ibnr_count']
    ultimates_df['ultimate_incurred'] = ultimates_df['net_claim_incurred'] + ultimates_df['ibnr_incurred']

    # 7. Inflation Adjustment
    # =================================
    print("     [7/8] Inflation Adjustment")
    temp_df2 = ultimates_df.copy()

    # Add Quarter to Ultimates Table
    # NOTE(review): the trailing str.replace('Q', 'Q') swaps 'Q' for itself and has no effect.
    temp_df2['quarter'] = temp_df2['acc_month'].dt.to_period('Q').astype(str).str.replace('Q', 'Q', regex=False)

    # CPI
    # Uses cpi_by_quarter (defined elsewhere), not the cpi_df read in step 2
    ultimates_with_cpi = temp_df2.merge(cpi_by_quarter[['quarter', 'cpi']], on='quarter', how='left')

    # Base CPI
    # NOTE(review): [-2] raises IndexError with fewer than two distinct quarters,
    # and .values[0] raises IndexError if that quarter is missing from cpi_by_quarter.
    base_cpi_quarter = sorted(ultimates_with_cpi['quarter'].unique())[-2] # Get second-last quarter in dataset
    base_cpi_value = cpi_by_quarter.loc[cpi_by_quarter['quarter'] == base_cpi_quarter, 'cpi'].values[0]
    ultimates_with_cpi['base_cpi'] = base_cpi_value

    # Index Multiplier
    # Ratio of base-quarter CPI to the accident quarter's CPI (NaN where the left join found no CPI)
    ultimates_with_cpi['index_multiplier'] = ultimates_with_cpi['base_cpi'] / ultimates_with_cpi['cpi']

    # Adjusted Incurreds
    ultimates_with_cpi['adj_ultimate_incurred'] = ultimates_with_cpi['ultimate_incurred'] * ultimates_with_cpi['index_multiplier']
    ultimates_with_cpi['adj_net_claim_incurred'] = ultimates_with_cpi['net_claim_incurred'] * ultimates_with_cpi['index_multiplier']

    # 8. Output to Table
    # =================================
    print("     [8/8] Writing to databricks table")
    # Replaces the table contents and schema on every run
    spark.createDataFrame(ultimates_with_cpi) \
        .write \
        .format("delta") \
        .mode("overwrite") \
        .option("overwriteSchema", "true") \
        .saveAsTable("actuaries_prd.general." + curr_product_short.lower() + "_ultimates_new")

    print(f"         - Saved to: actuaries_prd.general.{curr_product_short.lower()}_ultimates_new")