# 1. Global Setup

## 1.1 - Packages

### 1.1.1 - Installations

In [0]:
# Installs the notebook's dependencies with %pip. Only `sparse` carries a
# version pin; every other package resolves to whatever is current at run time.

# Core Data Handling
%pip install pandas --quiet
%pip install numpy --quiet
%pip install openpyxl --quiet

# Modeling and Statistical Analysis
%pip install statsmodels --quiet
%pip install pygam --quiet

# Actuarial Modelling
%pip install chainladder --quiet
%pip install sparse==0.15.5 --quiet  # Newer version conflicts with chainladder

# Performance and Parallel Processing
# NOTE(review): swifter is installed here but is not imported in the imports cell — confirm it is still needed.
%pip install swifter --quiet
%pip install joblib --quiet
%pip install tqdm --quiet


### 1.1.2 - Imports

In [0]:
# Core Libraries
import pandas as pd
import numpy as np
from decimal import Decimal
from datetime import datetime
import itertools
import warnings
from dateutil.relativedelta import relativedelta

# Visualisation
import matplotlib.pyplot as plt

# Statistical Modelling
import statsmodels.api as sm
import statsmodels.formula.api as smf
from pygam import PoissonGAM, GAM, s, f, te

# Actuarial Modelling
import chainladder as cl

# Performance and Parallel Processing
from tqdm import tqdm
from joblib import Parallel, delayed


### 1.1.3 - Global Parameters

In [0]:
# Modelling Parameters
# Maximum development month kept when the model data is filtered, and the
# development horizon handed to the model functions.
development_term = 24
# Balance date of the run. Hard-coded: edit for every new valuation month.
last_day_previous_month = datetime(2025, 5, 31)
last_n_month_lognormal = 12  # not referenced in the visible part of this notebook
# ISO date string interpolated into the SQL extracts of the product section.
latest_balance_date_str = last_day_previous_month.strftime('%Y-%m-%d')

# Valuation dates are month-ends from 2019-01-31 up to `exclude_last_n_month`
# months before the balance date, stored as 'YYYY-MM-DD' strings.
# NOTE(review): the 'ME' frequency alias is only accepted by recent pandas releases — confirm the cluster's pandas version.
exclude_last_n_month = 6
acc_month_start = pd.to_datetime('2017-01-01')
valuation_dates = pd.date_range(start="2019-01-31", end=last_day_previous_month - relativedelta(months=exclude_last_n_month), freq='ME').strftime('%Y-%m-%d').tolist()

# Indexation
# NOTE(review): cpi_file_path is not referenced anywhere in the visible notebook; the CPI series is read from the table below instead.
cpi_file_path = '/Volumes/actuaries_prd/general/ibnr/enrichment/cpi.csv'

# Quarterly CPI pulled into pandas with columns Date, cpi and quarter.
# `quarter` is a text label of the form '<year>Q<n>'.
# NOTE(review): indexation() as-of merges on a column named `quarter` against a date-like column;
# presumably this label is converted to a date before use — confirm.
cpi_by_quarter = spark.sql(f"""
    SELECT 
        Date,
        CPI as cpi,
        CONCAT(YEAR(Date), 'Q', QUARTER(Date)) AS quarter
    FROM actuaries_prd.reference_data.abs_quarterly_cpi
""").toPandas()

In [0]:
# Registry of per-product settings, keyed by product name
product_configs = {}

# Function to add (if product does not exist) or update (if product exists) configs
def update_product_config(config_dict, product_name, input_product_short=None, input_claim_data=None, input_expo_data=None, input_gwp_data=None, input_model_data=None, input_main_level=None, input_sub_levels=None):
    """
    Create or update the configuration entry of one product in place.

    Parameters:
    - config_dict: Dictionary of product configurations; mutated by this call.
    - product_name: Key of the product entry. An empty entry is created when absent.
    - input_*: Optional values. Each one that is not None is stored under the
      matching key (e.g. input_claim_data -> "claim_data"); None leaves any
      existing value untouched.

    Returns:
    - None
    """
    # Config key -> supplied value, in the order the keys are written
    supplied = {
        "product_short": input_product_short,
        "claim_data": input_claim_data,
        "expo_data": input_expo_data,
        "gwp_data": input_gwp_data,
        "model_data": input_model_data,
        "main_level": input_main_level,
        "sub_levels": input_sub_levels,
    }

    # Fetch the product's entry, creating an empty one on first sight
    entry = config_dict.setdefault(product_name, {})

    # Only arguments that were actually provided overwrite stored settings
    for setting, value in supplied.items():
        if value is not None:
            entry[setting] = value

# Function to check product_configs
def print_product_configs_summary(config_dict):
    """
    Print a readable overview of every product configuration.

    DataFrame values are summarised by their shape; any other value is
    printed as-is.

    Parameters:
    - config_dict: Dictionary mapping product name -> configuration dictionary.
    """
    for product_name in config_dict:
        print(f"\n Product: {product_name}")
        for setting, payload in config_dict[product_name].items():
            if not isinstance(payload, pd.DataFrame):
                print(f"  - {setting}: {payload}")
                continue
            # Printing a whole frame would flood the output, so show its shape only
            print(f"  - {setting}: DataFrame with shape {payload.shape}")

# 2. Product

## 2.2 - Householders

### 2.2.1 - App Parameters

In [0]:
# Product Name
# `product` is interpolated into the SQL filters of the claim extracts and is the key used in product_configs.
product = 'Householders'
product_short = 'HH'
# NOTE(review): folder_path is empty and is not referenced in the visible notebook.
folder_path = '' # path needs to be valid
# add version folder path for version management to product configurations

# Aggregation levels for the app parameters. These affect the current adequacy
# summary table: results are aggregated to the main level.
main_level = 'channel' # or segment (could be None)
sub_levels = ['premcls', 'claim_type'] # (could be None), no aggregation but one adequacy result

### 2.2.2 - Claim Data for IBNR

In [0]:
# Monthly claim movements for the IBNR view, pulled from Spark into pandas.
# One row per (acc_month, obs_month, dev_month, claim_type, product_group, channel):
#   - acc_month: month of g.loss_date
#   - obs_month: observation month, floored at acc_month
#   - dev_month: months from the loss month to the observation month, plus one, floored at 1
#   - channel:   g.cell_name, with NULL and 'Householders' both mapped to 'Direct'
# Unlike the adequacy extract, PremClass is commented out of this SELECT.
# `product` and `latest_balance_date_str` are interpolated directly into the SQL text.
# The WHERE clause filters on columns of g and c, so rows of `a` without a match
# in those LEFT JOINs are dropped.
# NOTE(review): no column of `h` is selected or filtered. If a policy number maps to several
# distinct ReferenceProductCode_Ext values this join would repeat claim rows and inflate the
# SUMs — confirm whether the join is still needed.
# NOTE(review): claim_data_for_ibnr is not referenced again in the visible notebook.
claim_data_for_ibnr = spark.sql(f"""
    SELECT 
        DATE_TRUNC('MM', g.loss_date) AS acc_month, 
        GREATEST(DATE_TRUNC('MM', a.observation_year_month),DATE_TRUNC('MM', g.loss_date)) AS obs_month,
        greatest((YEAR(a.observation_year_month) - YEAR(g.loss_date)) * 12 + MONTH(a.observation_year_month) - MONTH(g.loss_date) + 1,1 ) AS dev_month,
        p.ClaimType as claim_type,
        g.ANZO_Super_Class AS product_group,
        CASE WHEN coalesce(g.cell_name, 'Direct') = 'Householders' THEN 'Direct' else coalesce(g.cell_name, 'Direct') END AS channel,
        -- PremClass as premcls,
        SUM(a.new_claims_count) AS claim_count, 
        SUM(a.net_claims_incurred_movement_amount_gst_excl) AS net_claim_incurred,
        SUM(a.gross_claims_incurred_movement_amount_gst_excl) as gross_claim_incurred,
        SUM(a.claim_recoveries_movement_amount_gst_excl) as recoveries
    FROM 
        cds_prd.cds.claim_claim_transactionmonth_financialcounts a 
    LEFT JOIN
        cds_prd.rds.claim_claim_transactiondaily_financialcounts_detail g on a.claim_fkey = g.claim_origin_key
    LEFT JOIN 
        ids_prd.ref.ref_cause_of_loss c ON a.cause_of_loss_fkey = c.origin_key
    LEFT JOIN 
        (select distinct PolicyNumber,ReferenceProductCode_Ext FROM staging_prd.gw.pc_policyperiod) h ON g.Policy_Number = h.PolicyNumber
    LEFT JOIN
        actuarial_onprem_sqlserver.dbo.dim_hh_claim_premcls_18072025 p ON g.Claim_Number = RIGHT(p.CLAIMNO, LEN(p.CLAIMNO) - 1)
    WHERE 
        YEAR(g.loss_date) >= 2017
        AND g.anzo_super_class = '{product}'
        AND a.observation_year_month <= '{latest_balance_date_str}'
        AND g.loss_date <= '{latest_balance_date_str}'
        AND c.incident_description is not null
    GROUP BY 
        all
    ORDER BY 
        all
    """).toPandas()

### 2.2.3 - Claim Data for Adequacy analysis

In [0]:
# Monthly claim movements for the adequacy analysis, pulled from Spark into pandas.
# Same joins and filters as the IBNR extract, but PremClass is selected as
# `premcls`, so the grain is one row per
# (acc_month, obs_month, dev_month, claim_type, product_group, channel, premcls).
#   - acc_month: month of g.loss_date
#   - obs_month: observation month, floored at acc_month
#   - dev_month: months from the loss month to the observation month, plus one, floored at 1
#   - channel:   g.cell_name, with NULL and 'Householders' both mapped to 'Direct'
# `product` and `latest_balance_date_str` are interpolated directly into the SQL text.
# The WHERE clause filters on columns of g and c, so rows of `a` without a match
# in those LEFT JOINs are dropped.
# NOTE(review): no column of `h` is selected or filtered. If a policy number maps to several
# distinct ReferenceProductCode_Ext values this join would repeat claim rows and inflate the
# SUMs — confirm whether the join is still needed.
# NOTE(review): PremClass is unqualified; presumably it comes from table `p` — confirm.
claim_data_for_adequacy_analysis = spark.sql(f"""
    SELECT 
        DATE_TRUNC('MM', g.loss_date) AS acc_month, 
        GREATEST(DATE_TRUNC('MM', a.observation_year_month),DATE_TRUNC('MM', g.loss_date)) AS obs_month,
        greatest((YEAR(a.observation_year_month) - YEAR(g.loss_date)) * 12 + MONTH(a.observation_year_month) - MONTH(g.loss_date) + 1,1 ) AS dev_month,
        p.ClaimType as claim_type,
        g.ANZO_Super_Class AS product_group,
        CASE WHEN coalesce(g.cell_name, 'Direct') = 'Householders' THEN 'Direct' else coalesce(g.cell_name, 'Direct') END AS channel,
        PremClass as premcls,
        SUM(a.new_claims_count) AS claim_count, 
        SUM(a.net_claims_incurred_movement_amount_gst_excl) AS net_claim_incurred,
        SUM(a.gross_claims_incurred_movement_amount_gst_excl) as gross_claim_incurred,
        SUM(a.claim_recoveries_movement_amount_gst_excl) as recoveries
    FROM 
        cds_prd.cds.claim_claim_transactionmonth_financialcounts a 
    LEFT JOIN
        cds_prd.rds.claim_claim_transactiondaily_financialcounts_detail g on a.claim_fkey = g.claim_origin_key
    LEFT JOIN 
        ids_prd.ref.ref_cause_of_loss c ON a.cause_of_loss_fkey = c.origin_key
    LEFT JOIN 
        (select distinct PolicyNumber,ReferenceProductCode_Ext FROM staging_prd.gw.pc_policyperiod) h ON g.Policy_Number = h.PolicyNumber
    LEFT JOIN
        actuarial_onprem_sqlserver.dbo.dim_hh_claim_premcls_18072025 p ON g.Claim_Number = RIGHT(p.CLAIMNO, LEN(p.CLAIMNO) - 1)
    WHERE 
        YEAR(g.loss_date) >= 2017
        AND g.anzo_super_class = '{product}'
        AND a.observation_year_month <= '{latest_balance_date_str}'
        AND g.loss_date <= '{latest_balance_date_str}'
        AND c.incident_description is not null
    GROUP BY 
        all
    ORDER BY 
        all
    """).toPandas()

### 2.2.3 - Exposure Data

In [0]:
# Earned premium and exposure, pulled from Spark into pandas.
# One row per (acc_month, channel, premcls), where acc_month is the month of
# exp_start and raw channel codes are mapped to display names (NULL -> 'Direct').
# dev_month is the constant 1, so a join on dev_month attaches premium and
# exposure only to first-development-month claim rows.
# NOTE(review): this channel mapping differs from the GWP query: 'FI' and 'Direct - Omni' are
# handled only there, 'AUSPOST' and 'KOGAN' only here — confirm whether that is intended.
expo_data = spark.sql(f"""
    select
        DATE_TRUNC('MM', exp_start) AS acc_month,
        CASE 
            WHEN COALESCE(channel, 'Direct') = 'ANZ' THEN 'ANZ'
            WHEN COALESCE(channel, 'Direct') = 'BD' THEN 'Broker Distribution'
            WHEN COALESCE(channel, 'Direct') = 'ELDERS' THEN 'Elders'
            WHEN COALESCE(channel, 'Direct') = 'FIOTHER' THEN 'FI Other'
            WHEN COALESCE(channel, 'Direct') IN ('DIRECT', 'AUSPOST', 'KOGAN', 'Direct') THEN 'Direct'
            ELSE COALESCE(channel, 'Direct')
        END AS channel,
        PREM_CLASS as premcls,
        sum(EARNPREM) as earnprem,
        sum(EXPOSURE) as exposure,
        1 as dev_month
    from actuarial_onprem_sqlserver.dbo.fact_hh_prem_premcls
    where exp_start >= '2017-01-01' AND exp_start <= '{latest_balance_date_str}'
    group by all
""").toPandas()

### 2.2.4 - GWP Data

In [0]:
# Portfolio growth figures, pulled from Spark into pandas.
# One row per (channel, YearQuarter): distinct policy count, row count
# (`risk_count`) and summed premium excluding GST (`GWP`). YearQuarter is the
# policy inception quarter; raw channel codes are mapped to display names
# (NULL -> 'Direct').
# NOTE(review): rows are restricted to flag_quote = 1; the meaning of that flag is not visible
# here — confirm it selects the intended population for written premium.
# NOTE(review): the f-string has no placeholders, so no balance-date cut-off is applied to this extract.
gwp_data = spark.sql(f"""SELECT COUNT(DISTINCT PolicyNumber) AS policy_count,
  COUNT(*) AS risk_count,
  SUM(`Premium Excl. GST`) as GWP,
  CASE 
      WHEN COALESCE(channel, 'Direct') = 'ANZ' THEN 'ANZ'
      WHEN COALESCE(channel, 'Direct') = 'BD' THEN 'Broker Distribution'
      WHEN COALESCE(channel, 'Direct') = 'ELDERS' THEN 'Elders'
      WHEN COALESCE(channel, 'Direct') IN ('FI', 'FIOTHER') THEN 'FI Other'
      WHEN COALESCE(channel, 'Direct') IN ('DIRECT', 'Direct', 'Direct - Omni') THEN 'Direct'
      ELSE COALESCE(channel, 'Direct')
  END AS channel,
  `Policy Inception Quarter` AS YearQuarter
FROM actuarial_onprem_sqlserver.dbo.HH_Portfolio_Growth
WHERE flag_quote = 1
GROUP BY ALL
ORDER BY ALL
""").toPandas()

### 2.2.5 - Collection

In [0]:
# Attach earned premium and exposure to the claim rows.
# Both frames carry `premcls`, so it must be part of the join key. Without it,
# every claim row is repeated once per exposure premium class (overstating
# claims when summed), and pandas splits the column into premcls_x / premcls_y
# so the `premcls` sub-level no longer exists downstream.
# Exposure rows all have dev_month = 1, so only first-development-month claim
# rows receive premium/exposure; the rest are filled with 0 below.
# NOTE(review): assumes the claim PremClass and exposure PREM_CLASS use the same coding and dtype — confirm.
data = pd.merge(
    claim_data_for_adequacy_analysis,
    expo_data,
    on=['acc_month', 'dev_month', 'channel', 'premcls'],
    how='left'
)

# Claim rows without a matching exposure row contribute no premium/exposure
data[['earnprem', 'exposure']] = data[['earnprem', 'exposure']].fillna(0)

# Filter data input: keep the modelled development window and accident months.
# .copy() makes model_data an independent frame, so later column assignments
# do not operate on a slice of `data`.
model_data = data[
    (data['dev_month'] <= development_term) &
    (data['acc_month'] >= acc_month_start)
].copy()

In [0]:
# Inspect the merged columns, e.g. to spot pandas `_x` / `_y` suffixes from overlapping column names
print(data.columns)

In [0]:
# Register (or refresh) this product's inputs in the shared registry, then print a summary to verify.
# Note that the adequacy extract, not claim_data_for_ibnr, is stored as "claim_data".
update_product_config(product_configs, product, input_product_short=product_short, input_claim_data=claim_data_for_adequacy_analysis, input_expo_data=expo_data, input_gwp_data=gwp_data, input_model_data=model_data, input_main_level=main_level, input_sub_levels=sub_levels)
print_product_configs_summary(product_configs)

# 3. Model Template

## 3.1 - Product Configurations

In [0]:
# Optional: uncomment to remove a product from the registry before the model loop runs
# product_configs.pop("Private Motor")
print_product_configs_summary(product_configs)

## 3.2 - Model Loop

In [0]:
# %run "./(Bill) IBNR Modelling Template (Standardised)"

# 4. Diagnostics

In [0]:
# # Read the HTML file content
# with open("/Workspace/Shared/General/IBNR project/ibnr_modelling/temp_output.html", "r") as f:
#     html_content = f.read()

# # Display it in the notebook
# displayHTML(html_content)

# 5. Application Input

In [0]:
# product_configs.items()

In [0]:
# app_config_rows = [
#     {
#         "Product": product,
#         "Table": f"actuaries_prd.general.{config.get('product_short', '').lower()}_ultimates_new",
#         "Main_Level": config.get('main_level', ''),
#         "Sub_Levels": config.get('sub_levels', '')
#     }
#     for product, config in product_configs.items()
# ]

# spark.createDataFrame(app_config_rows) \
#     .write \
#     .format("delta") \
#     .mode("overwrite") \
#     .option("overwriteSchema", "true") \
#     .saveAsTable("actuaries_prd.general.ibnr_product_configs")

# TEST SECTION

In [0]:
def build_models_new(input_data, valuation_date, development_period_end=24, model_triangle_groups=['channel', 'claim_type'], output_triangle_groups=None):
    """
    Fit claim-count and net-incurred reserving models as at one valuation date
    and return the results at the requested output granularity.

    Only rows whose obs_month and acc_month are on or before `valuation_date`
    are used for fitting. Chainladder, Bornhuetter-Ferguson and Cape Cod
    models are fitted for claim counts and for net incurred, and the fitted
    models are handed to apply_models_to_granularity().

    Parameters:
    - input_data: Combined claim and exposure data. Must contain acc_month,
      obs_month, the grouping columns, and the measure columns aggregated below
      (including the *_indexed columns).
    - valuation_date: Valuation date
    - development_period_end: Period where development is assumed complete.
      It is used both as `n_periods` of cl.Development and as the position from
      which development factors are overwritten with 1.
    - model_triangle_groups: Grouping for building the models (less granular)
    - output_triangle_groups: Grouping for final output (more granular, optional);
      defaults to model_triangle_groups.

    Returns:
    - DataFrame with results at the output granularity level

    Side effects:
    - Calls warnings.filterwarnings("ignore"), which silences warnings for the
      rest of the process, not only for this call.
    """
    # Default if output groups are not specified
    if output_triangle_groups is None:
        output_triangle_groups = model_triangle_groups

    # Suppress all warnings
    warnings.filterwarnings("ignore")

    # 1. Data Processing
    # =================================

    # Hide everything observed after the valuation date (back-testing view)
    data_hidden = input_data[
        (input_data['obs_month'] <= valuation_date) &
        (input_data['acc_month'] <= valuation_date)
    ] 

    # Aggregate to model granularity for triangle building
    model_data = data_hidden.groupby(
        model_triangle_groups + ['acc_month', 'obs_month'], 
        as_index=False
    ).agg({
        'claim_count': 'sum',
        'net_claim_incurred': 'sum',
        'gross_claim_incurred': 'sum',
        'net_claim_incurred_indexed': 'sum',
        'gross_claim_incurred_indexed': 'sum',
        'recoveries_indexed': 'sum',
        'earnprem_indexed': 'sum',
        'exposure': 'sum',
        'recoveries': 'sum',
        'earnprem': 'sum'
    })
    
    # Create triangle for model building
    # Input rows are incremental movements; incr_to_cum() turns them into cumulative values
    triangle_combined = cl.Triangle(
        model_data,
        origin="acc_month",
        development="obs_month",
        columns=[
            'claim_count',
            'net_claim_incurred',
            'gross_claim_incurred',
            'net_claim_incurred_indexed',
            'gross_claim_incurred_indexed',
            'recoveries_indexed',
            'earnprem_indexed',
            'exposure',
            'recoveries',
            'earnprem'
        ],
        index=model_triangle_groups,
        cumulative=False
    ).incr_to_cum()

    # Transformations
    # Ratio measures derived from the cumulative triangle (per exposure / per earned premium)
    triangle_combined['frequency'] = triangle_combined['claim_count'] / triangle_combined['exposure']
    triangle_combined['gross_cost_per_policy'] = triangle_combined['gross_claim_incurred'] / triangle_combined['exposure']
    triangle_combined['gross_cost_per_policy_indexed'] = triangle_combined['gross_claim_incurred_indexed'] / triangle_combined['exposure']
    triangle_combined['net_cost_per_policy'] = triangle_combined['net_claim_incurred'] / triangle_combined['exposure']
    triangle_combined['net_cost_per_policy_indexed'] = triangle_combined['net_claim_incurred_indexed'] / triangle_combined['exposure']
    triangle_combined['net_loss_ratio'] = triangle_combined['net_claim_incurred'] / triangle_combined['earnprem']
    triangle_combined['net_loss_ratio_indexed'] = triangle_combined['net_claim_incurred_indexed'] / triangle_combined['earnprem_indexed']

    # 2.1 Model Training - Claim Count
    # =================================

    claim_count_development_factors = cl.Development(n_periods=development_period_end).fit_transform(
        triangle_combined[['claim_count','frequency']]
    )
    # Overwrite the factors from position `development_period_end` onward with 1 (no further development).
    # NOTE(review): the fixed index 0 presumably selects the single origin row of ldf_/cdf_ for every
    # column; verify against the chainladder array layout.
    for i in range(0, claim_count_development_factors.ldf_.values.shape[0]):
        claim_count_development_factors.ldf_.values[i][:,0,development_period_end:] = 1
        claim_count_development_factors.cdf_.values[i][:,0,development_period_end:] = 1

    # Chainladder
    claim_count_chainladder = cl.Chainladder().fit(claim_count_development_factors)
    
    # A-priori frequency: exposure-weighted average of the chainladder ultimate frequency
    # over the last 12 entries along axis 2 (presumably the 12 most recent accident months — TODO confirm)
    weights = triangle_combined['exposure'].latest_diagonal
    weights /= np.sum(weights, axis=2, keepdims=True)
    apriori_claim_count = np.sum(
        (claim_count_chainladder.ultimate_['frequency'] * weights).iloc[:, :, -12:, :], 
        axis=2, keepdims=True
    ) / np.sum(weights.iloc[:, :, -12:, :], axis=2, keepdims=True)

    # Bornhuetter-Ferguson
    # NOTE(review): BF and Cape Cod here are fitted on the raw triangle, whereas the net-incurred
    # models below are fitted on the development-transformed object with the overwritten factors.
    # Confirm the difference is intentional.
    claim_count_bf = cl.BornhuetterFerguson(
        apriori=apriori_claim_count
    ).fit(
        triangle_combined[['claim_count','exposure']], 
        sample_weight=triangle_combined['exposure'].latest_diagonal
    )

    # Cape Cod
    claim_count_cc = cl.CapeCod().fit(
        triangle_combined[['claim_count','exposure']],
        sample_weight=triangle_combined['exposure'].latest_diagonal
    )


    # 2.2 Model Training - Net Incurred
    # =================================

    net_incurred_development_factors = cl.Development(n_periods=development_period_end).fit_transform(
        triangle_combined[['net_claim_incurred','net_cost_per_policy','net_loss_ratio']]
    )
    # Same factor overwrite as for claim counts: no development from `development_period_end` onward
    for i in range(0, net_incurred_development_factors.ldf_.values.shape[0]):
        net_incurred_development_factors.ldf_.values[i][:,0,development_period_end:] = 1
        net_incurred_development_factors.cdf_.values[i][:,0,development_period_end:] = 1

    # Chainladder
    net_incurred_chainladder = cl.Chainladder().fit(net_incurred_development_factors)
    
    # A-priori loss ratio: earned-premium-weighted average of the chainladder ultimate
    # net loss ratio over the last 12 entries along axis 2
    weights = triangle_combined['earnprem'].latest_diagonal
    weights /= np.sum(weights, axis=2, keepdims=True)
    apriori_net_incurred = np.sum(
        (net_incurred_chainladder.ultimate_['net_loss_ratio'] * weights).iloc[:, :, -12:, :], 
        axis=2, keepdims=True
    ) / np.sum(weights.iloc[:, :, -12:, :], axis=2, keepdims=True)

    # Bornhuetter-Ferguson
    net_incurred_bf = cl.BornhuetterFerguson(
        apriori=apriori_net_incurred
    ).fit(
        net_incurred_development_factors, 
        sample_weight=triangle_combined['earnprem'].latest_diagonal
    )

    # Cape Cod
    net_incurred_cc = cl.CapeCod().fit(
        net_incurred_development_factors,
        sample_weight=triangle_combined['earnprem'].latest_diagonal
    )

    # 3. Apply models to output granularity
    # =================================

    # The unfiltered input_data is passed on: the helper re-applies the valuation
    # cut-off itself and also uses the full data for the latest-view comparison
    return apply_models_to_granularity(
        input_data, valuation_date,
        model_triangle_groups, output_triangle_groups,
        claim_count_chainladder, claim_count_bf, claim_count_cc,
        net_incurred_chainladder, net_incurred_bf, net_incurred_cc
    )

In [0]:
def extract_model_results(claim_count_chainladder, claim_count_bf, claim_count_cc,
                         net_incurred_chainladder, net_incurred_bf, net_incurred_cc,
                         model_triangle_groups, debug=False):
    """
    Collect the ultimates of the six fitted models into one wide DataFrame.

    Each model's `ultimate_` is converted to a frame, `origin` is renamed to
    `acc_month`, any `valuation` column is dropped, and the value column is
    renamed to `ultimate_claim_count<suffix>` or `ultimate_net_incurred<suffix>`
    (suffix = _chainladder, _bf, _cc). Net-incurred frames are trimmed to the
    key columns plus the ultimate; claim-count frames keep any other columns
    they carry. All frames are outer-merged on model_triangle_groups + acc_month.

    Parameters:
    - claim_count_* / net_incurred_*: Fitted models exposing `ultimate_.to_frame()`.
    - model_triangle_groups: Grouping columns the models were built on.
    - debug: When True, prints column and shape information at every step.

    Returns:
    - DataFrame with one row per model group / acc_month.

    Raises:
    - ValueError: If no model results could be extracted.
    """
    merge_keys = model_triangle_groups + ['acc_month']
    frames = []

    # Claim count results
    count_models = (
        ('_chainladder', claim_count_chainladder),
        ('_bf', claim_count_bf),
        ('_cc', claim_count_cc),
    )
    for suffix, fitted in count_models:
        frame = fitted.ultimate_.to_frame().reset_index()

        if debug:
            print(f"\nProcessing claim count model{suffix}")
            print(f"Columns before processing: {frame.columns.tolist()}")
            print(f"DataFrame shape: {frame.shape}")

        # rename/drop quietly skip labels that are absent
        frame = frame.rename(columns={
            'origin': 'acc_month',
            'claim_count': f'ultimate_claim_count{suffix}',
        }).drop(columns=['valuation'], errors='ignore')

        if debug:
            print(f"Columns after processing: {frame.columns.tolist()}")

        frames.append(frame)

    # Net incurred results
    incurred_models = (
        ('_chainladder', net_incurred_chainladder),
        ('_bf', net_incurred_bf),
        ('_cc', net_incurred_cc),
    )
    for suffix, fitted in incurred_models:
        frame = fitted.ultimate_.to_frame().reset_index()

        if debug:
            print(f"\nProcessing net incurred model{suffix}")
            print(f"Columns before processing: {frame.columns.tolist()}")
            print(f"DataFrame shape: {frame.shape}")

        frame = frame.rename(columns={'origin': 'acc_month'}).drop(
            columns=['valuation'], errors='ignore'
        )

        # Prefer the exact column name; otherwise fall back to the first
        # column whose name mentions "incurred"
        if 'net_claim_incurred' in frame.columns:
            value_col = 'net_claim_incurred'
        else:
            candidates = [col for col in frame.columns if 'incurred' in col.lower()]
            if not candidates:
                print(f"Warning: No incurred column found for {suffix}")
                print(f"Available columns: {frame.columns.tolist()}")
                continue
            value_col = candidates[0]

        cols_to_keep = model_triangle_groups + ['acc_month'] + [value_col]
        frame = frame[cols_to_keep].rename(columns={
            value_col: f'ultimate_net_incurred{suffix}'
        })

        if debug:
            print(f"Columns after processing: {frame.columns.tolist()}")
            print(f"Final columns to keep: {cols_to_keep}")

        frames.append(frame)

    # Merge all results
    if len(frames) == 0:
        raise ValueError("No model results could be extracted")

    merged = frames[0]
    for i in range(1, len(frames)):
        frame = frames[i]
        if debug:
            print(f"\nMerging result {i+1}")
            print(f"Current result columns: {merged.columns.tolist()}")
            print(f"Merging with columns: {frame.columns.tolist()}")
            print(f"Merge keys: {model_triangle_groups + ['acc_month']}")

        merged = pd.merge(merged, frame, on=merge_keys, how='outer')

    return merged

In [0]:
def apply_proportional_allocation(df, model_groups, output_groups):
    """
    Spread model-level ultimates across finer output rows.

    When output_groups contains columns that model_groups does not, every
    `ultimate_*` column is multiplied by the row's share of its model-level
    group (model_groups + acc_month): claim-count ultimates use the exposure
    share, all other ultimates use the earned-premium share. A share is 0
    where it cannot be computed (e.g. a zero group total).

    Parameters:
    - df: DataFrame at output granularity with ultimates repeated per model group.
    - model_groups: Grouping columns the models were built on.
    - output_groups: Grouping columns of the output.

    Returns:
    - `df` itself when there is nothing to allocate; otherwise a new DataFrame
      without any column ending in `_total` or `_proportion`.
    """
    # Nothing to allocate when the output adds no grouping level
    extra_levels = [level for level in output_groups if level not in model_groups]
    if len(extra_levels) == 0:
        return df

    parent_keys = model_groups + ['acc_month']

    # Share of each row within its model-level group, per allocation basis
    for basis in ('exposure', 'earnprem', 'earnprem_indexed'):
        if basis not in df.columns:
            continue

        total_name = f'{basis}_total'
        share_name = f'{basis}_proportion'

        parent_totals = df.groupby(parent_keys)[basis].sum().reset_index()
        parent_totals.columns = parent_keys + [total_name]

        df = pd.merge(df, parent_totals, on=parent_keys, how='left')
        df[share_name] = (df[basis] / df[total_name]).fillna(0)

    # Scale the ultimates: counts by exposure share, amounts by premium share
    for name in [col for col in df.columns if col.startswith('ultimate_')]:
        share_name = 'exposure_proportion' if 'claim_count' in name else 'earnprem_proportion'
        if share_name in df.columns:
            df[name] = df[name] * df[share_name]

    # Remove the helper columns again
    scratch = [col for col in df.columns if col.endswith('_total') or col.endswith('_proportion')]
    return df.drop(columns=scratch)


In [0]:
def apply_models_to_granularity(input_data, valuation_date,
                               model_triangle_groups, output_triangle_groups,
                               claim_count_chainladder, claim_count_bf, claim_count_cc,
                               net_incurred_chainladder, net_incurred_bf, net_incurred_cc):
    """
    Apply models built at one granularity to output at another granularity.

    Steps:
    1. Extract the ultimates of the fitted models at model granularity.
    2. Aggregate the data known at `valuation_date` to output granularity
       ("reported to date" figures).
    3. Left-join the ultimates onto that base and, when the output grouping is
       a strict superset of the model grouping, allocate them proportionally.
    4. Left-join the latest-view figures computed from the full, unfiltered
       data for comparison, and stamp the valuation date.

    Returns:
    - DataFrame at output granularity with reported-to-date, ultimate and
      latest-view columns plus `valuation_date`.
    """
    model_keys = model_triangle_groups + ['acc_month']
    output_keys = output_triangle_groups + ['acc_month']

    # Get model results at model granularity
    fitted_ultimates = extract_model_results(
        claim_count_chainladder, claim_count_bf, claim_count_cc,
        net_incurred_chainladder, net_incurred_bf, net_incurred_cc,
        model_triangle_groups, debug=False
    )

    # Prepare base data at output granularity: only what was known at the valuation date
    known_at_valuation = (
        (input_data['obs_month'] <= valuation_date) &
        (input_data['acc_month'] <= valuation_date)
    )
    known_rows = input_data[known_at_valuation]

    aggregations = {
        'exposure': 'sum',
        'earnprem': 'sum',
        'earnprem_indexed': 'sum',
        'claim_count': 'sum',
        'net_claim_incurred': 'sum',
        'gross_claim_incurred': 'sum',
        'recoveries': 'sum',
        'net_claim_incurred_indexed': 'sum',
        'gross_claim_incurred_indexed': 'sum',
        'recoveries_indexed': 'sum',
        'product_group': 'first'
    }
    reported_names = {
        'claim_count': 'reported_to_date_claim_count',
        'net_claim_incurred': 'reported_to_date_net_claim_incurred',
        'gross_claim_incurred': 'reported_to_date_gross_claim_incurred',
        'recoveries': 'reported_to_date_recoveries',
        'net_claim_incurred_indexed': 'reported_to_date_net_claim_incurred_indexed',
        'gross_claim_incurred_indexed': 'reported_to_date_gross_claim_incurred_indexed',
        'recoveries_indexed': 'reported_to_date_recoveries_indexed'
    }
    output_base = known_rows.groupby(output_keys, as_index=False).agg(aggregations)
    output_base = output_base.rename(columns=reported_names)

    # Merge with model results
    output_results = output_base.merge(fitted_ultimates, on=model_keys, how='left')

    # Apply proportional allocation if output is more granular (strict superset of the model groups)
    if set(model_triangle_groups) < set(output_triangle_groups):
        output_results = apply_proportional_allocation(output_results, model_triangle_groups, output_triangle_groups)

    # Add full triangle results for comparison (computed on a copy of the unfiltered data)
    full_results = get_full_triangle_results(input_data.copy(), output_triangle_groups)
    output_results = output_results.merge(full_results, on=output_keys, how='left')

    output_results['valuation_date'] = pd.to_datetime(valuation_date)

    return output_results


In [0]:
def get_full_triangle_results(data_full, triangle_groups):
    """
    Get actual results from the full triangle for comparison.

    Builds a cumulative triangle from all rows of `data_full` (no valuation
    cut-off) and returns its latest diagonal as a flat frame.

    Parameters:
    - data_full: Data with acc_month, obs_month, the grouping columns and the
      incremental measure columns listed below.
    - triangle_groups: Grouping columns used as the triangle index.

    Returns:
    - DataFrame with the grouping columns, `acc_month`, and one
      `latest_view_<measure>` column per measure.
    """
    measures = [
        'claim_count',
        'net_claim_incurred',
        'gross_claim_incurred',
        'net_claim_incurred_indexed',
        'gross_claim_incurred_indexed',
        'recoveries_indexed',
    ]

    # Output names: origin -> acc_month, every measure gets a latest_view_ prefix
    output_names = {'origin': 'acc_month'}
    for measure in measures:
        output_names[measure] = f'latest_view_{measure}'

    # Rows are incremental movements, so accumulate them before reading the diagonal
    cumulative = cl.Triangle(
        data_full,
        origin='acc_month',
        development='obs_month',
        columns=measures,
        index=triangle_groups,
        cumulative=False,
    ).incr_to_cum()

    diagonal = cumulative.latest_diagonal.to_frame().reset_index()
    diagonal = diagonal.drop(columns=['valuation'])
    return diagonal.rename(columns=output_names)

In [0]:
def prepare_data(claim_data, exposure_data):
    """
    Combine claim and exposure data into one frame for modelling.

    Date columns are converted to datetime on derived frames, so the callers'
    DataFrames are left unmodified. The frames are left-joined on acc_month,
    dev_month and every optional key (channel, claim_type, premcls, usage)
    present in BOTH inputs. Claim rows without matching exposure get
    exposure = 0 and earnprem = 0.

    Parameters:
    - claim_data: Claim DataFrame with at least acc_month, obs_month, dev_month.
    - exposure_data: Exposure DataFrame with at least acc_month, dev_month,
      exposure, earnprem.

    Returns:
    - DataFrame with one row per claim row, plus the exposure columns.

    Side effects:
    - Prints the merge keys that were used.
    """
    # Ensure date columns are datetime without writing into the callers' frames.
    # assign() returns a new frame and keeps the original column order.
    claims = claim_data.assign(
        acc_month=pd.to_datetime(claim_data['acc_month']),
        obs_month=pd.to_datetime(claim_data['obs_month']),
    )
    exposure = exposure_data.assign(
        acc_month=pd.to_datetime(exposure_data['acc_month'])
    )

    # Mandatory keys first, then every optional key both frames share.
    # Omitting a shared key would repeat claim rows once per value of that key.
    potential_keys = ['channel', 'claim_type', 'premcls', 'usage']  # Adjust for all possible column names
    merge_keys = ['acc_month', 'dev_month'] + [
        key for key in potential_keys
        if key in claims.columns and key in exposure.columns
    ]

    print(f"Merging on keys: {merge_keys}")

    # Merge claim and exposure data
    combined_data = pd.merge(
        claims,
        exposure,
        on=merge_keys,
        how='left'
    )

    # Fill missing exposure values with 0 (or handle as appropriate)
    combined_data['exposure'] = combined_data['exposure'].fillna(0)
    combined_data['earnprem'] = combined_data['earnprem'].fillna(0)

    return combined_data

In [0]:
def parallel_runs(input_data, valuation_dates, development_period_end, model_triangle_groups=['channel', 'claim_type'], output_triangle_groups=['channel', 'claim_type', 'premcls']):
    """
    Run build_models_new for every valuation date in parallel and stack the results.

    Parameters:
    - input_data: Combined (indexed) claim and exposure data passed to build_models_new.
    - valuation_dates: Iterable of valuation dates; one model run per date.
    - development_period_end: Forwarded to build_models_new.
    - model_triangle_groups: Grouping the models are built at.
    - output_triangle_groups: Grouping the results are reported at.
      The default lists are never mutated here.

    Returns:
    - DataFrame with the results of all valuation dates concatenated.

    Raises:
    - ValueError: From pd.concat when `valuation_dates` is empty.
    """
    # Run in parallel on all available cores. tqdm wraps the task generator, so
    # the bar advances as tasks are dispatched rather than as they complete.
    valuation_data = Parallel(n_jobs=-1)(
        delayed(build_models_new)(
            input_data,
            date,
            development_period_end,
            model_triangle_groups,  # Build models at this level
            output_triangle_groups  # Apply at this level
        )
        for date in tqdm(valuation_dates, desc="Processing valuation dates")
    )

    # Combine results into a single DataFrame
    return pd.concat(valuation_data, ignore_index=True)


# Example call. It used to sit indented beneath the function's `return`, where
# it was unreachable and never ran. It stays commented out because
# `curr_indexed_model_data` is not defined at this point of the notebook;
# uncomment it in a cell that runs after that frame has been built.
# combined_df = parallel_runs(input_data=curr_indexed_model_data
#                             , valuation_dates=valuation_dates
#                             , development_period_end=development_term
#                             , model_triangle_groups=['channel', 'claim_type']
#                             , output_triangle_groups=['channel', 'claim_type', 'premcls']) # parameterise


In [0]:
def indexation(data, cpi, indexation_date, join_column, indexation_columns, suffix='_indexed', unindex=False):
    """
    Apply indexation (e.g., CPI adjustment) to specified columns in a dataset based on a reference date.

    Each row is matched to the CPI entry whose `quarter` is nearest to the row's
    `join_column` value, and to the CPI entry nearest to `indexation_date`.
    The factor is cpi(indexation_date) / cpi(row) when indexing, and the
    reciprocal when unindexing.

    Parameters:
    - data (pd.DataFrame): Input DataFrame containing the data to be indexed. It is not modified.
    - cpi (pd.DataFrame): CPI series with a `quarter` column (same date-like dtype as
      `data[join_column]`, as pandas.merge_asof requires) and a `cpi` column.
      Any other columns are ignored.
    - indexation_date (str or pd.Timestamp): Date used as the reference for indexation (e.g., valuation date).
    - join_column (str): Column in `data` representing time (e.g., transaction date) to merge with CPI data.
    - indexation_columns (list): List of column names in `data` to apply indexation to.
    - suffix (str, optional): Suffix to append to indexed column names. Defaults to '_indexed'.
    - unindex (bool, optional): If True, reverses indexation (divides by factor); if False, applies it (multiplies).
                                Defaults to False.

    Returns:
    - pd.DataFrame: Copy of `data` with the new suffixed columns and all temporary
      columns removed. Rows come back sorted by the merge keys with a fresh
      index, not in the input order.

    Raises:
    - ValueError: If columns with the `suffix` already exist in `data`.
    - Warning: If `unindex=True` and `suffix='_indexed'` (to avoid naming confusion).
    """

    # Create new column names by appending the suffix to the original column names
    indexed_names = [col + suffix for col in indexation_columns]

    # Check if any of the new column names already exist in the DataFrame
    for col in indexed_names:
        if col in data.columns:
            raise ValueError(f"Column {col} already exists in data.")

    # Fail fast: reject the confusing unindex + '_indexed' combination before
    # doing any merge work (the same exception used to be raised afterwards)
    if unindex and suffix == '_indexed':
        raise Warning("Unindexing is enabled. Please change the suffix from '_indexed' to avoid confusion")

    # Keep only the two CPI columns that are needed. Extra columns (e.g. a raw
    # Date column) would otherwise leak into the result, and after the second
    # merge collide into `_x` / `_y` duplicates.
    cpi_lookup = cpi[['quarter', 'cpi']].sort_values('quarter')

    # Create a copy of the input DataFrame to avoid modifying the original
    data = data.copy()

    # Add a column for the indexation reference date (converted to datetime)
    data['cpi_valuation_quarter'] = pd.to_datetime(indexation_date)

    # CPI at the time of each row: nearest CPI quarter to join_column
    data = pd.merge_asof(
        data.sort_values(join_column),
        cpi_lookup,
        left_on=join_column,
        right_on='quarter',
        direction='nearest'
    ).drop(columns=['quarter']).rename(columns={'cpi': 'cpi_txn'})

    # CPI at the indexation date: nearest CPI quarter to the reference date
    data = pd.merge_asof(
        data.sort_values('cpi_valuation_quarter'),
        cpi_lookup,
        left_on='cpi_valuation_quarter',
        right_on='quarter',
        direction='nearest'
    ).drop(columns=['quarter']).rename(columns={'cpi': 'cpi_valuation'})

    # Calculate the indexation factor based on whether we're indexing or unindexing
    if unindex:
        # Reverse adjustment: bring indexed values back to transaction-date terms
        data['indexation_factor'] = data['cpi_txn'] / data['cpi_valuation']
    else:
        # Standard adjustment: bring transaction-date values to indexation-date terms
        data['indexation_factor'] = data['cpi_valuation'] / data['cpi_txn']

    # Apply the indexation factor row-wise to the specified columns and create new columns with the suffix
    data[indexed_names] = data[indexation_columns].multiply(data['indexation_factor'], axis=0)

    # Drop the temporary columns used for the calculation
    data = data.drop(columns=['cpi_txn', 'cpi_valuation', 'cpi_valuation_quarter', 'indexation_factor'])

    # Return the modified DataFrame
    return data

In [0]:
def reshape_forecast_output(
    df: pd.DataFrame,
    response_prefixes: dict = None,
    id_columns: list = None
) -> pd.DataFrame:
    """
    Reshape a wide table of ultimate estimates into long format.

    Every column of `df` whose name starts with a response prefix is treated as one
    model's ultimate estimate; the model label is the text after the last underscore
    of the column name. For each such column one copy of the rows is emitted with:

        actual    = <actual_col>   - <reported_col>
        predicted = <ultimate col> - <reported_col>

    Parameters:
    - df: Input DataFrame (not modified).
    - response_prefixes: Dictionary mapping a column-name prefix to its metadata:
        - response_type: Label written to the 'response' column ('count', 'net_incurred', ...)
        - actual_col: Column name representing the actual value
        - reported_col: Column name representing the reported-to-date value
      Defaults to the claim-count and net-incurred configuration.
    - id_columns: Identifying columns to retain. Any sequence of column names is accepted.
      Defaults to ['acc_month', 'valuation_date', 'channel', 'claim_type'].

    Returns:
    - A DataFrame with columns: id_columns + ['model', 'actual', 'predicted',
      'latest_view', 'reported_to_date', 'response'], where 'latest_view' and
      'reported_to_date' are the renamed actual_col / reported_col values.
      An empty frame with those columns is returned when nothing could be reshaped
      (e.g. the required columns or any id column are missing from `df`).
    """
    # Defaults are created per call so that no mutable object is shared between calls.
    if response_prefixes is None:
        response_prefixes = {
            'ultimate_claim_count': {
                'response_type': 'count',
                'actual_col': 'latest_view_claim_count',
                'reported_col': 'reported_to_date_claim_count'
            },
            'ultimate_net_incurred': {
                'response_type': 'net_incurred',
                'actual_col': 'latest_view_net_claim_incurred',
                'reported_col': 'reported_to_date_net_claim_incurred'
            }
        }
    if id_columns is None:
        id_columns = ['acc_month', 'valuation_date', 'channel', 'claim_type']
    else:
        # Accept tuples / Index objects as well as lists.
        id_columns = list(id_columns)

    output_columns = id_columns + ['model', 'actual', 'predicted', 'latest_view', 'reported_to_date', 'response']

    # The id columns are the same for every slice, so check them once.
    ids_present = all(col in df.columns for col in id_columns)

    result_dfs = []

    # Process each response prefix type
    for response_prefix, config in response_prefixes.items():
        response_type = config['response_type']
        actual_col = config['actual_col']
        reported_col = config['reported_col']

        # Skip if required columns are not in the DataFrame
        if actual_col not in df.columns or reported_col not in df.columns:
            continue
        if not ids_present:
            continue

        # Identify ultimate columns for the given response prefix
        ultimate_cols = [col for col in df.columns if col.startswith(response_prefix)]

        for col_name in ultimate_cols:
            model_df = df[id_columns + [col_name, actual_col, reported_col]].copy()

            # Model label is the last underscore-separated token of the column name
            model_df['model'] = col_name.split('_')[-1]
            model_df['actual'] = model_df[actual_col] - model_df[reported_col]
            model_df['predicted'] = model_df[col_name] - model_df[reported_col]
            model_df['response'] = response_type

            # Rename to standardised names so every response type shares one schema,
            # and drop the original ultimate column
            model_df = model_df.rename(columns={
                actual_col: 'latest_view',
                reported_col: 'reported_to_date'
            }).drop(columns=[col_name])
            result_dfs.append(model_df)

    # If no results were found, return empty DataFrame with correct columns
    if not result_dfs:
        return pd.DataFrame(columns=output_columns)

    # Combine all results and enforce a consistent column order
    return pd.concat(result_dfs, ignore_index=True)[output_columns]

In [0]:
def calculate_mae_excl_last_6_months(data, actual_col, pred_cols, groupby_cols, date_col='valuation_date', exclude_months=6):
    """
    Score prediction columns by mean absolute error (MAE) per group, ignoring the
    most recent valuation dates, and report the lowest-MAE column for each group.

    Within each group the actual and predicted columns are first summed per
    valuation date; dates later than (latest date in the group - `exclude_months`
    months) are then dropped before the MAE is taken.

    Parameters:
    - data: Input DataFrame (not modified).
    - actual_col: Column holding the actual values.
    - pred_cols: Prediction columns to score against `actual_col`.
    - groupby_cols: Column name or list of column names defining the groups.
    - date_col: Column holding the valuation date (anything pd.to_datetime accepts).
    - exclude_months: Number of trailing months to leave out of the score (default 6).

    Returns:
    - DataFrame with one row per group and columns:
      groupby_cols + ['mae_<pred col>', ...] + ['best_model'].
      The same columns are returned (with no rows) when there is nothing to score.
      When a group has no valuation date on or before the cutoff every MAE is NaN
      and 'best_model' falls back to the first entry of `pred_cols`.
    """
    # A bare column name would otherwise be zipped character-by-character below.
    groupby_cols = [groupby_cols] if isinstance(groupby_cols, str) else list(groupby_cols)
    pred_cols = list(dict.fromkeys(pred_cols))
    result_columns = groupby_cols + [f'mae_{col}' for col in pred_cols] + ['best_model']

    results = []

    for keys, group in data.groupby(groupby_cols):
        # Parse the dates into their own Series instead of writing back into the
        # group slice, which would trigger chained-assignment warnings.
        valuation = pd.to_datetime(group[date_col])

        # Aggregate actual and predicted per valuation_date
        agg_df = pd.DataFrame({'valuation_date': valuation})
        for col in [actual_col] + pred_cols:
            agg_df[col] = group[col]
        agg_df = agg_df.groupby('valuation_date').sum().reset_index()

        # Keep only valuation dates old enough to have been observed for `exclude_months`
        cutoff_date = agg_df['valuation_date'].max() - pd.DateOffset(months=exclude_months)
        filtered_df = agg_df[agg_df['valuation_date'] <= cutoff_date]

        # Compute MAE per model
        mae_dict = {
            col: np.abs(filtered_df[actual_col] - filtered_df[col]).mean()
            for col in pred_cols
        }

        if not mae_dict:
            continue  # Skip if no models were evaluated

        # Find best model (first key wins ties and the all-NaN case)
        best_model = min(mae_dict, key=mae_dict.get)

        # Save results
        key_values = keys if isinstance(keys, tuple) else [keys]
        results.append({
            **dict(zip(groupby_cols, key_values)),
            **{f'mae_{col}': val for col, val in mae_dict.items()},
            'best_model': best_model
        })

    return pd.DataFrame(results, columns=result_columns)


In [0]:
def get_best_method(input_df, metric, groupby_cols):
    """
    Select, for each group, the ultimate-estimate column with the lowest MAE.

    Scores the 'ultimate_<metric>_chainladder', '..._bf' and '..._cc' columns of
    `input_df` against the latest-view column via calculate_mae_excl_last_6_months.

    Parameters:
    - input_df: DataFrame holding the ultimate and latest-view columns plus 'valuation_date'.
    - metric: 'claim_count' selects the claim-count columns; any other value is
      treated as net incurred.
    - groupby_cols: Columns defining the groups to score.

    Returns:
    - The scoring table with two extra columns: 'response' ('count' or
      'net_incurred') and 'model' (text after the last underscore of 'best_model').
    """
    counting = metric == 'claim_count'

    # Candidate columns, one per method, in the order they are scored.
    candidate_cols = ['ultimate_' + metric + '_' + method for method in ('chainladder', 'bf', 'cc')]

    ranking = calculate_mae_excl_last_6_months(
        input_df,
        actual_col='latest_view_claim_count' if counting else 'latest_view_net_claim_incurred',
        pred_cols=candidate_cols,
        groupby_cols=groupby_cols,
        date_col='valuation_date'
    )

    ranking['response'] = 'count' if counting else 'net_incurred'
    ranking['model'] = ranking['best_model'].str.split("_").str[-1]

    return ranking

In [0]:
# Per-product pipeline, steps 1-5: read the product's configuration, index the monetary
# columns to `last_day_previous_month`, run `parallel_runs` over `valuation_dates`,
# reshape the output to long format and keep, per segment, the rows of the
# lowest-MAE model at the latest valuation date.
# NOTE(review): the cells that follow (steps 6-8) are at top level rather than inside
# this loop, so they only see the variables left by the final iteration — confirm intended.
for i, product in enumerate(product_configs, start=1):
    print(f"\n=== [{i}/{len(product_configs)}] Processing Product: {product} ===")

    # 1. Setup
    # =================================
    print("     [1/8] Retrieving Configurations")
    # NOTE(review): curr_product_short is only referenced by the commented-out step 8 and
    # curr_model_data is not referenced again in the code visible here.
    curr_product_short = product_configs[product]["product_short"]
    curr_claim_data = product_configs[product]["claim_data"]
    curr_expo_data = product_configs[product]["expo_data"]
    curr_model_data = product_configs[product]["model_data"]
    # curr_aggLevels = [product_configs[product]["main_level"]] + product_configs[product]["sub_levels"]
    # Aggregation levels are currently fixed rather than taken from the product config (see line above).
    curr_aggLevels = ['channel', 'claim_type', 'premcls']

    # 2. Indexation
    # =================================
    print("     [2/8] Column Indexation")
    # NOTE(review): `claim_data_for_adequacy_analysis` and `expo_data` are not assigned in this
    # loop, so every product is modelled on the same inputs — presumably these should be the
    # per-product frames retrieved in step 1; confirm.
    combined_data_adequacy = prepare_data(claim_data_for_adequacy_analysis, expo_data)
    cols_to_index = ['earnprem', 'net_claim_incurred', 'gross_claim_incurred', 'recoveries']
    # Work on a float copy so the indexation factor can be applied to these columns.
    temp_df1 = combined_data_adequacy.copy()
    temp_df1[cols_to_index] = temp_df1[cols_to_index].astype(float)
    
    # Read cpi file and format dataframe
    # NOTE(review): the file and the parsing do not depend on `product`, yet the CSV is re-read
    # on every iteration; hoist only after confirming `indexation` does not modify `cpi` in place.
    cpi_df = pd.read_csv(cpi_file_path)
    cpi_df['quarter'] = pd.to_datetime(cpi_df['quarter'], format="%d/%m/%Y")

    curr_indexed_model_data = indexation(temp_df1, cpi=cpi_df, indexation_date=last_day_previous_month, join_column='obs_month', indexation_columns=cols_to_index) 

    # 3. Modelling
    # =================================
    print("     [3/8] Modelling")
    # output_triangle_groups repeats the literal held in curr_aggLevels (flagged "parameterise").
    combined_df = parallel_runs(input_data=curr_indexed_model_data
                                , valuation_dates=valuation_dates
                                , development_period_end=development_term
                                , model_triangle_groups=['channel', 'claim_type']
                                , output_triangle_groups=['channel', 'claim_type', 'premcls']) # parameterise

    # 4. Reshape Output
    # =================================
    print("     [4/8] Reshaping Output")
    # Wide ultimate columns -> one row per (id columns, model, response).
    result = reshape_forecast_output(
        df=combined_df,
        response_prefixes={
            'ultimate_claim_count': {
                'response_type': 'count',
                'actual_col': 'latest_view_claim_count',
                'reported_col': 'reported_to_date_claim_count'
            },
            'ultimate_net_incurred': {
                'response_type': 'net_incurred',
                'actual_col': 'latest_view_net_claim_incurred',
                'reported_col': 'reported_to_date_net_claim_incurred'
            }
        },
        id_columns=['acc_month', 'valuation_date'] + curr_aggLevels
    )

    # 5. Best Method Selection
    # =================================
    print("     [5/8] Best Method Selection")
    # One scoring table per response type, each carrying 'model' and 'response' columns.
    claim_count_best_method = get_best_method(combined_df, 'claim_count', curr_aggLevels)
    net_incurred_best_method = get_best_method(combined_df, 'net_incurred', curr_aggLevels)

    # Union best method dataframes to be used for subsetting
    best_method_union = pd.concat([claim_count_best_method, net_incurred_best_method]).drop_duplicates().reset_index(drop=True)

    # Get best model results
    # The inner join keeps only the rows of `result` whose (segment, model, response) was selected.
    bm_results = result.merge(
        best_method_union[curr_aggLevels + ['model', 'response']],
        on=curr_aggLevels + ['model', 'response'],
        how='inner'
    )

    # Retrieve latest valuation date for selected models
    # NOTE(review): `.max()` is chronological only if valuation_date is a datetime or an
    # ISO 'YYYY-MM-DD' string — presumably the latter, as `valuation_dates` is built that way; verify.

    bm_results = bm_results[bm_results['valuation_date'] == bm_results['valuation_date'].max()]

    

In [0]:
# Inspect the best-model rows; this shows the value left by the last iteration of the product loop.
bm_results

In [0]:
# 6. Transformations
# =================================
# Combines reported claims, the selected model predictions and exposure into one table
# per accident month / segment, then spreads the two response types into columns.
# NOTE(review): this cell runs at top level, so `product`, `curr_claim_data`, `curr_expo_data`,
# `curr_aggLevels` and `bm_results` are whatever the last loop iteration left behind.
print("     [6/8] Transformations")
# Consolidation
# Outer join: keeps accident month / segment combinations present on either side.
ultimates_pre = pd.merge(
    curr_claim_data.groupby(['acc_month'] + curr_aggLevels)[['claim_count', 'net_claim_incurred']].sum().reset_index(),
    bm_results,
    on=['acc_month'] + curr_aggLevels,
    how='outer'
)

# Merge with expo_data
# No `how` is given, so this is pandas' default inner join: rows without matching exposure are dropped.
ultimates_pre = pd.merge(ultimates_pre, curr_expo_data, on=['acc_month', 'channel', 'premcls']) # - TEMPORARY, USE CHANNEL ONLY, expo data should match granularity of aggregation

# Fill missing values and convert to int64 for relevant columns
# NOTE(review): the int64 cast discards any fractional part of the amounts and predictions — confirm intended.
columns_to_fix = ['net_claim_incurred', 'claim_count', 'predicted',  'exposure']
ultimates_pre[columns_to_fix] = ultimates_pre[columns_to_fix].fillna(0).astype('int64')
ultimates_pre['product'] = product

# Pivoting
# 'earnprem' and 'exposure' are presumably supplied by curr_expo_data — verify against its schema.
pivot_indexes = ['acc_month', 'product', 'claim_count', 'net_claim_incurred', 'earnprem', 'exposure'] + curr_aggLevels

# Pivoting predicted values
# No aggfunc is given, so duplicate index/response combinations are averaged (pivot_table default).
# NOTE(review): rows that came out of the outer merge without a `response` have no pivot column
# to land in and look like they drop out here despite the fillna(0) above — confirm.
predicted_pivot = ultimates_pre.pivot_table(index=pivot_indexes, 
                                columns='response', 
                                values='predicted').reset_index()
predicted_pivot.columns.name = None
predicted_pivot = predicted_pivot.rename(columns={
    'count': 'ibnr_count',
    'net_incurred': 'ibnr_incurred'
})

# Pivoting model values
# 'model' is a label, so the first value per index/response combination is taken.
model_pivot = ultimates_pre.pivot_table(index=pivot_indexes, 
                            columns='response', 
                            values='model', aggfunc='first').reset_index()
model_pivot.columns.name = None
model_pivot = model_pivot.rename(columns={
    'count': 'count_model',
    'net_incurred': 'incurred_model'
})

# Merge the two pivoted DataFrames
ultimates_df = pd.merge(predicted_pivot, model_pivot, on=pivot_indexes)

# Create ultimate count and ultimate incurred columns
# Ultimate = reported to date + predicted remainder; NaN wherever a response type has no prediction.
ultimates_df['ultimate_count'] = ultimates_df['claim_count'] + ultimates_df['ibnr_count']
ultimates_df['ultimate_incurred'] = ultimates_df['net_claim_incurred'] + ultimates_df['ibnr_incurred']


In [0]:
# Render the consolidated ultimates table built in step 6 for inspection.
display(ultimates_df)

In [0]:



# 7. Inflation Adjustment
# =================================
# Restates incurred amounts at the CPI level of a single base quarter:
#   adjusted = amount * base_cpi / cpi(accident quarter)
# NOTE(review): this cell runs at top level, so `ultimates_df` is the table built from the
# last product processed by the loop.
print("     [7/8] Inflation Adjustment")
temp_df2 = ultimates_df.copy()

# Add Quarter to Ultimates Table
# Quarterly periods render as 'YYYYQn', the same shape as the `quarter` key of cpi_by_quarter
# (CONCAT(YEAR, 'Q', QUARTER)), so no further string clean-up is needed.
temp_df2['quarter'] = temp_df2['acc_month'].dt.to_period('Q').astype(str)

# CPI
# Left join: accident quarters with no CPI entry keep their rows and get NaN adjusted values.
ultimates_with_cpi = temp_df2.merge(cpi_by_quarter[['quarter', 'cpi']], on='quarter', how='left')

# Base CPI
quarters_in_data = sorted(ultimates_with_cpi['quarter'].unique())
if len(quarters_in_data) < 2:
    raise ValueError(
        f"Need at least two accident quarters to pick a base CPI quarter, found {len(quarters_in_data)}"
    )
base_cpi_quarter = quarters_in_data[-2] # Get second-last quarter in dataset

base_cpi_matches = cpi_by_quarter.loc[cpi_by_quarter['quarter'] == base_cpi_quarter, 'cpi']
if base_cpi_matches.empty:
    raise ValueError(f"No CPI value available for base quarter {base_cpi_quarter}")
base_cpi_value = base_cpi_matches.values[0]
ultimates_with_cpi['base_cpi'] = base_cpi_value

# Index Multiplier
ultimates_with_cpi['index_multiplier'] = ultimates_with_cpi['base_cpi'] / ultimates_with_cpi['cpi']

# Adjusted Incurreds
ultimates_with_cpi['adj_ultimate_incurred'] = ultimates_with_cpi['ultimate_incurred'] * ultimates_with_cpi['index_multiplier']
ultimates_with_cpi['adj_net_claim_incurred'] = ultimates_with_cpi['net_claim_incurred'] * ultimates_with_cpi['index_multiplier']

    # # 8. Output to Table
    # # =================================
    # print("     [8/8] Writing to databricks table")
    # spark.createDataFrame(ultimates_with_cpi) \
    #     .write \
    #     .format("delta") \
    #     .mode("overwrite") \
    #     .option("overwriteSchema", "true") \
    #     .saveAsTable("actuaries_prd.general." + curr_product_short.lower() + "_ultimates_new")

    # print(f"         - Saved to: actuaries_prd.general.{curr_product_short.lower()}_ultimates_new")

In [0]:
# Render the CPI-adjusted ultimates table built in step 7 for inspection.
display(ultimates_with_cpi)