# Aggregation

In [1]:
import pandas as pd

# load data
data_klaim = pd.read_csv("/kaggle/input/datasets/fathinahnurjannah/mcf-dsc-itb-cleaned/data_klaim_cleaned.csv", parse_dates=["tanggal_pembayaran_klaim"])

# --- agregasi dasar ---
klaim_agg = (
    data_klaim
    .groupby("nomor_polis")
    .agg(
        total_klaim = ("nominal_klaim_yang_disetujui", "sum"),
        rata_rata_klaim = ("nominal_klaim_yang_disetujui", "mean"),
        frekuensi_klaim = ("nominal_klaim_yang_disetujui", "count"),
        terakhir_klaim = ("tanggal_pembayaran_klaim", "max"),
        perdana_klaim = ("tanggal_pembayaran_klaim", "min")
    )
    .reset_index()
)

cutoff_date = pd.Timestamp("2025-07-31")

klaim_agg["hari_sejak_klaim_terakhir"] = (
    cutoff_date - klaim_agg["terakhir_klaim"]
).dt.days


print(klaim_agg.head())

  nomor_polis   total_klaim  rata_rata_klaim  frekuensi_klaim terakhir_klaim  \
0    POL-0003  1.413816e+07     1.413816e+07                1     2024-10-31   
1    POL-0006  1.786125e+08     8.930625e+07                2     2024-12-13   
2    POL-0010  6.497528e+07     3.248764e+07                2     2024-02-27   
3    POL-0011  3.597431e+08     1.798715e+08                2     2025-08-27   
4    POL-0013  1.569392e+08     1.569392e+08                1     2025-07-24   

  perdana_klaim  hari_sejak_klaim_terakhir  
0    2024-10-31                        273  
1    2024-11-13                        230  
2    2024-02-27                        520  
3    2024-12-13                        -27  
4    2025-07-24                          7  


In [4]:
data_polis = pd.read_csv("/kaggle/input/datasets/fathinahnurjannah/mcf-dsc-itb-cleaned/data_polis_cleaned.csv")

df_model = data_polis.merge(
    klaim_agg,
    on="nomor_polis",
    how="left"
)

# isi NaN karena polis tanpa klaim
df_model[["total_klaim","rata_rata_klaim","frekuensi_klaim"]] = \
    df_model[["total_klaim","rata_rata_klaim","frekuensi_klaim"]].fillna(0) 