In [1]:
import pandas as pd

df_clean = pd.read_csv("../data/processed/claims_cleaned.csv")
df_clean.head()


Unnamed: 0,insured_id,date_of_birth,claim_start_date,age_at_claim,annual_salary,claim_duration_days,daily_benefit
0,A001,1978-04-12,2023-03-12,44.914442,45000,30.0,110
1,A002,1984-11-03,2023-02-05,38.255989,38000,15.0,90
2,A004,1992-07-21,2023-06-01,30.861054,52000,10.0,130
3,A006,1970-12-30,2023-09-10,52.695414,41000,40.0,105
4,A007,2005-01-01,2023-04-20,18.297057,22000,5.0,60


In [2]:
# Construction de la base “pricing” par assuré

pricing_base = (
    df_clean
    .groupby("insured_id")
    .agg(
        nb_claims=("claim_start_date", "count"),
        total_days=("claim_duration_days", "sum"),
        avg_daily_benefit=("daily_benefit", "mean"),
        annual_salary=("annual_salary", "first"),
        avg_age=("age_at_claim", "mean")
    )
    .reset_index()
)

pricing_base["exposure_years"] = 1.0  # simplification V1


Unnamed: 0,insured_id,nb_claims,total_days,avg_daily_benefit,annual_salary,avg_age,exposure_years
0,A001,1,30.0,110.0,45000,44.914442,1.0
1,A002,1,15.0,90.0,38000,38.255989,1.0
2,A004,1,10.0,130.0,52000,30.861054,1.0
3,A006,1,40.0,105.0,41000,52.695414,1.0
4,A007,1,5.0,60.0,22000,18.297057,1.0


In [3]:
# Calcul de la prime pure (ITT V1)

pricing_base["frequency"] = pricing_base["nb_claims"] / pricing_base["exposure_years"]
pricing_base["avg_duration"] = pricing_base["total_days"] / pricing_base["nb_claims"]
pricing_base["avg_claim_cost"] = pricing_base["avg_duration"] * pricing_base["avg_daily_benefit"]
pricing_base["pure_premium"] = pricing_base["frequency"] * pricing_base["avg_claim_cost"]

pricing_base[[
    "insured_id",
    "frequency",
    "avg_duration",
    "avg_daily_benefit",
    "avg_claim_cost",
    "pure_premium"
]]


Unnamed: 0,insured_id,frequency,avg_duration,avg_daily_benefit,avg_claim_cost,pure_premium
0,A001,1.0,30.0,110.0,3300.0,3300.0
1,A002,1.0,15.0,90.0,1350.0,1350.0
2,A004,1.0,10.0,130.0,1300.0,1300.0
3,A006,1.0,40.0,105.0,4200.0,4200.0
4,A007,1.0,5.0,60.0,300.0,300.0


In [5]:
# Sensitive checks
# Statistiques

pricing_base[[
    "frequency","avg_duration","avg_daily_benefit","avg_claim_cost","pure_premium"
]].describe()

Unnamed: 0,frequency,avg_duration,avg_daily_benefit,avg_claim_cost,pure_premium
count,5.0,5.0,5.0,5.0,5.0
mean,1.0,20.0,99.0,2090.0,2090.0
std,0.0,14.57738,26.07681,1604.057356,1604.057356
min,1.0,5.0,60.0,300.0,300.0
25%,1.0,10.0,90.0,1300.0,1300.0
50%,1.0,15.0,105.0,1350.0,1350.0
75%,1.0,30.0,110.0,3300.0,3300.0
max,1.0,40.0,130.0,4200.0,4200.0


In [6]:
# Sensitive checks
# Top primes (detection d'extremes)

pricing_base.sort_values("pure_premium", ascending=False).head(10)

Unnamed: 0,insured_id,nb_claims,total_days,avg_daily_benefit,annual_salary,avg_age,exposure_years,frequency,avg_duration,avg_claim_cost,pure_premium
3,A006,1,40.0,105.0,41000,52.695414,1.0,1.0,40.0,4200.0,4200.0
0,A001,1,30.0,110.0,45000,44.914442,1.0,1.0,30.0,3300.0,3300.0
1,A002,1,15.0,90.0,38000,38.255989,1.0,1.0,15.0,1350.0,1350.0
2,A004,1,10.0,130.0,52000,30.861054,1.0,1.0,10.0,1300.0,1300.0
4,A007,1,5.0,60.0,22000,18.297057,1.0,1.0,5.0,300.0,300.0


In [7]:
# Sensitive checks
# Verif divisions / valeurs invalides

checks = {
    "nb_claims_zero": int((pricing_base["nb_claims"] == 0).sum()),
    "pure_premium_negative": int((pricing_base["pure_premium"] < 0).sum()),
    "pure_premium_nan": int(pricing_base["pure_premium"].isna().sum()),
    "avg_duration_negative": int((pricing_base["avg_duration"] < 0).sum()),
    "avg_duration_nan": int(pricing_base["avg_duration"].isna().sum()),
}
checks



{'nb_claims_zero': 0,
 'pure_premium_negative': 0,
 'pure_premium_nan': 0,
 'avg_duration_negative': 0,
 'avg_duration_nan': 0}