In [None]:
from rich import print as rprint
import arff
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import main

from IPython.core.interactiveshell import InteractiveShell

# Display the value of every top-level expression in a cell, not only the
# last one.
InteractiveShell.ast_node_interactivity = "all"
# Show floating-point numbers with two decimal places in pandas output.
pd.set_option("display.precision", 2)


## Explorative Datenanalyse

-   Der zweite Datensatz enthält sechs Verträge mehr als der erste Datensatz.


In [None]:
# Load the claims data; the first CSV column becomes the row index.
df = pd.read_csv("ins_claims.csv", index_col=0)

# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in rprint() only appended a stray "None" to the output.
df.info()


-   Beschreibung der Variablen


In [None]:
# Summary statistics restricted to the float64 and object (string) columns.
described_dtypes = ["float64", "object"]
df.describe(include=described_dtypes)


-   Korrelationen zwischen unabhängigen Variablen


In [None]:
# Pairwise correlations of the numeric columns, computed only on rows with a
# positive ClaimAmount.
has_claim = df["ClaimAmount"] > 0
df.loc[has_claim].corr(numeric_only=True)


### ClaimAmount und ClaimAmount/Exposure

-   Viele Nullen für `ClaimAmount`


In [None]:
# Fraction of rows with a positive ClaimAmount and with a positive ClaimNb.
# The mean of a boolean Series is the share of True values; unlike the builtin
# sum(), it stays vectorised instead of iterating the Series in Python.
rprint((df["ClaimAmount"] > 0.0).mean())
rprint((df["ClaimNb"] > 0.0).mean())


-   Outliers


In [None]:
# Remove zeros and outliers: keep rows whose ClaimAmount is positive and below
# the 99.5 % quantile of ClaimAmount (the quantile is taken over all rows,
# zeros included).
up_lim = df["ClaimAmount"].quantile(0.995)

# Combine the conditions with the logical operator `&`; multiplying boolean
# Series with `*` only happens to behave like an AND.
logical_vec = (df["ClaimAmount"] > 0.0) & (df["ClaimAmount"] < up_lim)

# Explicit copy, so adding a column below can never affect `df`.
df_sev = df[logical_vec].copy()

# ClaimAmount divided by Exposure, inserted as the first column.
df_sev.insert(
    loc=0, column="ClaimExp", value=df_sev["ClaimAmount"] / df_sev["Exposure"]
)

# Plot: 2x2 grid, one row per variable (scatter against IDpol, histogram).
fig, axs = plt.subplots(2, 2, figsize=(10, 10))

# Plot ClaimAmount
df_sev.plot.scatter(
    x="IDpol",
    y="ClaimAmount",
    marker=".",
    alpha=0.1,
    linewidth=0.5,
    ax=axs[0, 0],
)

df_sev.hist(column="ClaimAmount", bins=50, ax=axs[0, 1])

# Plot Claim per Exposure Year
df_sev.plot.scatter(
    x="IDpol", y="ClaimExp", marker=".", alpha=0.1, linewidth=0.5, ax=axs[1, 0]
)

df_sev.hist(column="ClaimExp", bins=50, ax=axs[1, 1])


-   Scatter plot


### ClaimNb und Density


In [None]:
fig, axs = plt.subplots(2, 2, figsize=(10, 10))
(ax_nb_hist, ax_nb_scatter), (ax_dens_hist, ax_dens_scatter) = axs

# Top row: histogram of ClaimNb and ClaimNb plotted against IDpol.
df.hist(column="ClaimNb", bins=40, ax=ax_nb_hist)
df.plot(
    kind="scatter",
    x="IDpol",
    y="ClaimNb",
    marker=".",
    alpha=0.1,
    linewidth=0.5,
    ax=ax_nb_scatter,
)

# Bottom row: histogram of Density and Density plotted against IDpol.
df.hist(column="Density", bins=10, ax=ax_dens_hist)
df.plot(
    kind="scatter",
    x="IDpol",
    y="Density",
    marker=".",
    alpha=0.1,
    ax=ax_dens_scatter,
)


### DrivAge, VehAge, VehPower and BonusMalus


In [None]:
# One 30-bin histogram per listed column. The (3, 2) layout provides six
# panels, of which four are used.
hist_columns = ["DrivAge", "VehAge", "VehPower", "BonusMalus"]
df.hist(column=hist_columns, bins=30, layout=(3, 2), figsize=(10, 10))


### Area and Region


In [None]:
fig, axs = plt.subplots(2, 2, figsize=(10, 10))

# Area: number of rows per level (left) and mean ClaimAmount per level (right).
by_area = df.groupby("Area")
by_area.size().plot.bar(ax=axs[0, 0])
by_area[["ClaimAmount"]].mean().plot.bar(ax=axs[0, 1])

# Region: same two panels, grouped by Region.
by_region = df.groupby("Region")
by_region.size().plot.bar(ax=axs[1, 0])
by_region[["ClaimAmount"]].mean().plot.bar(ax=axs[1, 1])


# Second, separate figure that repeats the two Area panels side by side.
fig, (ax1, ax2) = plt.subplots(1, 2)

by_area.size().plot.bar(ax=ax1)

by_area[["ClaimAmount"]].mean().plot.bar(ax=ax2)


### Vehicle Brand and Vehicle Gas-Type


In [None]:
fig, axs = plt.subplots(2, 2, figsize=(10, 10))

# VehBrand: number of rows per level (left) and mean ClaimAmount per level
# (right).
by_brand = df.groupby("VehBrand")
by_brand.size().plot.bar(ax=axs[0, 0])
by_brand[["ClaimAmount"]].mean().plot.bar(ax=axs[0, 1])

# VehGas: same two panels, grouped by VehGas.
by_gas = df.groupby("VehGas")
by_gas.size().plot.bar(ax=axs[1, 0])
by_gas[["ClaimAmount"]].mean().plot.bar(ax=axs[1, 1])


## Feature engineering


-   `load_csv()` function in `main.py`


## Modellvergleich


-   `obj.score()` method
-   RMSE


## Modellbildung


-   Generalized Linear Models (GLM)
-   Generalized Additive Models (GAM)
-   Tree based
-   Neural Networks

-   Yang, Qian, Zou (2018): "Insurance premium prediction via gradient tree-boosted Tweedie compound Poisson models"


##### Frequency-Severity Decomposition

-   Poisson regression für `ClaimNb` und Gamma regression für `ClaimAmount` pro Claim
-   Claim Frequency ist einfacher zu modellieren
-   Claim Severity ist schwierig
-   Man kann nicht kontrollieren wie viel Schaden es geben wird
-   "Overall, the modeling of claim amounts is more difficult than claim frequencies" Denuit et al. (2019), p. 18

##### Nullen

-   Tweedie regression
-   Zuerst Klassifikation, dann Modellierung
    -   Codes mit perfekter Klassifikation

##### Outliers

-   $p$ für Outliers und $1-p$ für Nicht-Outliers
