In [28]:
# Problem set 3 analysis

import numpy as np
import pandas as pd
import data._sample_split as sample_split
from data._load_transform import *

import matplotlib.pyplot as plt
import scipy.optimize as optimize
import scipy.stats
from dask_ml.preprocessing import Categorizer
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import ShuffleSplit
from glum import GeneralizedLinearRegressor
from glum import TweedieDistribution


In [23]:
# Re-import the local `data._sample_split` module so that edits to it take
# effect in this kernel without a restart.
# NOTE(review): this is a development aid and an import outside the top import
# cell; `%load_ext autoreload` / `%autoreload 2` in a config cell would make
# this cell unnecessary.
from importlib import reload
reload(sample_split)

<module 'data._sample_split' from 'c:\\Users\\charl\\OneDrive\\Uni\\Masters\\Computing\\FDS\\ps3_claims\\ps3\\data\\_sample_split.py'>

In [24]:
# Load the data using the pre-made function.
# NOTE(review): `load_transform` is not imported by name; presumably it comes
# from the star import of `data._load_transform` in the import cell — confirm.
df = load_transform()


We see that IDpol is a unique ID, so we will use it to create the custom train/test split.

In [25]:
# Quick look at the raw frame: first rows, numeric summary, then dtypes/nulls.
for overview in (df.head(), df.describe()):
    print(overview)
df.info()

   IDpol  ClaimNb  Exposure Area  VehPower  VehAge  DrivAge  BonusMalus  \
0      1        0      0.10    D         5       0        5          50   
1      3        0      0.77    D         5       0        5          50   
2      5        0      0.75    B         6       1        5          50   
3     10        0      0.09    B         7       0        4          50   
4     11        0      0.84    B         7       0        4          50   

  VehBrand   VehGas  Density Region  ClaimAmount  ClaimAmountCut  
0      B12  Regular     1217    R82          0.0             0.0  
1      B12  Regular     1217    R82          0.0             0.0  
2      B12   Diesel       54    R22          0.0             0.0  
3      B12   Diesel       76    R72          0.0             0.0  
4      B12   Diesel       76    R72          0.0             0.0  
              IDpol        ClaimNb       Exposure       VehPower  \
count  6.780130e+05  678013.000000  678013.000000  678013.000000   
mean   2.62

In [26]:
# Flag every row as train or test with the project helper, keyed on IDpol.
# presumably 0.8 is the training fraction — the ratio printed below is ~0.8.
df = sample_split.create_sample_split(df, 'IDpol', 0.8)

# Count rows by label, not by position. The previous `value_counts()[0]` /
# `[1]` used integer keys on a True/False index, which pandas resolves
# positionally (deprecated, emits a FutureWarning) and which only gave the
# right answer because the training rows are the most frequent value.
training_count = int((df['sample'] == True).sum())
testing_count = int((df['sample'] == False).sum())

print(f'Train test split ratio: {training_count / (testing_count + training_count)}')
# 80% split as expected


Train test split ratio: 0.7999994100408104


  training_count = df['sample'].value_counts()[0]
  testing_count = df['sample'].value_counts()[1]


### Tweedie model for pure premium

- Exposure: how long (in years) the policy was held.
- ClaimAmountCut: total (cut) claim amount per policy.

- PurePremium: cost to the insurer per year of the policy.

So, to get the pure premium, we divide the (cut) claim amount by the exposure: PurePremium = ClaimAmountCut / Exposure.

In [31]:
# Preview the first five rows of the working frame.
# NOTE(review): the saved output of this cell already shows a `PurePremium`
# column, which is only created in a later cell — the cell was executed out of
# order and will show different columns after Restart & Run All.
df.head(n=5)

Unnamed: 0,IDpol,ClaimNb,Exposure,Area,VehPower,VehAge,DrivAge,BonusMalus,VehBrand,VehGas,Density,Region,ClaimAmount,ClaimAmountCut,sample,PurePremium
0,1,0,0.1,D,5,0,5,50,B12,Regular,1217,R82,0.0,0.0,True,0.0
1,3,0,0.77,D,5,0,5,50,B12,Regular,1217,R82,0.0,0.0,True,0.0
2,5,0,0.75,B,6,1,5,50,B12,Diesel,54,R22,0.0,0.0,True,0.0
3,10,0,0.09,B,7,0,4,50,B12,Diesel,76,R72,0.0,0.0,False,0.0
4,11,0,0.84,B,7,0,4,50,B12,Diesel,76,R72,0.0,0.0,True,0.0


In [None]:
# Model inputs: target, exposure weights and predictor lists.

# Pure premium = cut claim amount per unit of exposure; stored on the frame so
# the train/test split below can select it by column name.
df["PurePremium"] = df["ClaimAmountCut"].div(df["Exposure"])
y = df["PurePremium"]

# Exposure as a plain array
weight = df['Exposure'].values

# Columns treated as categorical levels, followed by the two remaining predictors
categoricals = [
    "VehBrand",
    "VehGas",
    "Region",
    "Area",
    "DrivAge",
    "VehAge",
    "VehPower",
]
predictors = [*categoricals, "BonusMalus", "Density"]

In [None]:
# Build train/test features, targets and exposure weights from the `sample` flag.

# Row masks are computed once and reused for every selection below.
is_train = df["sample"] == True
is_test = df["sample"] == False

X_train = df.loc[is_train, predictors]
X_test = df.loc[is_test, predictors]

y_train = df.loc[is_train, "PurePremium"]
y_test = df.loc[is_test, "PurePremium"]

weights_train = df.loc[is_train, "Exposure"]
weights_test = df.loc[is_test, "Exposure"]

# Convert the listed columns to categorical: the categorizer is fitted on the
# training features only and then applied unchanged to the test features.
categorizer = Categorizer(columns=categoricals)
X_train = categorizer.fit_transform(X_train)
X_test = categorizer.transform(X_test)

In [41]:
# Fit an exposure-weighted Tweedie GLM (power 1.5) to the training pure premium.
TweedieDist = TweedieDistribution(power=1.5)
t_model = GeneralizedLinearRegressor(
    family=TweedieDist,
    fit_intercept=True,
    l1_ratio=1,  # per the glum docs, l1_ratio=1 selects a pure L1 penalty
    alpha_search=True,
)
t_model.fit(X_train, y_train, sample_weight=weights_train)




In [None]:
# Show the fitted coefficients next to their feature names.
n_coefficients = len(t_model.coef_)
print(n_coefficients)

# Previously a bare `len(t_model.feature_names_)` sat here; because it was not
# the last expression of the cell its value was silently discarded. Make the
# intended sanity check explicit instead.
assert n_coefficients == len(t_model.feature_names_), (
    "coefficient vector and feature-name list differ in length"
)

# Transposed so each column is one feature (coefficient above its name).
pd.DataFrame({"coefficients": t_model.coef_, "names": t_model.feature_names_}).T


59


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,49,50,51,52,53,54,55,56,57,58
coefficients,0.0,0.099953,0.370635,-0.154199,0.187329,-0.155599,-0.015779,-0.001732,0.083403,0.013485,...,0.012421,-0.229012,-0.14127,-0.109797,0.01156,0.007467,-0.032344,0.149581,0.032305,0.000006
names,VehBrand[B1],VehBrand[B10],VehBrand[B11],VehBrand[B12],VehBrand[B13],VehBrand[B14],VehBrand[B2],VehBrand[B3],VehBrand[B4],VehBrand[B5],...,VehAge[1],VehAge[2],VehPower[4],VehPower[5],VehPower[6],VehPower[7],VehPower[8],VehPower[9],BonusMalus,Density


In [55]:
# Assessing model performance

# Predict once per split and reuse the result; the original called
# `t_model.predict` twice for each split.
pred_train = t_model.predict(X_train)
pred_test = t_model.predict(X_test)

# Loss: Tweedie deviance with exposure as sample weight, divided by the total
# exposure of the split.
print('training loss = {}'.format(
    TweedieDist.deviance(y_train, pred_train, sample_weight=weights_train)/np.sum(weights_train)
))

print('testing loss = {}'.format(
    TweedieDist.deviance(y_test, pred_test, sample_weight=weights_test)/np.sum(weights_test)
))

# Predicted vs observed total claim amount.
# Predictions are pure premiums (claim amount per unit exposure), so they are
# multiplied by exposure before summing to be comparable with ClaimAmountCut.
print('Total claim amount on train set, observed: {}, predicted: {}'.
      format(np.sum(df.loc[df["sample"] == True, "ClaimAmountCut"]),
             np.sum(pred_train * weights_train)))

print('Total claim amount on test set, observed: {}, predicted: {}'.
      format(np.sum(df.loc[df["sample"] == False, "ClaimAmountCut"]),
             np.sum(pred_test * weights_test)))

training loss = 73.47287388628281
testing loss = 74.86435130744907
Total claim amount on train set, observed: 39180777.71000001, predicted: 39524356.12970478
Total claim amount on test set, observed: 10121418.339999998, predicted: 9877096.531121776
