#### Imports and constants for the Gamma GLM

In [5]:
# Imports:
import pandas as pd  # For setting up the data frame
from sklearn.preprocessing import StandardScaler  # For normalizing the numerical values
from sklearn.linear_model import GammaRegressor  # For the Gamma GLM
from sklearn.model_selection import cross_val_score  # For setting up the cross-validation
import numpy as np
import joblib  # Persistent model

# Constants:
# CSV file that holds both the predictor columns and the target column.
DATASET_FILE_NAME = 'training-dataset.csv'

# Raw predictor columns read from the CSV, and the target column.
INDEP_VAR = [
    'pol_duration',
    'drv_age1', 'drv_age_lic1',
    'drv_age2', 'drv_age_lic2',
    'vh_age', 'vh_value', 'vh_weight',
]
DEP_VAR = 'claim_amount'

# Columns that get standardized; their order here is the column order of the scaled frame.
NUM_VAR = [
    'pol_duration',
    'vh_age',
    'vh_value/kg',
    'lic1_age',
    'lic2_age',
]
# Raw columns that only feed the derived features and are dropped afterwards.
TEMP_VAR = [
    'drv_age1', 'drv_age_lic1',
    'drv_age2', 'drv_age_lic2',
    'vh_value', 'vh_weight',
]

# File the fitted model is written to with joblib.
MODEL_NAME = 'gamma-glm.joblib'

***
#### Making and cleaning the data frame

In [6]:
# Data frame and input set:
# Parse the CSV once (only the columns that are needed) and split it into
# the predictors X and the target y; X keeps the file's column order.
raw = pd.read_csv(DATASET_FILE_NAME, usecols=INDEP_VAR + [DEP_VAR])
X, y = raw.drop(columns=DEP_VAR), raw[DEP_VAR]

# Cleaning the data frame:
X = X.fillna(X.median())  # Replaces NA entries in every column with that column's median
X['vh_weight'] = X['vh_weight'].replace(to_replace=0, value=X['vh_weight'].median())  # Replaces zero entries with vh_weight's median (vh_weight is a divisor below)

# Combining variables (whole-column arithmetic rather than a row-by-row apply):
X['lic1_age'] = X['drv_age1'] - X['drv_age_lic1']
X['lic2_age'] = X['drv_age2'] - X['drv_age_lic2']
X['vh_value/kg'] = X['vh_value'] / X['vh_weight']

X = X.drop(columns=TEMP_VAR)  # The raw inputs of the derived columns are no longer needed

# Scaling:
scaler = StandardScaler()
# Reuse X's index so the join below aligns row-for-row even if X's index is
# ever something other than the default 0..n-1 range.
scaled_var = pd.DataFrame(scaler.fit_transform(X[NUM_VAR]), columns=NUM_VAR, index=X.index)  # Standardizing the numerical variables
X = X.drop(columns=NUM_VAR).join(scaled_var)  # Swapping the old numerical variables for the scaled ones (NUM_VAR order)

***
#### Setting up the cross-validation and Gamma GLM

In [7]:
# Gamma GLM:
gamma_glm = GammaRegressor()  # Library-default hyper-parameters

# RMSE:
# cross_val_score returns one negated RMSE per fold; round to 3 decimals first,
# then flip the sign back so the table shows positive errors.
# NOTE(review): X was standardized on the full dataset before this call, so the
# scaling statistics include each validation fold — consider fitting the scaler
# inside the cross-validation instead.
fold_scores = cross_val_score(gamma_glm, X, y, scoring='neg_root_mean_squared_error')
fold_rmse = np.negative(np.round(fold_scores, 3))
RMSE = pd.DataFrame(fold_rmse, columns=['RMSE'])
display(RMSE)

Unnamed: 0,RMSE
0,946.435
1,959.153
2,933.228
3,939.493
4,918.529


***
#### Exporting the model

In [8]:
# Persistent model:
# Fit on the complete data set (fit returns the estimator itself) and write the
# fitted estimator to MODEL_NAME.
# NOTE(review): only the GLM is dumped; the fitted `scaler` used to standardize
# X is not persisted here — confirm that consumers of the file can reproduce it.
fitted_glm = gamma_glm.fit(X, y)
joblib.dump(fitted_glm, MODEL_NAME)

['gamma-glm.joblib']