#### Setting up the Gamma GLM

In [346]:
# Imports:
import pandas as pd  # For setting up the data frame
from sklearn.preprocessing import StandardScaler  # For normalizing the numerical values
from sklearn.linear_model import GammaRegressor  # For the Gamma GLM
from sklearn.model_selection import train_test_split  # For setting up the train-test modularization
from sklearn.metrics import root_mean_squared_error
import numpy as np
import joblib  # Persistent model

# Constants:
DATASET_FILE_NAME = 'training-dataset.csv'

INDEP_VAR = ['pol_duration', 'drv_age1', 'drv_age_lic1', 'drv_age2', 'drv_age_lic2', 'vh_age', 'vh_value', 'vh_weight']
DEP_VAR = 'claim_amount'

NUM_VAR = ['pol_duration', 'vh_age', 'vh_value/kg', 'lic1_age', 'lic2_age']
TEMP_VAR = ['drv_age1', 'drv_age_lic1', 'drv_age2', 'drv_age_lic2', 'vh_value', 'vh_weight']

MODEL_NAME = 'gamma-glm.joblib'

***
#### Making and cleaning the data frame

In [347]:
# Data frame and input set:
X, y = pd.read_csv(DATASET_FILE_NAME, usecols=INDEP_VAR), pd.read_csv(DATASET_FILE_NAME)[DEP_VAR]

# Cleaning the data frame:
X = X.fillna(X.median())  # Replaces NA entries with the corresponding median for drv_age2, drv_age_lic2
X['vh_weight'] = X['vh_weight'].replace(to_replace=0, value=X['vh_weight'].median())  # Replaces zero entries with vh_weight's median

# Combining variables:
X['lic1_age'] = X.apply(lambda row: row['drv_age1'] - row['drv_age_lic1'], axis=1)
X['lic2_age'] = X.apply(lambda row: row['drv_age2'] - row['drv_age_lic2'], axis=1)
X['vh_value/kg'] = X.apply(lambda row: row['vh_value'] / row['vh_weight'], axis=1)

X = X.drop(columns=TEMP_VAR)

# Scaling:
scaler = StandardScaler()
scaled_var = pd.DataFrame(scaler.fit_transform((X[NUM_VAR])), columns=NUM_VAR)  # Normalizing the numerical variables
X = X.drop(columns=NUM_VAR)  # Dropping the old numerical variables
X = X.join(scaled_var)  # Joining the new numerical variables

***
#### Setting up the train-test modularization and Gamma GLM

In [348]:
# Training and testing sets:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Gamma GLM:
gamma_glm = GammaRegressor()
# gamma_glm.fit(X_train, y_train)
gamma_glm.fit(X, y)

***
#### Predicting the test set and calculating the accuracy

In [349]:
# Predictions:
predictions = gamma_glm.predict(X_test)

# RMSE:
RMSE = root_mean_squared_error(y_test, predictions)
actual_mean = y_test.mean()

data = [RMSE, predictions.mean(), actual_mean, RMSE / actual_mean] 
data = np.round(np.array(data), 3)

print("RMSE:", data[0])
print("Pred Mean:", data[1])
print("Actual Mean:", data[2])
print("RMSE / Actual Mean:", data[3])

RMSE: 956.754
Pred Mean: 1049.375
Actual Mean: 1019.598
RMSE / Actual Mean: 0.938


***
#### Exporting the model

In [350]:
# Persistent model:
joblib.dump(gamma_glm, MODEL_NAME)

['gamma-glm.joblib']