#### Setting up the Gamma GLM scoring run (imports and constants)

In [11]:
# Imports:
import pandas as pd  # For setting up the data frame
from sklearn.preprocessing import StandardScaler  # For normalizing the numerical values
import joblib  # Persistent model

# Constants:

# File names (bare names, no directory component).
SCORE_DATASET_FILE_NAME = 'scoring-dataset.csv'  # Input: data set to score
MODEL_NAME = 'gamma-glm.joblib'                  # Input: persisted model
SUBMISSION_FILE_NAME = 'submission.csv'          # Output: predictions

# Columns read from the scoring data set.
INDEP_VAR = [
    'pol_duration',
    'drv_age1',
    'drv_age_lic1',
    'drv_age2',
    'drv_age_lic2',
    'vh_age',
    'vh_value',
    'vh_weight',
]

# Name of the target column.
DEP_VAR = 'claim_amount'

# Raw columns that are only needed to build the derived columns
# ('lic1_age', 'lic2_age', 'vh_value/kg') and are dropped afterwards.
TEMP_VAR = [
    'drv_age1',
    'drv_age_lic1',
    'drv_age2',
    'drv_age_lic2',
    'vh_value',
    'vh_weight',
]

# Numerical columns that get standardized; this list also fixes their order.
NUM_VAR = [
    'pol_duration',
    'vh_age',
    'vh_value/kg',
    'lic1_age',
    'lic2_age',
]

***
#### Making and cleaning the data frame

In [12]:
# Data frame and input set:
score_df = pd.read_csv(SCORE_DATASET_FILE_NAME, usecols=INDEP_VAR)

# Cleaning the data frame:
# NOTE(review): the medians here and the scaler below are computed on the
# scoring file itself. If the model was trained with statistics taken from the
# training set, those should be reused instead -- confirm against the training
# notebook.
score_df = score_df.fillna(score_df.median())  # Replaces NA entries with the median of their column
# Zero weights are replaced with the column median (computed with the zeros
# still included) so that the vh_value / vh_weight ratio below never divides by zero.
score_df['vh_weight'] = score_df['vh_weight'].replace(to_replace=0, value=score_df['vh_weight'].median())

# Combining variables (column-wise arithmetic; same values as a row-wise apply,
# without a Python-level loop over the rows):
score_df['lic1_age'] = score_df['drv_age1'] - score_df['drv_age_lic1']
score_df['lic2_age'] = score_df['drv_age2'] - score_df['drv_age_lic2']
score_df['vh_value/kg'] = score_df['vh_value'] / score_df['vh_weight']

score_df = score_df.drop(columns=TEMP_VAR)

# Scaling:
scaler = StandardScaler()
# The scaled frame reuses score_df's index so the join below aligns row by row
# whatever index score_df carries.
scaled_var = pd.DataFrame(
    scaler.fit_transform(score_df[NUM_VAR]),
    columns=NUM_VAR,
    index=score_df.index,
)  # Normalizing the numerical variables
score_df = score_df.drop(columns=NUM_VAR)  # Dropping the old numerical variables
score_df = score_df.join(scaled_var)  # Joining the new numerical variables (in NUM_VAR order)

***
#### Predicting and exporting the predictions

In [13]:
# Loading the Gamma GLM:
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load model files that come from a trusted source.
gamma_glm = joblib.load(MODEL_NAME)

# Predictions data frame (the column is named after DEP_VAR rather than a
# repeated string literal, so the name is defined in one place only):
pred_df = pd.DataFrame(gamma_glm.predict(score_df), columns=[DEP_VAR])

# Policy ID data frame (read from the same scoring file, so rows are in the
# same order as the predictions):
id_policy_df = pd.read_csv(SCORE_DATASET_FILE_NAME, usecols=['id_policy'])

# Exporting the submission data frame as a csv file (IDs and predictions are
# joined on their row index):
id_policy_df.join(pred_df).to_csv(SUBMISSION_FILE_NAME, index=False)