#### Setting up the machine learning model

In [14]:
# Imports:
import time  # For getting runtime
import pandas as pd  # For setting up the data frame
from sklearn.preprocessing import StandardScaler  # For scaling the numerical variables
import joblib  # Persistent model

# Wall-clock reference point; a later cell subtracts it to report the runtime.
start_time = time.time()

# --- File names -------------------------------------------------------------
MODEL_NAME = 'insurance-model.joblib'  # Persisted model loaded with joblib
SCORE_DATASET_FILE_NAME = 'scoring-dataset.csv'  # Input data to be scored
SUBMISSION_FILE_NAME = 'submission.csv'  # Output file with the predictions

# --- Feature lists ----------------------------------------------------------
# Columns read from the scoring dataset and fed to the model.
INDEP_VAR = [
    'Gender',
    'policyHolderAge',
    'hasCanadianDrivingLicense',
    'territory',
    'hasAutoInsurance',
    'hadVehicleClaimInPast',
    'homeInsurancePremium',
    'isOwner',
    'rentedVehicle',
    'hasMortgage',
    'nbWeeksInsured',
    'vehicleStatus',
]

# Subset of INDEP_VAR that gets one-hot encoded.
CAT_VAR = [
    'Gender',
    'territory',
    'hadVehicleClaimInPast',
    'vehicleStatus',
]

# Subset of INDEP_VAR that gets standardized.
NUM_VAR = [
    'policyHolderAge',
    'homeInsurancePremium',
    'nbWeeksInsured',
]

***
#### Making and cleaning the data frame

In [10]:
# Scoring data frame (only the INDEP_VAR columns are loaded from the file):
score_raw = pd.read_csv(SCORE_DATASET_FILE_NAME, usecols=INDEP_VAR)

# Cleaning the data frame:
# fillna(0) replaces NA entries with 0 in *every* loaded column, not only
# hasMortgage.
# NOTE(review): if only hasMortgage is expected to contain NAs, confirm that
# no other column is silently being filled here.
score_filled = score_raw.fillna(0)

# Converts CAT_VAR into indicator columns; drop_first=True drops the first
# level of each categorical variable.
score_encoded = pd.get_dummies(score_filled, columns=CAT_VAR, drop_first=True)

# Scaling the numerical variables.
# NOTE(review): the scaler is fitted on the scoring data itself, so the
# resulting means/variances come from this file. Confirm this matches how the
# persisted model's training data was scaled; otherwise the scaler fitted at
# training time should be persisted and reused here.
scaler = StandardScaler()
scaled_var = pd.DataFrame(
    scaler.fit_transform(score_encoded[NUM_VAR]),
    columns=NUM_VAR,
    index=score_encoded.index,  # Keeps the index-based join below row-aligned
)

# Dropping the unscaled numerical columns and joining the scaled ones, so the
# scaled columns end up as the last columns of the frame.
score_df = score_encoded.drop(columns=NUM_VAR).join(scaled_var)

***
#### Predicting and exporting the predictions

In [17]:
# Restoring the persisted logistic regression model from disk.
# NOTE(review): joblib.load unpickles the file, so MODEL_NAME must point to a
# trusted artifact.
log_reg = joblib.load(MODEL_NAME)

# Predicted classes for the prepared scoring frame, as a one-column data frame:
predictions = log_reg.predict(score_df)
pred_df = pd.DataFrame({'predictedResponseVariable': predictions})

# Policy IDs, read from the same scoring file as a separate one-column frame:
policyId_df = pd.read_csv(SCORE_DATASET_FILE_NAME, usecols=['policyId'])

# Pairing IDs with predictions by row index and writing the submission csv:
submission_df = policyId_df.join(pred_df)
submission_df.to_csv(SUBMISSION_FILE_NAME, index=False)

# Elapsed wall-clock time since start_time was taken in the setup cell:
end_time = time.time()
elapsed = end_time - start_time
print("Execution time:", elapsed, "seconds")

# Count of predictions per class (shows how many 1s, if any, were predicted):
print(pred_df['predictedResponseVariable'].value_counts())

Execution time: 91.4195306301117 seconds
0    40118
Name: predictedResponseVariable, dtype: int64
