In [1]:
# Importing the required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn import linear_model
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor as RFR

In [2]:
# Importing necessary libraries for modeling our data
import pandas as pd
from sklearn.model_selection import train_test_split



In [3]:
# Reading in the data
numerical_df = pd.read_csv('numercal.csv')
categorical_df = pd.read_csv('categorical.csv')


In [4]:
# Preprocessing the categorical data
categorical_df.columns = categorical_df.columns.str.lower().str.replace(' ', '_')
categorical_df.drop('effective_to_date_-_year', axis=1, inplace=True)

In [5]:
# Merging the numerical and categorical dataframes on the 'customer' column
data = pd.merge(numerical_df, categorical_df, on='customer')

In [6]:
data.head(10)

Unnamed: 0,customer,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,total_claim_amount,state,response,coverage,...,gender,location_code,marital_status,policy_type,policy_level,renew_offer_type,sales_channel,vehicle_class,vehicle_size,effective_to_date_-_month
0,BU79786,2763.519279,56274,69,32,5,384.811147,Washington,No,Basic,...,F,Suburban,Married,Corporate Auto,3,1,Agent,Standard,Medsize,2
1,QZ44356,6979.535903,0,94,13,42,1131.464935,Arizona,No,Extended,...,F,Suburban,Single,Personal Auto,3,3,Agent,Standard,Medsize,1
2,AI49188,12887.43165,48767,108,18,38,566.472247,Nevada,No,Premium,...,F,Suburban,Married,Personal Auto,3,1,Agent,Standard,Medsize,2
3,WW63253,7645.861827,0,106,18,65,529.881344,California,No,Basic,...,M,Suburban,Married,Corporate Auto,2,1,Call Center,Sports,Medsize,1
4,HB64268,2813.692575,43836,73,12,44,138.130879,Washington,No,Basic,...,M,Rural,Single,Personal Auto,1,1,Agent,Standard,Medsize,2
5,OC83172,8256.2978,62902,69,14,94,159.383042,Oregon,Yes,Basic,...,F,Rural,Married,Personal Auto,3,2,Web,Standard,Medsize,1
6,XZ87318,5380.898636,55350,67,0,13,321.6,Oregon,Yes,Basic,...,F,Suburban,Married,Corporate Auto,3,1,Agent,Standard,Medsize,2
7,CF85061,7216.100311,0,101,0,68,363.02968,Arizona,No,Premium,...,M,Urban,Single,Corporate Auto,3,1,Agent,Standard,Medsize,1
8,DY87989,24127.50402,14072,71,13,3,511.2,Oregon,Yes,Basic,...,M,Suburban,Divorced,Corporate Auto,3,1,Agent,Standard,Medsize,1
9,BQ94931,7388.178085,28812,93,17,7,425.527834,Oregon,No,Extended,...,F,Urban,Married,Special Auto,2,2,Branch,Standard,Medsize,2


In [8]:
# Converting 'effective_to_date' to a datetime object and changing column data types
data['effective_to_date'] = pd.to_datetime(data['effective_to_date'])
data[['renew_offer_type', 'policy_level']] = data[['renew_offer_type', 'policy_level']].astype('object')

In [9]:
# Separating the data into features (X) and target (y)
y = data['total_claim_amount']
X = data.drop(['customer', 'total_claim_amount', 'effective_to_date'], axis=1)

In [10]:
# Creating a train-test split of the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=405)

In [31]:
# Importing the necessary libraries
from sklearn.preprocessing import MinMaxScaler

# Selecting the numeric columns from the training data
X_train_num = X_train.select_dtypes(include = np.number)

# Scaling the numeric training data using MinMaxScaler
scaler = MinMaxScaler()
X_train_num_norm = scaler.fit_transform(X_train_num)

# Converting the normalized data back to a pandas DataFrame and renaming the columns
X_train_num_norm = pd.DataFrame(X_train_num_norm,columns=X_train_num.columns)

# Displaying the first few rows of the normalized DataFrame
X_train_num_norm.head() 

# The code applies the MinMaxScaler to the numeric columns of the training data to normalize the values between 0 and 1. 
# The result is then converted to a pandas DataFrame and the column names are preserved. 
# The normalized DataFrame is displayed to verify the successful normalization of the data.


Unnamed: 0,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception
0,0.036476,0.310769,0.21097,0.0,0.575758
1,0.024533,0.518238,0.130802,0.8,0.89899
2,0.009478,0.432122,0.012658,0.885714,0.191919
3,0.04287,0.0,0.278481,0.685714,0.585859
4,0.047938,0.0,0.046414,0.628571,0.585859


In [16]:
# Selecting the categorical columns from the training data
X_train_cat = X_train.select_dtypes(include = object)

In [32]:
# Importing the necessary libraries
from sklearn.preprocessing import OneHotEncoder

# Selecting the categorical columns to be one-hot encoded
X_train_cat_ohe = X_train_cat[['state', 'employmentstatus', 'location_code', 'marital_status', 'policy_type', 'renew_offer_type', 'sales_channel', 'vehicle_class']]

# Fitting and applying OneHotEncoder to the selected columns
encoder = OneHotEncoder(drop='first').fit(X_train_cat_ohe)
encoded = encoder.transform(X_train_cat_ohe).toarray()

# Creating column names for the encoded features
cols = encoder.get_feature_names_out(input_features=X_train_cat_ohe.columns)

# Converting the encoded features back to a pandas DataFrame and renaming the columns
X_train_cat_ohe = pd.DataFrame(encoded, columns=cols)

# Displaying the first few rows of the one-hot encoded DataFrame
X_train_cat_ohe.head()

# This code selects the categorical columns from the training data that we want to encode using one-hot encoding. 
# It then uses the OneHotEncoder to fit and transform the selected columns into an array of binary-encoded features.
# The encoded feature names are created and the array is converted back to a pandas DataFrame. 
# The columns in the encoded DataFrame are then named with the feature names, and the first few rows of the encoded DataFrame are displayed.


Unnamed: 0,state_California,state_Nevada,state_Oregon,state_Washington,employmentstatus_Employed,employmentstatus_Medical Leave,employmentstatus_Retired,employmentstatus_Unemployed,location_code_Suburban,location_code_Urban,...,policy_type_Personal Auto,policy_type_Special Auto,renew_offer_type_2,renew_offer_type_3,renew_offer_type_4,sales_channel_Branch,sales_channel_Call Center,sales_channel_Web,vehicle_class_Sports,vehicle_class_Standard
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0


In [33]:
# Map categorical data to numerical values
X_train_cat["coverage"] = X_train_cat["coverage"].map({"Basic" : 0, "Extended" : 1, "Premium" : 2})
X_train_cat["policy_level"] = X_train_cat["policy_level"].map({1 : 0, 2 : 1, 3 : 2})
X_train_cat["vehicle_size"] = X_train_cat["vehicle_size"].map({"Small" : 0, "Medsize" : 1, "Large" : 2})
X_train_cat["effective_to_day_of_week"] = X_train_cat["effective_to_day_of_week"].map({"Monday" : 0,"Tuesday" : 1,"Wednesday" : 2,"Thursday" : 3,"Friday" : 4,"Saturday" : 5,"Sunday" : 6})
X_train_cat["effective_to_date_month"] = X_train_cat["effective_to_date_month"].map({1 : 0, 2 : 1})


In [19]:
X_train_cat_ord=X_train_cat[['coverage','policy_level','vehicle_size','effective_to_day_of_week','effective_to_date_month']]
X_train_cat_ord = X_train_cat_ord.reset_index()
X_train_cat_ord = X_train_cat_ord.drop(['index'], axis=1)
X_train_cat_ord.head()

Unnamed: 0,coverage,policy_level,vehicle_size,effective_to_day_of_week,effective_to_date_month
0,0,1,0,3,0
1,1,1,1,6,0
2,0,0,1,3,0
3,1,1,0,1,0
4,0,2,2,4,0


In [34]:
# Selecting the categorical columns with ordinal data
X_train_cat_ord = X_train_cat[['coverage','policy_level','vehicle_size','effective_to_day_of_week','effective_to_date_month']]

# Resetting index to avoid conflicts during concatenation
X_train_cat_ord = X_train_cat_ord.reset_index()

# Dropping the 'index' column
X_train_cat_ord = X_train_cat_ord.drop(['index'], axis=1)

X_train_cat_ord.head()


Unnamed: 0,coverage,policy_level,vehicle_size,effective_to_day_of_week,effective_to_date_month
0,,0.0,,,
1,,0.0,,,
2,,,,,
3,,0.0,,,
4,,1.0,,,


In [35]:
display(X_train_transformed.shape)

(7303, 32)

In [36]:
# Regular check of NAs and shape of DF after every concatenation
display(X_train_transformed.isna().sum())


customer_lifetime_value           0
income                            0
monthly_premium_auto              0
months_since_last_claim           0
months_since_policy_inception     0
state_California                  0
state_Nevada                      0
state_Oregon                      0
state_Washington                  0
employmentstatus_Employed         0
employmentstatus_Medical Leave    0
employmentstatus_Retired          0
employmentstatus_Unemployed       0
location_code_Suburban            0
location_code_Urban               0
marital_status_Married            0
marital_status_Single             0
policy_type_Personal Auto         0
policy_type_Special Auto          0
renew_offer_type_2                0
renew_offer_type_3                0
renew_offer_type_4                0
sales_channel_Branch              0
sales_channel_Call Center         0
sales_channel_Web                 0
vehicle_class_Sports              0
vehicle_class_Standard            0
coverage                    

In [37]:
# Normalize numeric columns of test data using the same transformer used on training data
X_test_num = X_test.select_dtypes(include = np.number)
X_test_num_norm = pd.DataFrame(transformer.transform(X_test_num), columns=X_test_num.columns)
X_test_num_norm.head()


Unnamed: 0,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception
0,0.077583,0.268111,0.008439,0.914286,0.191919
1,0.076398,0.0,0.130802,0.114286,0.606061
2,0.082584,0.270771,0.135021,0.685714,0.272727
3,0.03751,0.345226,0.219409,0.342857,0.707071
4,0.161385,0.222382,0.417722,0.0,0.929293


In [38]:
X_test_cat = X_test.select_dtypes(include = object)

In [39]:
X_test_cat_ohe=X_test_cat[['state','employmentstatus','location_code','marital_status','policy_type','renew_offer_type','sales_channel','vehicle_class']]
encoded = encoder.transform(X_test_cat_ohe).toarray()
cols = encoder.get_feature_names_out(input_features=X_test_cat_ohe.columns)
X_test_cat_ohe = pd.DataFrame(encoded, columns=cols)
X_test_cat_ohe.head()

Unnamed: 0,state_California,state_Nevada,state_Oregon,state_Washington,employmentstatus_Employed,employmentstatus_Medical Leave,employmentstatus_Retired,employmentstatus_Unemployed,location_code_Suburban,location_code_Urban,...,policy_type_Personal Auto,policy_type_Special Auto,renew_offer_type_2,renew_offer_type_3,renew_offer_type_4,sales_channel_Branch,sales_channel_Call Center,sales_channel_Web,vehicle_class_Sports,vehicle_class_Standard
0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
3,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
4,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [40]:
# Map categorical variables to ordered values using dictionaries
coverage_map = {"Basic": 0, "Extended": 1, "Premium": 2}
policy_level_map = {1: 0, 2: 1, 3: 2}
vehicle_size_map = {"Small": 0, "Medsize": 1, "Large": 2}
day_of_week_map = {"Monday": 0, "Tuesday": 1, "Wednesday": 2, "Thursday": 3, "Friday": 4, "Saturday": 5, "Sunday": 6}

X_test_cat["coverage"] = X_test_cat["coverage"].map(coverage_map)
X_test_cat["policy_level"] = X_test_cat["policy_level"].map(policy_level_map)
X_test_cat["vehicle_size"] = X_test_cat["vehicle_size"].map(vehicle_size_map)
X_test_cat["effective_to_day_of_week"] = X_test_cat["effective_to_day_of_week"].map(day_of_week_map)
X_test_cat["effective_to_date_month"] = X_test_cat["effective_to_date_month"].map({1: 0, 2: 1})

# Select the ordered categorical columns, reset the index and drop the old index column
X_test_cat_ord = X_test_cat[['coverage', 'policy_level', 'vehicle_size', 'effective_to_day_of_week', 'effective_to_date_month']]
X_test_cat_ord = X_test_cat_ord.reset_index(drop=True)
X_test_cat_ord.head()


Unnamed: 0,coverage,policy_level,vehicle_size,effective_to_day_of_week,effective_to_date_month
0,0,1,1,6,0
1,1,0,1,0,0
2,1,2,2,2,1
3,2,1,1,4,0
4,2,2,1,1,1


In [41]:
# Concatenating transformed numerical and categorical columns (one-hot encoded and ordinal)
X_test_transformed = pd.concat([X_test_num_norm, X_test_cat_ohe, X_test_cat_ord], axis=1)

# Displaying the transformed test dataset
X_test_transformed.head()

Unnamed: 0,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,state_California,state_Nevada,state_Oregon,state_Washington,employmentstatus_Employed,...,sales_channel_Branch,sales_channel_Call Center,sales_channel_Web,vehicle_class_Sports,vehicle_class_Standard,coverage,policy_level,vehicle_size,effective_to_day_of_week,effective_to_date_month
0,0.077583,0.268111,0.008439,0.914286,0.191919,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0,1,1,6,0
1,0.076398,0.0,0.130802,0.114286,0.606061,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,1,0,1,0,0
2,0.082584,0.270771,0.135021,0.685714,0.272727,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,1,2,2,2,1
3,0.03751,0.345226,0.219409,0.342857,0.707071,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,2,1,1,4,0
4,0.161385,0.222382,0.417722,0.0,0.929293,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,2,2,1,1,1


In [27]:
# Regular check of NAs and shape of DF after every concatenation
display(X_test_transformed.isna().sum())
display(X_test_transformed.shape)

customer_lifetime_value           0
income                            0
monthly_premium_auto              0
months_since_last_claim           0
months_since_policy_inception     0
state_California                  0
state_Nevada                      0
state_Oregon                      0
state_Washington                  0
employmentstatus_Employed         0
employmentstatus_Medical Leave    0
employmentstatus_Retired          0
employmentstatus_Unemployed       0
location_code_Suburban            0
location_code_Urban               0
marital_status_Married            0
marital_status_Single             0
policy_type_Personal Auto         0
policy_type_Special Auto          0
renew_offer_type_2                0
renew_offer_type_3                0
renew_offer_type_4                0
sales_channel_Branch              0
sales_channel_Call Center         0
sales_channel_Web                 0
vehicle_class_Sports              0
vehicle_class_Standard            0
coverage                    

(1826, 32)

In [42]:
# Importing necessary libraries
from sklearn import linear_model
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score

import warnings
warnings.filterwarnings('ignore')

# Initializing and fitting the linear regression model
lm = linear_model.LinearRegression()
lm.fit(X_train_transformed, y_train)

# Predicting target variable for test data
predictions_test = lm.predict(X_test_transformed)

# Calculating r-squared score
r2 = round(r2_score(y_test, predictions_test), 2)

# Printing the r-squared score
print("r2_score: ", r2)


r2_score:  0.77


In [44]:
def model_check(model_list,X_train,y_train,X_test,y_test):
    # Loop through the model_list
    for model in model_list:
        # If the model is linear regression
        if model == 'lm':
            # Initialize the model and fit to training data
            lm = linear_model.LinearRegression()
            lm.fit(X_train, y_train)
            # Print results
            print("Linear Regression:")
            print("r2_score (train): ", round(r2_score(y_train, lm.predict(X_train)), 2))    
            print("r2_score (test): ", round(r2_score(y_test, lm.predict(X_test)), 2))
        # If the model is K Nearest Neighbors
        elif model == 'knn':
            # Initialize the model and fit to training data
            knn = KNeighborsRegressor()
            knn.fit(X_train, y_train)
            # Print results
            print("K Nearest Neighbors:")
            print("r2_score (train): ", round(r2_score(y_train, knn.predict(X_train)), 2))    
            print("r2_score (test): ", round(r2_score(y_test, knn.predict(X_test)), 2))
        # If the model is Multi-layer Perceptron
        elif model == 'mlp':
            # Initialize the model and fit to training data
            mlp = MLPRegressor(random_state=1, max_iter=500)
            mlp.fit(X_train, y_train)
            # Print results
            print("Multi-layer Perceptron:")
            print("r2_score (train): ", round(r2_score(y_train, mlp.predict(X_train)), 2))    
            print("r2_score (test): ", round(r2_score(y_test, mlp.predict(X_test)), 2))


In [45]:
model_list=['lm','knn','mlp']
model_check(model_list,X_train_transformed,y_train,X_test_transformed,y_test)

Linear Regression:
r2_score (train):  0.77
r2_score (test):  0.77
K Nearest Neighbors:
r2_score (train):  0.72
r2_score (test):  0.59
Multi-layer Perceptron:
r2_score (train):  0.83
r2_score (test):  0.82
