In [41]:
import sys
sys.path.append('../')
import pandas as pd
import numpy as np
from tabulate import tabulate
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from src.data_quality_checks import check_missing_data
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import shap

# Data Preparation

Load and prepare the data

In [39]:
df = pd.read_csv('../data/MachineLearningRating_v3.txt', sep='|', low_memory=False)

# df = pd.read_csv('../data/cleaned_data.csv', header=None,low_memory=False)

In [40]:
# selecting relevant columns
relevant_cols = ['TotalPremium', 'TotalClaims', 'Gender', 'PostalCode', 'Province', 'VehicleType', 'RegistrationYear', 'NewVehicle']

df_relevant_cols = df[relevant_cols]

In [42]:
# checking for columns with missing values
missing_data = check_missing_data(df_relevant_cols)
print(missing_data)

   Column Name  Missing Values  Percentage Missing
2       Gender            9536            0.953507
5  VehicleType             552            0.055195
7   NewVehicle          153295           15.327998


In [43]:
# data cleaning
df_relevant_cols.loc[:, 'VehicleType'] = df_relevant_cols['VehicleType'].fillna(df_relevant_cols['VehicleType'].mode()[0])
df_relevant_cols.loc[:, 'NewVehicle'] = df_relevant_cols['NewVehicle'].fillna(df_relevant_cols['NewVehicle'].mode()[0])
df_relevant_cols.loc[:, 'Gender'] = df_relevant_cols['Gender'].fillna(df_relevant_cols['Gender'].mode()[0])

In [44]:
# checking for columns with missing values
missing_data = check_missing_data(df_relevant_cols)
print(missing_data)

Success: No missing values.


In [45]:
df_relevant_cols.head(10)

Unnamed: 0,TotalPremium,TotalClaims,Gender,PostalCode,Province,VehicleType,RegistrationYear,NewVehicle
0,21.929825,0.0,Not specified,1459,Gauteng,Passenger Vehicle,2004,More than 6 months
1,21.929825,0.0,Not specified,1459,Gauteng,Passenger Vehicle,2004,More than 6 months
2,0.0,0.0,Not specified,1459,Gauteng,Passenger Vehicle,2004,More than 6 months
3,512.84807,0.0,Not specified,1459,Gauteng,Passenger Vehicle,2004,More than 6 months
4,0.0,0.0,Not specified,1459,Gauteng,Passenger Vehicle,2004,More than 6 months
5,3.256435,0.0,Not specified,1459,Gauteng,Passenger Vehicle,2004,More than 6 months
6,50.474737,0.0,Not specified,1459,Gauteng,Passenger Vehicle,2004,More than 6 months
7,35.332316,0.0,Not specified,1459,Gauteng,Passenger Vehicle,2004,More than 6 months
8,0.0,0.0,Not specified,1459,Gauteng,Passenger Vehicle,2004,More than 6 months
9,1.009474,0.0,Not specified,1459,Gauteng,Passenger Vehicle,2004,More than 6 months


Encoding categorical data

In [46]:

categorical_features = ['Province', 'PostalCode', 'Gender', 'VehicleType', 'RegistrationYear', 'NewVehicle']
encoder = LabelEncoder()

for feature in categorical_features:
    df_relevant_cols[feature] = encoder.fit_transform(df_relevant_cols[feature])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_relevant_cols[feature] = encoder.fit_transform(df_relevant_cols[feature])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_relevant_cols[feature] = encoder.fit_transform(df_relevant_cols[feature])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_relevant_cols[feature] = encoder.fit_transform(d

In [47]:
df_relevant_cols.head(10)

Unnamed: 0,TotalPremium,TotalClaims,Gender,PostalCode,Province,VehicleType,RegistrationYear,NewVehicle
0,21.929825,0.0,2,261,2,4,13,1
1,21.929825,0.0,2,261,2,4,13,1
2,0.0,0.0,2,261,2,4,13,1
3,512.84807,0.0,2,261,2,4,13,1
4,0.0,0.0,2,261,2,4,13,1
5,3.256435,0.0,2,261,2,4,13,1
6,50.474737,0.0,2,261,2,4,13,1
7,35.332316,0.0,2,261,2,4,13,1
8,0.0,0.0,2,261,2,4,13,1
9,1.009474,0.0,2,261,2,4,13,1


Split data into features and target variable

In [48]:

X = df_relevant_cols.drop(['TotalClaims', 'TotalPremium'], axis=1)
y = df_relevant_cols['TotalClaims']  # we are predicting TotalClaims
