### Exploring The Data

In [1]:
# Import necessary libraries

import pandas as pd

In [2]:
# Read the given csv file

df = pd.read_csv('./data/train.csv')

### Missing values are either marked as " " or '?' but for ease of handling fill NaN with "?"

In [3]:
 df = df.fillna('?')

### 1. Data Pre processing

#### a. Method to deal with missing data

In [4]:
# First let's see how much missing data each column has

for i in df.columns:
    print(i + ' : ' + str(df[i].isin(['?']).sum().astype(int)))

Row_ID : 0
Household_ID : 0
Vehicle : 0
Calendar_Year : 0
Model_Year : 0
Blind_Make : 15
Blind_Model : 15
Blind_Submodel : 15
Cat1 : 50
Cat2 : 10591
Cat3 : 11
Cat4 : 12985
Cat5 : 12998
Cat6 : 50
Cat7 : 16480
Cat8 : 2
Cat9 : 0
Cat10 : 10
Cat11 : 58
Cat12 : 52
OrdCat : 19
Var1 : 0
Var2 : 0
Var3 : 0
Var4 : 0
Var5 : 0
Var6 : 0
Var7 : 0
Var8 : 0
NVCat : 0
NVVar1 : 0
NVVar2 : 0
NVVar3 : 0
NVVar4 : 0
Claim_Amount : 0


##### We have many methods to handle with missing data like Case Deletion, Mean Substitution, Regression Imputation ** etc.

** https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3668100/

##### Choosing the categorical variables

In [5]:
# Here I have choosen Cat 1, Cat 3, Cat 6, Cat 8, Cat 9, Cat 10, Cat11, Cat12
# As we can see from the code above that they have the least number of missing data 

df = df.drop(columns = ['Cat2', 'Cat4', 'Cat5', 'Cat7'])
df.columns

Index(['Row_ID', 'Household_ID', 'Vehicle', 'Calendar_Year', 'Model_Year',
       'Blind_Make', 'Blind_Model', 'Blind_Submodel', 'Cat1', 'Cat3', 'Cat6',
       'Cat8', 'Cat9', 'Cat10', 'Cat11', 'Cat12', 'OrdCat', 'Var1', 'Var2',
       'Var3', 'Var4', 'Var5', 'Var6', 'Var7', 'Var8', 'NVCat', 'NVVar1',
       'NVVar2', 'NVVar3', 'NVVar4', 'Claim_Amount'],
      dtype='object')

##### Now that I have dropped features with so much missing data we can see below the total rows with missing data in any of the features constitutes of only 0.3 % of the complete case

In [6]:
for i in df.columns:
    print(i + ' : ' + str(df[i].isin(['?']).sum().astype(int)))

Row_ID : 0
Household_ID : 0
Vehicle : 0
Calendar_Year : 0
Model_Year : 0
Blind_Make : 15
Blind_Model : 15
Blind_Submodel : 15
Cat1 : 50
Cat3 : 11
Cat6 : 50
Cat8 : 2
Cat9 : 0
Cat10 : 10
Cat11 : 58
Cat12 : 52
OrdCat : 19
Var1 : 0
Var2 : 0
Var3 : 0
Var4 : 0
Var5 : 0
Var6 : 0
Var7 : 0
Var8 : 0
NVCat : 0
NVVar1 : 0
NVVar2 : 0
NVVar3 : 0
NVVar4 : 0
Claim_Amount : 0


##### So I have chosen Listwise or Case Deletion in this particular case.

In [7]:
df = df[df['Cat1'] != '?']
df = df[df['Cat3'] != '?']
df = df[df['Cat6'] != '?']
df = df[df['Cat8'] != '?']
df = df[df['Cat9'] != '?']
df = df[df['Cat10'] != '?']
df = df[df['Cat11'] != '?']
df = df[df['Cat12'] != '?']
df = df[df['Blind_Make'] != '?']
df = df[df['Blind_Model'] != '?']
df = df[df['Blind_Submodel'] != '?']
df = df[df['OrdCat'] != '?']

In [8]:
for i in df.columns:
    print(i + ' : ' + str(df[i].isin(['?']).sum().astype(int)))

Row_ID : 0
Household_ID : 0
Vehicle : 0
Calendar_Year : 0
Model_Year : 0
Blind_Make : 0
Blind_Model : 0
Blind_Submodel : 0
Cat1 : 0
Cat3 : 0
Cat6 : 0
Cat8 : 0
Cat9 : 0
Cat10 : 0
Cat11 : 0
Cat12 : 0
OrdCat : 0
Var1 : 0
Var2 : 0
Var3 : 0
Var4 : 0
Var5 : 0
Var6 : 0
Var7 : 0
Var8 : 0
NVCat : 0
NVVar1 : 0
NVVar2 : 0
NVVar3 : 0
NVVar4 : 0
Claim_Amount : 0


In [9]:
# update df to new values without missing values

df.to_csv('./data/train-copy.csv', index=False)
df = pd.read_csv('./data/train-copy.csv')

In [10]:
df = df.astype({'Row_ID': 'float64', 'Household_ID': 'float64', 'Vehicle': 'float64', 'Calendar_Year': 'float64', 'Model_Year': 'float64', 'OrdCat': 'float64',})

#### b. Convert categorical values to a suitable representation

In [11]:
import numpy as np

##### Here I have used one-hot Encoding

In [12]:
attributes_cat = ['Cat1', 'Cat3', 'Cat6', 'Cat8', 'Cat9', 'Cat10', 'Cat11', 'Cat12', 'OrdCat', 'NVCat']
attributes_num = ['Household_ID', 'Vehicle', 'Calendar_Year', 'Model_Year', 'Var1', \
                  'Var2', 'Var3', 'Var4', 'Var5', 'Var6', 'Var7', 'Var8',
                 'NVVar1', 'NVVar2', 'NVVar3', 'NVVar4']  

In [13]:
# Apply column transformation as per data type of the columns

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer


full_transform = ColumnTransformer([
    ("num", StandardScaler(), attributes_num),
    ("cat", OneHotEncoder(handle_unknown = "ignore"), attributes_cat),
])


#### c. Balancing the Data set

In [14]:
df_zeroes = df[df['Claim_Amount'] == 0]
df_non_zeroes = df[df['Claim_Amount'] != 0]

nz = df_non_zeroes['Claim_Amount'].count()

z = df_zeroes['Claim_Amount'].count()

if nz != z:
    if nz > z:
        df_non_zeroes = df_non_zeroes.sample(n = z)
    elif nz < z:
        df_zeroes = df_zeroes.sample(n = nz)
else:
    print('Data set is balanced')

In [15]:
df_zeroes.shape

(8958, 31)

In [16]:
df_non_zeroes.shape

(8958, 31)

In [17]:
# Merge balanced dataframes to create a data set

claimed_amount_data = pd.concat([df_non_zeroes,df_zeroes], ignore_index=True)

claimed_amount_data = claimed_amount_data.drop('Row_ID', axis=1)

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
# Split data set into training data and testing data

ca_train_set, ca_test_set = train_test_split(claimed_amount_data, test_size=0.2, random_state=42)

In [20]:
# Prepare attributes and labels for the data

ca_train_set_attributes = ca_train_set.drop('Claim_Amount', axis=1)
ca_train_set_labels = ca_train_set['Claim_Amount']

ca_train_set_attributes_prepared = full_transform.fit_transform(ca_train_set_attributes)

### 2. Performance using a single model
#### a. Linear Regression

In [21]:
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(ca_train_set_attributes_prepared, ca_train_set_labels)

LinearRegression()

In [22]:
# Split and prepare the data set further into training and validation sets

ca_train2_set, ca_val_set = train_test_split(ca_train_set, test_size=0.1, random_state=42)
ca_train2_set_attributes = ca_train2_set.drop('Claim_Amount', axis=1)
ca_train2_set_labels = ca_train2_set['Claim_Amount']
ca_val_set_attributes = ca_val_set.drop('Claim_Amount', axis=1)
ca_val_set_labels = ca_val_set['Claim_Amount']

ca_train2_set_all_attributes = full_transform.fit_transform(ca_train2_set_attributes)

In [23]:
# Train the Linear Regression model upon the training data

lin_reg = LinearRegression()
lin_reg.fit(ca_train2_set_all_attributes, ca_train2_set_labels)

LinearRegression()

In [24]:
ca_val_set_all_attributes = full_transform.transform(ca_val_set_attributes)

In [25]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [26]:
# Evaluate the performance of the model by calculating performance metrics

ca_val_set_predictions = lin_reg.predict(ca_val_set_all_attributes)
error = np.sqrt(mean_squared_error(ca_val_set_labels, ca_val_set_predictions))
error

326.9383410126069

##### Grid Search to fine tune model

In [27]:
grid_values = {'copy_X': [True, False], 'fit_intercept': [True, False], 'normalize': [True, False]}

In [28]:
grid_lin_reg = GridSearchCV(lin_reg, param_grid = grid_values,scoring = 'neg_root_mean_squared_error')

In [29]:
# Train the tuned model on the same data set as before

grid_lin_reg.fit(ca_train2_set_all_attributes, ca_train2_set_labels)

GridSearchCV(estimator=LinearRegression(),
             param_grid={'copy_X': [True, False],
                         'fit_intercept': [True, False],
                         'normalize': [True, False]},
             scoring='neg_root_mean_squared_error')

In [30]:
# Evaluate the performance of the tuned model by calculating performance metrics

ca_val_set_predictions = grid_lin_reg.predict(ca_val_set_all_attributes)

error = np.sqrt(mean_squared_error(ca_val_set_labels, ca_val_set_predictions))
error

326.9600882041836

RMSE has slightly improved

Reference: https://github.com/maalvarezl/MLAI-Labs

#### b. Ridge Regression

In [31]:
from sklearn.linear_model import Ridge

In [32]:
rr = Ridge()

In [33]:
# Train the Ridge regression model on the training data

rr.fit(ca_train2_set_all_attributes, ca_train2_set_labels)

Ridge()

In [34]:
pred_train_rr= rr.predict(ca_train2_set_all_attributes)

Reference: https://www.pluralsight.com/guides/linear-lasso-ridge-regression-scikit-learn

In [35]:
# Evaluate the performance of the tuned model by calculating performance metrics

error = np.sqrt(mean_squared_error(ca_train2_set_labels, pred_train_rr))
error

304.4978484304715

##### Grid Search to fine tune model

In [36]:
rr.get_params().keys()

dict_keys(['alpha', 'copy_X', 'fit_intercept', 'max_iter', 'normalize', 'random_state', 'solver', 'tol'])

In [37]:
param_grid_rr = {'alpha': [0.01,0.1,1], 'copy_X': [True, False], 'fit_intercept': [True, False],
                 'normalize': [True, False], 'random_state': [0,42], 
                 'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']}
grid_rr = GridSearchCV(Ridge(), param_grid=param_grid_rr, scoring='neg_root_mean_squared_error')

In [38]:
grid_rr.fit(ca_train2_set_all_attributes, ca_train2_set_labels)

GridSearchCV(estimator=Ridge(),
             param_grid={'alpha': [0.01, 0.1, 1], 'copy_X': [True, False],
                         'fit_intercept': [True, False],
                         'normalize': [True, False], 'random_state': [0, 42],
                         'solver': ['auto', 'svd', 'cholesky', 'lsqr',
                                    'sparse_cg', 'sag', 'saga']},
             scoring='neg_root_mean_squared_error')

In [39]:
rr_mod = Ridge(alpha=grid_rr.best_params_['alpha'],
              copy_X=grid_rr.best_params_['copy_X'],
              fit_intercept=grid_rr.best_params_['fit_intercept'],
              normalize=grid_rr.best_params_['normalize'],
              random_state=grid_rr.best_params_['random_state'],
              solver=grid_rr.best_params_['solver'])

In [40]:
# Train the tuned model on the same dataset

rr_mod.fit(ca_train2_set_all_attributes, ca_train2_set_labels)

Ridge(alpha=1, normalize=True, random_state=0, solver='lsqr')

In [41]:
pred_train_rr_mod = rr_mod.predict(ca_train2_set_all_attributes)

In [42]:
## Computing the RMSE for the validation dataset
error_mod_rr = np.sqrt(mean_squared_error(ca_train2_set_labels, pred_train_rr_mod))
print('The RMSE on the validation data is :',error_mod_rr)

The RMSE on the validation data is : 304.77584434780584


#### c. Random Forest

In [43]:
from sklearn.ensemble import RandomForestRegressor

In [44]:
rfr = RandomForestRegressor()

In [45]:
rfr.get_params().keys()

dict_keys(['bootstrap', 'ccp_alpha', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'max_samples', 'min_impurity_decrease', 'min_impurity_split', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start'])

In [46]:
ca_train2_set_attributes_Question2c = full_transform.fit_transform(ca_train2_set_attributes)

In [47]:
ca_val_set_attributes_Question2c = full_transform.transform(ca_val_set_attributes)

In [48]:
whole_train_set_attributes_Question2c = np.vstack((ca_train2_set_attributes_Question2c , ca_val_set_attributes_Question2c))
whole_train_set_labels_Question2c = np.hstack((ca_train2_set_labels, ca_val_set_labels))

##### Grid Search to fine tune model

In [49]:
# Train the tuned Random Forest regressor on the training data

n_estimators_Question2c = [20, 50, 100, 200]
max_samples_Question2c = [500, 1000, 2000, 3000]
param_grid_Question2c = dict(n_estimators = n_estimators_Question2c, max_samples = max_samples_Question2c)
grid_regression_Question2c = GridSearchCV(RandomForestRegressor(), param_grid=param_grid_Question2c, scoring='neg_mean_squared_error')
grid_regression_Question2c.fit(whole_train_set_attributes_Question2c, whole_train_set_labels_Question2c)

GridSearchCV(estimator=RandomForestRegressor(),
             param_grid={'max_samples': [500, 1000, 2000, 3000],
                         'n_estimators': [20, 50, 100, 200]},
             scoring='neg_mean_squared_error')

In [50]:
regr_Question2c = RandomForestRegressor(n_estimators=grid_regression_Question2c.best_params_["n_estimators"],max_samples=grid_regression_Question2c.best_params_["max_samples"])
regr_Question2c.fit(ca_train2_set_attributes_Question2c, ca_train2_set_labels)
ca_val_set_predictions_Question2c = regr_Question2c.predict(ca_val_set_attributes_Question2c)


## Computing the RMSE for the validation dataset
error_mod_Question2c = np.sqrt(mean_squared_error(ca_val_set_labels, ca_val_set_predictions_Question2c))
print('The RMSE on the validation data is :',error_mod_Question2c)

The RMSE on the validation data is : 331.50269850701153


#### d. Gradient Boosting Regressor

In [51]:
from sklearn.ensemble import GradientBoostingRegressor

In [52]:
gbr = GradientBoostingRegressor()

In [53]:
gbr.get_params().keys()

dict_keys(['alpha', 'ccp_alpha', 'criterion', 'init', 'learning_rate', 'loss', 'max_depth', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'min_impurity_split', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 'n_iter_no_change', 'presort', 'random_state', 'subsample', 'tol', 'validation_fraction', 'verbose', 'warm_start'])

In [54]:
## Train the Gradient Boosting Regressor on the train set
ca_train2_set_attributes_Question2d = full_transform.fit_transform(ca_train2_set_attributes)
## transform in the validation set
ca_val_set_attributes_Question2d = full_transform.transform(ca_val_set_attributes)

## Concatenating the attributes and the labels
whole_train_set_attributes_Question2d = np.vstack((ca_train2_set_attributes_Question2d , ca_val_set_attributes_Question2d))
whole_train_set_labels_Question2d = np.hstack((ca_train2_set_labels, ca_val_set_labels))

## Applying the Gradient Boosting for regression and exploring different maximum depth options
# For simplicity, we set those values for the parameters n_estimators.
n_estimators_Question2d = [20, 50, 100, 200]
param_grid_Question2d = dict(n_estimators = n_estimators_Question2d)
grid_regression_Question2d = GridSearchCV(GradientBoostingRegressor(), param_grid=param_grid_Question2d, scoring='neg_mean_squared_error')
grid_regression_Question2d.fit(whole_train_set_attributes_Question2d, whole_train_set_labels_Question2d)

## Training a Gradient Boosting using the best value for the n_estimators
regr_Question2d = GradientBoostingRegressor(n_estimators=grid_regression_Question2d.best_params_["n_estimators"])
regr_Question2d.fit(ca_train2_set_attributes_Question2d, ca_train2_set_labels)
ca_val_set_predictions_Question2d = regr_Question2d.predict(ca_val_set_attributes_Question2d)

## Computing the RMSE for the validation dataset
error_mod_Question2d = np.sqrt(mean_squared_error(ca_val_set_labels, ca_val_set_predictions_Question2d))
print('The RMSE on the validation data is :',error_mod_Question2d)

The RMSE on the validation data is : 326.9612230549615


### 3. Performance using a combination of 2 models

#### a. Binary Classifier

In [55]:
claimed_amount_data.shape

(17916, 30)

In [56]:
# preparing data for binary classifier

claimed_amount_data_class = claimed_amount_data.copy()

claimed_amount_data_class['Claim_Amount'] = (claimed_amount_data_class['Claim_Amount'] != 0).astype('float64')

In [57]:
# Split the data into training and testing datasets

ca_train_set_clf, ca_test_set_clf = train_test_split(claimed_amount_data_class, test_size=0.15, random_state=42)

In [58]:
# Set the attributes and labels of the training dataset

ca_train_set_clf_attributes = ca_train_set_clf.drop('Claim_Amount', axis=1)
ca_train_set_clf_labels = ca_train_set_clf['Claim_Amount']

ca_train_set_clf_attributes_prepared = full_transform.fit_transform(ca_train_set_clf_attributes)

In [59]:
# Set the attributes and labels of the testing dataset

ca_test_set_clf_attributes = ca_test_set_clf.drop('Claim_Amount', axis=1)
ca_test_set_clf_labels = ca_test_set_clf['Claim_Amount']

ca_test_set_clf_attributes_prepared = full_transform.transform(ca_test_set_clf_attributes)

In [60]:
# Split the training data further into training and validation sets and prepare the data

ca_train2_set_clf, ca_val_set_clf = train_test_split(ca_train_set_clf, test_size=0.15, random_state=42)
ca_train2_set_clf_attributes = ca_train2_set_clf.drop('Claim_Amount', axis=1)
ca_train2_set_clf_labels = ca_train2_set_clf['Claim_Amount']
ca_val_set_clf_attributes = ca_val_set_clf.drop('Claim_Amount', axis=1)
ca_val_set_clf_labels = ca_val_set_clf['Claim_Amount']

ca_train2_set_clf_all_attributes = full_transform.fit_transform(ca_train2_set_clf_attributes)

ca_val_set_clf_all_attributes = full_transform.transform(ca_val_set_clf_attributes)

##### Random Forest Classifier

In [61]:
from sklearn.ensemble import RandomForestClassifier

In [62]:
# Train the Random Forest Classifier on the training data

clf = RandomForestClassifier()
clf.fit(ca_train2_set_clf_all_attributes, ca_train2_set_clf_labels)

RandomForestClassifier()

In [63]:
from sklearn.metrics import log_loss, accuracy_score, f1_score

In [64]:
# Evaluate the performance of the model

ca_val_set_clf_predictions = clf.predict(ca_val_set_clf_all_attributes)
error = log_loss(ca_val_set_clf_labels, ca_val_set_clf_predictions)
acc = accuracy_score(ca_val_set_clf_labels, ca_val_set_clf_predictions)
f1 = f1_score(ca_val_set_clf_labels, ca_val_set_clf_predictions)
print(error)
print(acc)
print(f1)

15.946975614900749
0.5382932166301969
0.5283862315601252


##### Gradient Boosting Classifier

In [65]:
from sklearn.ensemble import GradientBoostingClassifier

In [66]:
# Train the Gradient Boosting Classifier on the data set

clf = GradientBoostingClassifier()
clf.fit(ca_train2_set_clf_all_attributes, ca_train2_set_clf_labels)

GradientBoostingClassifier()

In [67]:
# Evaluate the performance of the classifier

ca_val_set_clf_predictions = clf.predict(ca_val_set_clf_all_attributes)
error = log_loss(ca_val_set_clf_labels, ca_val_set_clf_predictions)
acc = accuracy_score(ca_val_set_clf_labels, ca_val_set_clf_predictions)
f1 = f1_score(ca_val_set_clf_labels, ca_val_set_clf_predictions)
print(error)
print(acc)
print(f1)

15.372575664043442
0.5549234135667396
0.5375170532060027


Gradient Boosting classifier gave better results

#### b. Regressor

In [68]:
# Prepare the dataset for Regressor where the predicted 'Claim_Amount' is non zero 

claimed_amount_all_attributes = claimed_amount_data.drop('Claim_Amount', axis = 1)

claimed_amount_data_attributes_transformed = full_transform.fit_transform(claimed_amount_all_attributes)

claimed_amount_all_labels = claimed_amount_data['Claim_Amount']

In [69]:
claimed_amount_data_nz = claimed_amount_data_attributes_transformed[claimed_amount_data['Claim_Amount'] != 0]
claimed_amount_labels_nz = claimed_amount_all_labels[claimed_amount_data['Claim_Amount'] != 0]

In [70]:
# Split the dataset into training and test data

ca_train_set_regr, ca_test_set_regr, ca_train_set_regr_labels, ca_test_set_regr_labels = train_test_split(claimed_amount_data_nz, claimed_amount_labels_nz, test_size=0.15, random_state=42)

# Split the training data further into training and validation data

ca_train2_set_regr, ca_val_set_regr, ca_train2_set_regr_labels, ca_val_set_regr_labels = train_test_split(ca_train_set_regr, ca_train_set_regr_labels, test_size=0.15, random_state=42)

In [71]:
ca_test_set_regr.shape

(1344, 79)

##### Linear Regression

In [72]:
# Train the linear regressor model on training data

from sklearn.linear_model import LinearRegression
regr = LinearRegression()
regr.fit(ca_train2_set_regr, ca_train2_set_regr_labels)

LinearRegression()

In [73]:
# Evaluate the performance on validation data

ca_val_set_regr_predictions = regr.predict(ca_val_set_regr)
error = np.sqrt(mean_squared_error(ca_val_set_regr_labels, ca_val_set_regr_predictions))
error

394.06645063665206

##### Ridge Regression

In [74]:
# Train the Ridge regressor model on training data

from sklearn.linear_model import Ridge
regr = Ridge()
regr.fit(ca_train2_set_regr, ca_train2_set_regr_labels)

Ridge()

In [75]:
ca_val_set_regr_predictions.shape

(1143,)

In [76]:
# Evaluate the performance on validation data

ca_val_set_regr_predictions = regr.predict(ca_val_set_regr)
error = np.sqrt(mean_squared_error(ca_val_set_regr_labels, ca_val_set_regr_predictions))
error

393.96644525789264

##### Random Forest for Regression

In [77]:
# Train the linear regressor model on training data

from sklearn.ensemble import RandomForestRegressor
regr = RandomForestRegressor()
regr.fit(ca_train2_set_regr, ca_train2_set_regr_labels)

RandomForestRegressor()

In [78]:
# Evaluate the performance on validation data

ca_val_set_regr_predictions = regr.predict(ca_val_set_regr)
error = np.sqrt(mean_squared_error(ca_val_set_regr_labels, ca_val_set_regr_predictions))
error

411.34022978175216

##### Gradient tree boosting for regression

In [79]:
# Train the linear regressor model on training data

from sklearn.ensemble import GradientBoostingRegressor
regr = GradientBoostingRegressor()
regr.fit(ca_train2_set_regr, ca_train2_set_regr_labels)

GradientBoostingRegressor()

In [80]:
# Evaluate the performance on validation data

ca_val_set_regr_predictions = regr.predict(ca_val_set_regr)
error = np.sqrt(mean_squared_error(ca_val_set_regr_labels, ca_val_set_regr_predictions))
error

398.0699115184063

Ridge Regression gave slighty better results

Reference: https://github.com/maalvarezl/MLAI-Labs

#### c. Tandem Model

In [81]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import Ridge

In [82]:
# Create a prediction model based on two separate models in tandem 

clf = GradientBoostingClassifier()
clf.fit(ca_train2_set_clf_all_attributes, ca_train2_set_clf_labels)

regr = Ridge()
regr.fit(ca_train2_set_regr, ca_train2_set_regr_labels)

pred_list = list()

for data in ca_test_set_regr:
    prediction = clf.predict(data.reshape(1,-1))
    if prediction==1:
        tandem_prediction = regr.predict(data.reshape(1,-1))
    else:
        tandem_prediction = prediction
    pred_list.append(tandem_prediction)

In [83]:
len(pred_list)

1344

### 4. Performance of tandem model over test set

In [84]:
ca_val_set_regr.shape

(1143, 79)

In [85]:
ca_test_set_predictions = regr.predict(ca_test_set_regr)

In [86]:
ca_test_set_predictions.shape

(1344,)

In [87]:
## Computing the RMSE for the test dataset
error_rr = np.sqrt(mean_squared_error(ca_test_set_regr_labels, ca_test_set_predictions))
print('The RMSE on the validation data is :',error_rr)

The RMSE on the validation data is : 433.61125762424405


##### RMSE of tandem model

In [88]:
error = np.sqrt(mean_squared_error(ca_test_set_regr_labels, pred_list))
error

462.9773798100883

In [89]:
ca_train_set_attributes_prepared = full_transform.fit_transform(ca_train_set_attributes)

### 5. Observations

1. Ridge regerssor performed the best in regression. Gradient Boost Classifier is the best in classifiers. But the RMSE of tandem model is higher than that of a single model. It is because of the cumulated error of both the classifier and the regressor.


2. Out of the 30000 initial records 21000 had claim amount 0. Even after balancing 50% of them were non zero random numbers but 50% values are straight away 0. The error might be more due to models biasing more towards 0.

### 6. My Insurance Claim Predictor Function

In [92]:
from sklearn.linear_model import Ridge
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split


def my_insurance_claim_predictor(Xtest=''):
    train_df = pd.read_csv('./data/train.csv')
    print(train_df.columns)
    train_df = train_df.fillna('?')
    train_df = train_df.drop(columns = ['Cat2', 'Cat4', 'Cat5', 'Cat7'])
    train_df = train_df[train_df['Cat1'] != '?']
    train_df = train_df[train_df['Cat3'] != '?']
    train_df = train_df[train_df['Cat6'] != '?']
    train_df = train_df[train_df['Cat8'] != '?']
    train_df = train_df[train_df['Cat9'] != '?']
    train_df = train_df[train_df['Cat10'] != '?']
    train_df = train_df[train_df['Cat11'] != '?']
    train_df = train_df[train_df['Cat12'] != '?']
    train_df = train_df[train_df['Blind_Make'] != '?']
    train_df = train_df[train_df['Blind_Model'] != '?']
    train_df = train_df[train_df['Blind_Submodel'] != '?']
    train_df = train_df[train_df['OrdCat'] != '?']
    train_df = train_df.astype({'Household_ID': 'float64', 'Vehicle': 'float64','Calendar_Year': 'float64',
                                'Model_Year': 'float64', 'OrdCat': 'float64',})
    df_zeroes = train_df[train_df['Claim_Amount'] == 0]
    df_non_zeroes = train_df[train_df['Claim_Amount'] != 0]

    nz = df_non_zeroes['Claim_Amount'].count()

    z = df_zeroes['Claim_Amount'].count()

    if nz != z:
        if nz > z:
            df_non_zeroes = df_non_zeroes.sample(n = z)
        elif nz < z:
            df_zeroes = df_zeroes.sample(n = nz)
    else:
        print('Data set is balanced')
        
    claimed_amount_data = pd.concat([df_non_zeroes,df_zeroes], ignore_index=True)
    claimed_amount_data = claimed_amount_data.drop('Row_ID', axis=1)
    ca_train_set, ca_test_set = train_test_split(claimed_amount_data, test_size=0.2, random_state=42)
    ca_train_set_attributes = ca_train_set.drop('Claim_Amount', axis=1)
    
    ca_train_set_labels = ca_train_set['Claim_Amount']
    
    attributes_cat = ['Cat1', 'Cat3', 'Cat6', 'Cat8', 'Cat9', 'Cat10', 'Cat11', 'Cat12', 'OrdCat', 'NVCat']
    attributes_num = ['Household_ID', 'Vehicle', 'Calendar_Year', 'Model_Year', 'Var1',
                      'Var2', 'Var3', 'Var4', 'Var5', 'Var6', 'Var7', 'Var8',
                      'NVVar1', 'NVVar2', 'NVVar3', 'NVVar4']
    full_transform = ColumnTransformer([
    ("num", StandardScaler(), attributes_num),
    ("cat", OneHotEncoder(handle_unknown = "ignore"), attributes_cat),
    ])

    ca_train_set_attributes_prepared = full_transform.fit_transform(ca_train_set_attributes)
    rr = Ridge()
    rr.fit(ca_train_set_attributes_prepared, ca_train_set_labels)
        
    df = pd.read_csv(Xtest)
    print(df.columns)
    df = df.fillna('?')
    if 'Unnamed: 34' in df.columns:
        df = df.drop('Unnamed: 34', axis=1)
    df = df.drop(columns = ['Row_ID', 'Cat2', 'Cat4', 'Cat5', 'Cat7'])
    df = df[df['Cat1'] != '?']
    df = df[df['Cat3'] != '?']
    df = df[df['Cat6'] != '?']
    df = df[df['Cat8'] != '?']
    df = df[df['Cat9'] != '?']
    df = df[df['Cat10'] != '?']
    df = df[df['Cat11'] != '?']
    df = df[df['Cat12'] != '?']
    df = df[df['Blind_Make'] != '?']
    df = df[df['Blind_Model'] != '?']
    df = df[df['Blind_Submodel'] != '?']
    df = df[df['OrdCat'] != '?']
    df = df.astype({'Household_ID': 'float64', 'Vehicle': 'float64','Calendar_Year': 'float64', 
                    'Model_Year': 'float64', 'OrdCat': 'float64',})
    
    test_set_attributes = full_transform.transform(df)
    predictions = rr.predict(test_set_attributes)
    return predictions
    

In [93]:
pred_test = my_insurance_claim_predictor(Xtest='./data/test.csv')

Index(['Row_ID', 'Household_ID', 'Vehicle', 'Calendar_Year', 'Model_Year',
       'Blind_Make', 'Blind_Model', 'Blind_Submodel', 'Cat1', 'Cat2', 'Cat3',
       'Cat4', 'Cat5', 'Cat6', 'Cat7', 'Cat8', 'Cat9', 'Cat10', 'Cat11',
       'Cat12', 'OrdCat', 'Var1', 'Var2', 'Var3', 'Var4', 'Var5', 'Var6',
       'Var7', 'Var8', 'NVCat', 'NVVar1', 'NVVar2', 'NVVar3', 'NVVar4',
       'Claim_Amount'],
      dtype='object')
Index(['Row_ID', 'Household_ID', 'Vehicle', 'Calendar_Year', 'Model_Year',
       'Blind_Make', 'Blind_Model', 'Blind_Submodel', 'Cat1', 'Cat2', 'Cat3',
       'Cat4', 'Cat5', 'Cat6', 'Cat7', 'Cat8', 'Cat9', 'Cat10', 'Cat11',
       'Cat12', 'OrdCat', 'Var1', 'Var2', 'Var3', 'Var4', 'Var5', 'Var6',
       'Var7', 'Var8', 'NVCat', 'NVVar1', 'NVVar2', 'NVVar3', 'NVVar4',
       'Unnamed: 34'],
      dtype='object')


  res_values = method(rvalues)


In [None]:
pred_test.shape