# Case Study: Modeling Car Insurance Claims

### Tasks
- Identify the single feature of the data that is the best predictor of whether a customer will put in a claim (the `outcome` column), excluding the `id` column.
- Store as a DataFrame called `best_feature_df`, containing columns named `best_feature` and `best_accuracy` with the name of the feature with the highest accuracy, and the respective accuracy score.

In [27]:
import pandas as pd
import numpy as np 
from statsmodels.formula.api import logit
import matplotlib.pyplot as plt
import seaborn as sns

In [28]:
# Getting an idea of the contents of car_insurance
car_insurance = pd.read_csv('../Data/car_insurance.csv')
car_insurance.head()

Unnamed: 0,id,age,gender,driving_experience,education,income,credit_score,vehicle_ownership,vehicle_year,married,children,postal_code,annual_mileage,vehicle_type,speeding_violations,duis,past_accidents,outcome
0,569520,3,0,0-9y,high school,upper class,0.629027,1.0,after 2015,0.0,1.0,10238,12000.0,sedan,0,0,0,0.0
1,750365,0,1,0-9y,none,poverty,0.357757,0.0,before 2015,0.0,0.0,10238,16000.0,sedan,0,0,0,1.0
2,199901,0,0,0-9y,high school,working class,0.493146,1.0,before 2015,0.0,0.0,10238,11000.0,sedan,0,0,0,0.0
3,478866,0,1,0-9y,university,working class,0.206013,1.0,before 2015,0.0,1.0,32765,11000.0,sedan,0,0,0,0.0
4,731664,1,1,10-19y,none,working class,0.388366,1.0,before 2015,0.0,0.0,32765,12000.0,sedan,2,0,1,1.0


In [None]:
# Check column data types and see if any of them are null
# since null values can affect model fit
car_insurance.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   10000 non-null  int64  
 1   age                  10000 non-null  int64  
 2   gender               10000 non-null  int64  
 3   driving_experience   10000 non-null  object 
 4   education            10000 non-null  object 
 5   income               10000 non-null  object 
 6   credit_score         9018 non-null   float64
 7   vehicle_ownership    10000 non-null  float64
 8   vehicle_year         10000 non-null  object 
 9   married              10000 non-null  float64
 10  children             10000 non-null  float64
 11  postal_code          10000 non-null  int64  
 12  annual_mileage       9043 non-null   float64
 13  vehicle_type         10000 non-null  object 
 14  speeding_violations  10000 non-null  int64  
 15  duis                 10000 non-null  

In [29]:
# Imputation for 'annual_mileage' and 'credit_score' using median values
car_insurance['annual_mileage'] = car_insurance['annual_mileage'].fillna(
    car_insurance['annual_mileage'].median()
)
car_insurance['credit_score'] = car_insurance['credit_score'].fillna(
    car_insurance['credit_score'].median()
)

In [30]:
# Creating a list of predictors
# 'id' column removed since it is not a predictor
# and 'outcome' is the literal outcome of the data
predictors_list = car_insurance.columns.tolist()
predictors_list.remove('id')
predictors_list.remove('outcome')
predictors_list

['age',
 'gender',
 'driving_experience',
 'education',
 'income',
 'credit_score',
 'vehicle_ownership',
 'vehicle_year',
 'married',
 'children',
 'postal_code',
 'annual_mileage',
 'vehicle_type',
 'speeding_violations',
 'duis',
 'past_accidents']

In [31]:
len(predictors_list)

16

In [None]:
# Creating a dictionary of models for each predictor variable
# This is easier than typing out all 16 of them
models_dict = {}

for pred in predictors_list:
    models_dict[f'{pred}_mdl'] = logit(f'outcome ~ {pred}', data=car_insurance).fit()


Optimization terminated successfully.
         Current function value: 0.511794
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.615951
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.467092
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.603742
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.531499
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.572649
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.552412
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.572668
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.586659
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.595431
  

In [None]:
# Confusion Matrix Printing 

# Get the actual outcome values first
actual_outcome = car_insurance['outcome']

# List comprehension that returns a list of series/arrays
# representing the predicted values of each model
mdl_pred_list = [np.round(models_dict.get(mdl).predict()) for mdl in list(models_dict.keys())]

# Printing the confusion matrices for each model
for var, mdl in zip(predictors_list, mdl_pred_list):
    print(f'Confusion Matrix for {var}:\n')

    outcome_df = pd.DataFrame({
        'outcome_actual':actual_outcome,
        'outcome_predict': mdl
    })

    print(outcome_df.value_counts(sort=False))
    print('\n\n')

Confusion Matrix for age:

outcome_actual  outcome_predict
0.0             0.0                6299
                1.0                 568
1.0             0.0                1685
                1.0                1448
Name: count, dtype: int64



Confusion Matrix for gender:

outcome_actual  outcome_predict
0.0             0.0                6867
1.0             0.0                3133
Name: count, dtype: int64



Confusion Matrix for driving_experience:

outcome_actual  outcome_predict
0.0             0.0                5554
                1.0                1313
1.0             0.0                 916
                1.0                2217
Name: count, dtype: int64



Confusion Matrix for education:

outcome_actual  outcome_predict
0.0             0.0                6867
1.0             0.0                3133
Name: count, dtype: int64



Confusion Matrix for income:

outcome_actual  outcome_predict
0.0             0.0                6239
                1.0                 628
1.

In [34]:
len(mdl_pred_list)

16

In [None]:
mdl_accuracies_dict = {}

# For-loop to iterate over the model dictionary 
# and calculate each one's accuracy 
for mdl_name, mdl in models_dict.items():

    conf_matrix = mdl.pred_table()
    tn = conf_matrix[0, 0]
    fp = conf_matrix[0, 1]
    fn = conf_matrix[1, 0]
    tp = conf_matrix[1, 1]

    accuracy = ( (tn + tp) / (tn + fp + fn + tp) )
    print(f'{mdl_name} accuracy : {accuracy}')

    # Accuracies are also appeneded to a dictionary for easier access
    mdl_accuracies_dict[mdl_name] = accuracy


age_mdl accuracy : 0.7747
gender_mdl accuracy : 0.6867
driving_experience_mdl accuracy : 0.7771
education_mdl accuracy : 0.6867
income_mdl accuracy : 0.7425
credit_score_mdl accuracy : 0.7053
vehicle_ownership_mdl accuracy : 0.7351
vehicle_year_mdl accuracy : 0.6867
married_mdl accuracy : 0.6867
children_mdl accuracy : 0.6867
postal_code_mdl accuracy : 0.6867
annual_mileage_mdl accuracy : 0.6904
vehicle_type_mdl accuracy : 0.6867
speeding_violations_mdl accuracy : 0.6867
duis_mdl accuracy : 0.6867
past_accidents_mdl accuracy : 0.6867


In [None]:
# Get the feature/predictor variable with the highest accuracy value
best_feature = max(mdl_accuracies_dict, key=mdl_accuracies_dict.get)
best_feature_accuracy = mdl_accuracies_dict.get(best_feature)

print(best_feature)
print(best_feature_accuracy)

driving_experience_mdl
0.7771


In [None]:
# Stores the feature and accuracy in a dataFrame as per answer format
best_feature_df = pd.DataFrame({
    'best_feature':['best_feature'],
    'best_accuracy':[best_feature_accuracy]
})

best_feature_df

Unnamed: 0,best_feature,best_accuracy
0,best_feature,0.7771
