# Classification Model: Predict Whether a Customer Will File a Claim

## Processing Data

In [43]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

df = pd.read_csv('car_insurance_claim.csv')

# Exclude non-ethical features
features_to_exclude = ["GENDER", "MSTATUS", "PARENT1", "EDUCATION"]
df.drop(columns=features_to_exclude, inplace=True)

# Missing values
for col in df.columns:
    if df[col].dtype in ['float64', 'int64']:
        df[col].fillna(df[col].median(), inplace=True)
    else:
        df[col].fillna(df[col].mode()[0], inplace=True)

# Separate target variable and features
X = df.drop('CLAIM_FLAG', axis=1)
y = df['CLAIM_FLAG']

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


## Scaling and One-hot Encoding Data

In [44]:
# One hot encode categorical variables.
# This will automatically convert categorical variables (typically object type) into dummy/indicator variables.
X = pd.get_dummies(X, drop_first=True)

# Scale numeric features.
# Identify numeric columns in X (after encoding, some may remain numeric)
num_cols = X.select_dtypes(include=['float64', 'int64']).columns
scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])

## Logistic Regression

In [45]:
# Split the data into training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the logistic regression model.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

## Prediction and Results

In [46]:
# Make predictions and evaluate the model.
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.7763221737020863
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.93      0.86      1506
           1       0.66      0.35      0.46       555

    accuracy                           0.78      2061
   macro avg       0.73      0.64      0.66      2061
weighted avg       0.76      0.78      0.75      2061



## Comparison to not Excluding features

In [48]:
data = pd.read_csv('car_insurance_claim.csv')

# Misisng and N/A
for column in data.columns:
    if data[column].dtype in ['float64', 'int64']:
        data[column].fillna(data[column].median(), inplace=True)
    else:
        data[column].fillna(data[column].mode()[0], inplace=True)

# Separa target variable and features
features_all = data.drop('CLAIM_FLAG', axis=1)
target_var = data['CLAIM_FLAG']

# One-hot encode
features_encoded = pd.get_dummies(features_all, drop_first=True)

# Scaling
numeric_features = features_encoded.select_dtypes(include=['float64', 'int64']).columns
scaler_std = StandardScaler()
features_encoded[numeric_features] = scaler_std.fit_transform(features_encoded[numeric_features])

# Train and Test
X_train, X_test, y_train, y_test = train_test_split(features_encoded, target_var, test_size=0.2, random_state=42)

# logistic regression model
logit_model = LogisticRegression(max_iter=1000)
logit_model.fit(X_train, y_train)

# Evaluation
preds = logit_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, preds))
print("Classification Report:\n", classification_report(y_test, preds))



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[column].fillna(data[column].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[column].fillna(data[column].mode()[0], inplace=True)


Accuracy: 0.784570596797671
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.94      0.86      1506
           1       0.68      0.37      0.48       555

    accuracy                           0.78      2061
   macro avg       0.74      0.66      0.67      2061
weighted avg       0.77      0.78      0.76      2061



# Regression: How Much That Claim Will Cost

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score

df = pd.read_csv("car_insurance_claim.csv")

df = df.drop(columns=['GENDER', 'MSTATUS', 'PARENT1', 'EDUCATION'])

currency_cols = ['INCOME', 'HOME_VAL', 'BLUEBOOK', 'OLDCLAIM', 'CLM_AMT']
for col in currency_cols:
    df[col] = df[col].replace('[\$,]', '', regex=True)
    df[col] = pd.to_numeric(df[col], errors='coerce')


df_clean = df.dropna()

upper_bound = df_clean['CLM_AMT'].quantile(0.99)
df_clean = df_clean[df_clean['CLM_AMT'] <= upper_bound]

categorical_cols = df_clean.select_dtypes(include=['object']).columns.tolist()
df_encoded = pd.get_dummies(df_clean, columns=categorical_cols, drop_first=True)


X = df_encoded.drop(columns=['CLM_AMT'])
y = df_encoded['CLM_AMT']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.7, 0.8, 1.0]
}

xgb = XGBRegressor(objective='reg:squarederror', random_state=42)
grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    cv=3,
    scoring='r2',
    verbose=1,
    n_jobs=-1
)

grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_


y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("XGBoost Regression Model Performance")
print("Best Parameters:", grid_search.best_params_)
print(f"MSE: {mse:,.2f}")
print(f"RMSE: {rmse:,.2f}")
print(f"R²: {r2:.2f}")


Fitting 3 folds for each of 81 candidates, totalling 243 fits
XGBoost Regression Model Performance
Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 300, 'subsample': 1.0}
MSE: 1,674,781.75
RMSE: 1,294.13
R²: 0.66
