In [2]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
import pydotplus
from IPython.display import Image
import graphviz
from sklearn.tree import export_graphviz


In [3]:
# Example: Load data from a CSV file
data = pd.read_csv('insurance_data.csv')

# Features and target variable
X = data[['subscription_length', 'customer_age', 'vehicle_age', 'region_density','region_code']]  # Features

# Split customer_age into banded ranges
agebins = [0, 18, 34, 50, 65, 100]
agelabels = ['0-18', '18-34', '34-50', '50-65', '65-100']
X['age_group'] = pd.cut(X['customer_age'], bins=agebins, labels=agelabels, right=False)
X = X.drop('customer_age', axis=1)  # Features
X = pd.get_dummies(X, columns=['age_group'])

# Split subscription_length into banded ranges
subbins = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 20]
sublabels = ['0-1', '1-2', '2-3', '3-4', '4-5', '5-6', '6-7', '7-8', '8-9', '9-10', '10-11', '11-12', '12-13', '13-14', '14-20']
X['subscription_length_group'] = pd.cut(X['subscription_length'], bins=subbins, labels=sublabels, right=False)
X = X.drop('subscription_length', axis=1)  # Features
X = pd.get_dummies(X, columns=['subscription_length_group'])

# Split region_density into banded ranges
regbins = [0, 10000, 20000, 30000, 40000, 50000, 60000, 70000, 80000]
reglabels = ['0-10K', '10-20K', '20-30K', '30-40K', '40-50K', '50-60K', '60-70K', '70-80K']
X['region_density_group'] = pd.cut(X['region_density'], bins=subbins, labels=sublabels, right=False)
X = X.drop('region_density', axis=1)  # Features
X = pd.get_dummies(X, columns=['region_density_group'])

# Split vehicle_age into banded ranges
X['vehicle_age_group'] = pd.cut(X['vehicle_age'], bins=subbins, labels=sublabels, right=False)
X = pd.get_dummies(X, columns=['vehicle_age_group'])
X = X.drop('vehicle_age', axis=1)  # Features

y = data['claim_status']  # Target variable
X = pd.get_dummies(X, columns=['region_code'])
X

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['age_group'] = pd.cut(X['customer_age'], bins=agebins, labels=agelabels, right=False)


Unnamed: 0,age_group_0-18,age_group_18-34,age_group_34-50,age_group_50-65,age_group_65-100,subscription_length_group_0-1,subscription_length_group_1-2,subscription_length_group_2-3,subscription_length_group_3-4,subscription_length_group_4-5,...,region_code_C20,region_code_C21,region_code_C22,region_code_C3,region_code_C4,region_code_C5,region_code_C6,region_code_C7,region_code_C8,region_code_C9
0,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
1,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
3,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58587,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
58588,False,False,True,False,False,False,False,True,False,False,...,False,False,False,True,False,False,False,False,False,False
58589,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
58590,False,False,True,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,True,False


In [4]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [5]:
# Define the parameter grid
param_grid = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 10, 20],
    'min_samples_leaf': [1, 5, 10],
    'max_features': [None, 'sqrt', 'log2'],
    'criterion': ['gini', 'entropy', 'log_loss'],
    
}

In [6]:
grid_search = GridSearchCV(estimator=DecisionTreeClassifier(random_state=42), param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)

In [7]:
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 324 candidates, totalling 1620 fits


In [8]:
best_params = grid_search.best_params_
print("Best hyperparameters:", best_params)

Best hyperparameters: {'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 10, 'min_samples_split': 2}


In [9]:
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, zero_division=1))

[[16434     0]
 [ 1144     0]]
              precision    recall  f1-score   support

           0       0.93      1.00      0.97     16434
           1       1.00      0.00      0.00      1144

    accuracy                           0.93     17578
   macro avg       0.97      0.50      0.48     17578
weighted avg       0.94      0.93      0.90     17578



In [10]:
# Feature importance
feature_importance = best_model.feature_importances_

# Create a DataFrame to store feature names and their corresponding importances
features_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importance})

# Sort the DataFrame by feature importance in descending order
features_df = features_df.sort_values(by='Importance', ascending=False)

# Display the top 10 most important features
features_df.head(20)



Unnamed: 0,Feature,Importance
16,subscription_length_group_11-12,0.089228
5,subscription_length_group_0-1,0.078053
2,age_group_34-50,0.072818
36,vehicle_age_group_1-2,0.054169
35,vehicle_age_group_0-1,0.050215
37,vehicle_age_group_2-3,0.048432
15,subscription_length_group_10-11,0.04419
38,vehicle_age_group_3-4,0.034544
8,subscription_length_group_3-4,0.029892
6,subscription_length_group_1-2,0.029203


In [11]:


# Extract feature names
#feature_names = X.feature_names
feature_names = X.columns.tolist()

# Access one of the trees in the forest
tree = best_model

# Export the best model to a dot file
export_graphviz(best_model, out_file='best_tree.dot', feature_names=feature_names, filled=True)

# Render the dot file to a PNG image
with open("best_tree.dot") as f:
    dot_graph = f.read()
graphviz.Source(dot_graph).render("best_tree")

'best_tree.pdf'