Hyperparameter tuning to be carried out on best performing models.

Based on experimentation already carried out, Decision Tree and Random Forest using only Agency & WorkflowStatus on oversampled dataset at training test split 80/20 will be used.

Decision Tree and Random Forest using only Agency, WorkflowStatus & UWDecision on oversampled dataset at training test split 80/20 will also be used.

Decision Tree and Random Forest using only Agency, WorkflowStatus, UWDecision & CommDateProvided on oversampled dataset at training test split 80/20 will also be used.

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import GridSearchCV

import lightgbm as lgb

from imblearn.over_sampling import SMOTE

from keras.models import Sequential
from keras.layers import Dense

import warnings
from sklearn.exceptions import ConvergenceWarning

# Ignore ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [2]:
df = pd.read_excel('Dataset.xlsx')

## Clean Data

In [3]:
# Get the list of columns to convert to categorical
categorical_columns = df.select_dtypes(include='int64').columns.tolist()

# Convert the selected columns to categorical
df[categorical_columns] = df[categorical_columns].astype('category')

numeric_data = df.select_dtypes(include=[np.number])

# Calculate MAD for each column
mad = numeric_data.mad()

# Choose a threshold multiplier
k = 3

# Calculate the threshold value
threshold = k * mad

# Identify outliers
outliers = (np.abs(numeric_data - numeric_data.median()) > threshold)

# Apply logarithm to the specified columns
outlier_columns = ['CommissionSacrificePercentage', 'BonusCommissionPercentage']
for column in outlier_columns:
    df[column] = np.log1p(df[column])
    
df = df.drop(columns=['PropDate'])


In [None]:
# Select specific columns for each iteration
feature_sets = [
    ['Agency', 'WorkflowStatus'],
    ['Agency', 'WorkflowStatus', 'UWDecision'],
    ['Agency', 'WorkflowStatus', 'UWDecision', 'CommDateProvided']
]

# List of scoring metrics
scoring_metrics = ['roc_auc', 'accuracy']

# List of k values for k-fold cross-validation
num_folds_list = [5, 10, 20]

### Decision Tree

In [5]:
# Iterate through feature sets
for i, features in enumerate(feature_sets, start=1):
    print(f"Iteration {i}: Using features {', '.join(features)}")
    
    # Extract the selected features and target variable
    X = df[features]
    y = df["PolicyIssued"]

    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Scaling features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Applying SMOTE to the training data
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)
    
    for scoring_metric in scoring_metrics:
        for num_folds in num_folds_list:
            print(f"Scoring Metric: {scoring_metric}, Number of Folds: {num_folds}")
            
            # Define the parameter grid to search
            param_grid = {
                'max_depth': [None, 10, 20, 30],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4],
                'max_features': ['auto', 'sqrt', 'log2'],
                'criterion': ['gini', 'entropy']
            }

            # Initialize GridSearchCV
            grid_search = GridSearchCV(estimator=DecisionTreeClassifier(random_state=42),
                                       param_grid=param_grid,
                                       scoring=scoring_metric,
                                       cv=num_folds,
                                       n_jobs=-1)

            # Fit the grid search to your data
            grid_search.fit(X_resampled, y_resampled)

            # Print the best hyperparameters and corresponding score
            print("Best Hyperparameters:", grid_search.best_params_)
            print(f"Best {scoring_metric} Score:", grid_search.best_score_)

            # Retrieve the best model with the best hyperparameters
            best_model = grid_search.best_estimator_

            # Train the best model on the entire resampled training data
            best_model.fit(X_resampled, y_resampled)

            # Predict on the test data
            y_pred_test = best_model.predict(X_test_scaled)

            # Calculate Training and Test scores
            train_score = grid_search.best_score_  # Use the best_score_ attribute from GridSearchCV
            test_score = grid_search.score(X_test_scaled, y_test)  # Score on the test set

            # Calculate Training AUC and Test AUC
            train_auc = roc_auc_score(y_resampled, best_model.predict(X_resampled))
            test_auc = roc_auc_score(y_test, y_pred_test)

            # Calculate Training Accuracy and Test Accuracy
            train_accuracy = accuracy_score(y_resampled, best_model.predict(X_resampled))
            test_accuracy = accuracy_score(y_test, y_pred_test)

            # Print evaluation metrics for training and test data
            print("Training AUC:", train_auc)
            print("Test AUC:", test_auc)
            print("Training Accuracy:", train_accuracy)
            print("Test Accuracy:", test_accuracy)
            print("---------------------------------------------")


Iteration 1: Using features Agency, WorkflowStatus
Scoring Metric: roc_auc, Number of Folds: 5
Best Hyperparameters: {'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 5}
Best roc_auc Score: 0.8237580600642339
Training AUC: 0.7726353628398113
Test AUC: 0.7495739140658757
Training Accuracy: 0.7726353628398113
Test Accuracy: 0.725463389819731
---------------------------------------------
Scoring Metric: roc_auc, Number of Folds: 10
Best Hyperparameters: {'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 5}
Best roc_auc Score: 0.8247645099816043
Training AUC: 0.7726353628398113
Test AUC: 0.7495739140658757
Training Accuracy: 0.7726353628398113
Test Accuracy: 0.725463389819731
---------------------------------------------
Scoring Metric: roc_auc, Number of Folds: 20
Best Hyperparameters: {'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 1, '

### Random Forest

In [8]:
# Iterate through feature sets
for i, features in enumerate(feature_sets, start=1):
    print(f"Iteration {i}: Using features {', '.join(features)}")
    
    # Extract the selected features and target variable
    X = df[features]
    y = df["PolicyIssued"]

    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Scaling features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Applying SMOTE to the training data
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)
    
    for scoring_metric in scoring_metrics:
        for num_folds in num_folds_list:
            print(f"Scoring Metric: {scoring_metric}, Number of Folds: {num_folds}")
            
            # Define the parameter grid to search
            param_grid = {
                'n_estimators': [25, 50, 100],  # Number of trees in the forest
                'bootstrap': [True, False]
            }

            # Initialise GridSearchCV
            grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                                       param_grid=param_grid,
                                       scoring=scoring_metric,
                                       cv=num_folds,
                                       n_jobs=-1)

            # Fit the grid search to your data
            grid_search.fit(X_resampled, y_resampled)

            # Print the best hyperparameters and corresponding score
            print("Best Hyperparameters:", grid_search.best_params_)
            print(f"Best {scoring_metric} Score:", grid_search.best_score_)

            # Retrieve the best model with the best hyperparameters
            best_model = grid_search.best_estimator_

            # Train the best model on the entire resampled training data
            best_model.fit(X_resampled, y_resampled)

            # Predict on the test data
            y_pred_test = best_model.predict(X_test_scaled)

            # Calculate Training and Test scores
            train_score = grid_search.best_score_  # Use the best_score_ attribute from GridSearchCV
            test_score = grid_search.score(X_test_scaled, y_test)  # Score on the test set

            # Calculate Training AUC and Test AUC
            train_auc = roc_auc_score(y_resampled, best_model.predict(X_resampled))
            test_auc = roc_auc_score(y_test, y_pred_test)

            # Calculate Training Accuracy and Test Accuracy
            train_accuracy = accuracy_score(y_resampled, best_model.predict(X_resampled))
            test_accuracy = accuracy_score(y_test, y_pred_test)

            # Print evaluation metrics for training and test data
            print("Training AUC:", train_auc)
            print("Test AUC:", test_auc)
            print("Training Accuracy:", train_accuracy)
            print("Test Accuracy:", test_accuracy)
            print("---------------------------------------------")


Iteration 1: Using features Agency, WorkflowStatus
Scoring Metric: roc_auc, Number of Folds: 5
Best Hyperparameters: {'bootstrap': True, 'n_estimators': 100}
Best roc_auc Score: 0.8242890391853226
Training AUC: 0.7730540634382468
Test AUC: 0.7486750558343944
Training Accuracy: 0.7730540634382468
Test Accuracy: 0.7276571392236034
---------------------------------------------
Scoring Metric: roc_auc, Number of Folds: 10
Best Hyperparameters: {'bootstrap': True, 'n_estimators': 100}
Best roc_auc Score: 0.8253115048148748
Training AUC: 0.7730540634382468
Test AUC: 0.7486750558343944
Training Accuracy: 0.7730540634382468
Test Accuracy: 0.7276571392236034
---------------------------------------------
Scoring Metric: roc_auc, Number of Folds: 20
Best Hyperparameters: {'bootstrap': True, 'n_estimators': 100}
Best roc_auc Score: 0.8257919517074598
Training AUC: 0.7730540634382468
Test AUC: 0.7486750558343944
Training Accuracy: 0.7730540634382468
Test Accuracy: 0.7276571392236034
---------------

In [10]:
# Iterate through feature sets
for i, features in enumerate(feature_sets, start=1):
    print(f"Iteration {i}: Using features {', '.join(features)}")
    
    # Extract the selected features and target variable
    X = df[features]
    y = df["PolicyIssued"]

    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Scaling features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Applying SMOTE to the training data
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)
    
    for scoring_metric in scoring_metrics:
        for num_folds in num_folds_list:
            print(f"Scoring Metric: {scoring_metric}, Number of Folds: {num_folds}")
            
            # Define the parameter grid to search
            param_grid = {
                'n_estimators': [100],  # Number of trees in the forest
                'max_depth': [None],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1],
                'max_features': ['auto'],
                'bootstrap': [True],
                'criterion': ['gini', 'entropy']
            }

            # Initialise GridSearchCV
            grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                                       param_grid=param_grid,
                                       scoring=scoring_metric,
                                       cv=num_folds,
                                       n_jobs=-1)

            # Fit the grid search to your data
            grid_search.fit(X_resampled, y_resampled)

            # Print the best hyperparameters and corresponding score
            print("Best Hyperparameters:", grid_search.best_params_)
            print(f"Best {scoring_metric} Score:", grid_search.best_score_)

            # Retrieve the best model with the best hyperparameters
            best_model = grid_search.best_estimator_

            # Train the best model on the entire resampled training data
            best_model.fit(X_resampled, y_resampled)

            # Predict on the test data
            y_pred_test = best_model.predict(X_test_scaled)

            # Calculate Training and Test scores
            train_score = grid_search.best_score_  # Use the best_score_ attribute from GridSearchCV
            test_score = grid_search.score(X_test_scaled, y_test)  # Score on the test set

            # Calculate Training AUC and Test AUC
            train_auc = roc_auc_score(y_resampled, best_model.predict(X_resampled))
            test_auc = roc_auc_score(y_test, y_pred_test)

            # Calculate Training Accuracy and Test Accuracy
            train_accuracy = accuracy_score(y_resampled, best_model.predict(X_resampled))
            test_accuracy = accuracy_score(y_test, y_pred_test)

            # Print evaluation metrics for training and test data
            print("Training AUC:", train_auc)
            print("Test AUC:", test_auc)
            print("Training Accuracy:", train_accuracy)
            print("Test Accuracy:", test_accuracy)
            print("---------------------------------------------")


Iteration 1: Using features Agency, WorkflowStatus
Scoring Metric: roc_auc, Number of Folds: 5
Best Hyperparameters: {'bootstrap': True, 'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Best roc_auc Score: 0.82448628233637
Training AUC: 0.7726659994689651
Test AUC: 0.7487372535418771
Training Accuracy: 0.7726659994689651
Test Accuracy: 0.7274345849362541
---------------------------------------------
Scoring Metric: roc_auc, Number of Folds: 10
Best Hyperparameters: {'bootstrap': True, 'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Best roc_auc Score: 0.8254298841664733
Training AUC: 0.7726659994689651
Test AUC: 0.7487372535418771
Training Accuracy: 0.7726659994689651
Test Accuracy: 0.7274345849362541
---------------------------------------------
Scoring Metric: roc_auc, Number of Folds: 20
Best Hyperparameters: {'bootstr