In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier

import lightgbm as lgb

from imblearn.over_sampling import SMOTE

from keras.models import Sequential
from keras.layers import Dense

import warnings
from sklearn.exceptions import ConvergenceWarning

# Ignore ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [2]:
df = pd.read_excel('Dataset.xlsx')

## Clean Data

In [3]:
# Get the list of columns to convert to categorical
categorical_columns = df.select_dtypes(include='int64').columns.tolist()

# Convert the selected columns to categorical
df[categorical_columns] = df[categorical_columns].astype('category')

numeric_data = df.select_dtypes(include=[np.number])

# Calculate MAD for each column
mad = numeric_data.mad()

# Choose a threshold multiplier
k = 3

# Calculate the threshold value
threshold = k * mad

# Identify outliers
outliers = (np.abs(numeric_data - numeric_data.median()) > threshold)

# Apply logarithm to the specified columns
outlier_columns = ['CommissionSacrificePercentage', 'BonusCommissionPercentage']
for column in outlier_columns:
    df[column] = np.log1p(df[column])
    
df = df.drop(columns=['PropDate'])


# Top-n Features

## Logistic Regression

#### 80/20 Split

In [10]:
# Select specific columns for each iteration
feature_sets = [
    ['Agency', 'WorkflowStatus'],
    ['Agency', 'WorkflowStatus', 'UWDecision'],
    ['Agency', 'WorkflowStatus', 'UWDecision', 'CommDateProvided']
]

# Iterate through feature sets
for i, features in enumerate(feature_sets, start=1):
    print(f"Iteration {i}: Using features {', '.join(features)}")
    
    # Extract the selected features and target variable
    X = df[features]
    y = df["PolicyIssued"]

    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Scaling features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Applying SMOTE to the training data
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

    # Convert the resampled arrays back to a dataframe
    resampled_df = pd.DataFrame(X_resampled, columns=features)
    resampled_df["PolicyIssued"] = y_resampled

    # Train the model on the entire training data
    model = LogisticRegression()
    model.fit(X_resampled, y_resampled)

    # Predict on the test data
    y_pred_test = model.predict(X_test_scaled)

    # Calculate Training AUC and Test AUC
    train_auc = roc_auc_score(y_resampled, model.predict(X_resampled))
    test_auc = roc_auc_score(y_test, y_pred_test)

    # Calculate Training Accuracy and Test Accuracy
    train_accuracy = accuracy_score(y_resampled, model.predict(X_resampled))
    test_accuracy = accuracy_score(y_test, y_pred_test)

    # Print evaluation metrics for training and test data
    print("Training AUC:", train_auc)
    print("Test AUC:", test_auc)
    print("Training Accuracy:", train_accuracy)
    print("Test Accuracy:", test_accuracy)
    print("---------------------------------------------")

Iteration 1: Using features Agency, WorkflowStatus
Training AUC: 0.7440360695247237
Test AUC: 0.7476467049756738
Training Accuracy: 0.7440360695247238
Test Accuracy: 0.675674816392713
---------------------------------------------
Iteration 2: Using features Agency, WorkflowStatus, UWDecision
Training AUC: 0.7441075549927494
Test AUC: 0.7473698238190174
Training Accuracy: 0.7441075549927493
Test Accuracy: 0.6756430229230916
---------------------------------------------
Iteration 3: Using features Agency, WorkflowStatus, UWDecision, CommDateProvided
Training AUC: 0.7435509895631217
Test AUC: 0.7468982203651653
Training Accuracy: 0.7435509895631217
Test Accuracy: 0.6755476425142276
---------------------------------------------


#### Cross Validation

In [8]:

# List of k values for k-fold cross-validation
num_folds_list = [5, 10, 20]

# Iterate through feature sets
for i, features in enumerate(feature_sets, start=1):
    print(f"Iteration {i}: Using features {', '.join(features)}")
    
    # Extract the selected features and target variable
    X = df[features]
    y = df["PolicyIssued"]

    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Scaling features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Applying SMOTE to the training data
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

    # Convert the resampled arrays back to a dataframe
    resampled_df = pd.DataFrame(X_resampled, columns=features)
    resampled_df["PolicyIssued"] = y_resampled

    # Iterate through different numbers of folds
    for num_folds in num_folds_list:
        print(f"Number of Folds: {num_folds}")
        
        # Initialise k-fold cross-validation
        kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

        # Lists to store evaluation metrics for each fold
        fold_auc_scores = []
        fold_accuracy_scores = []

        # Perform k-fold cross-validation
        for train_index, val_index in kf.split(X_resampled):
            X_fold_train, y_fold_train = X_resampled[train_index], y_resampled[train_index]
            X_fold_val, y_fold_val = X_resampled[val_index], y_resampled[val_index]

            # Train the model on the fold training data
            model = LogisticRegression()
            model.fit(X_fold_train, y_fold_train)

            # Predict on the fold validation data
            y_fold_val_pred = model.predict(X_fold_val)

            # Calculate AUC and accuracy for the fold
            fold_auc = roc_auc_score(y_fold_val, y_fold_val_pred)
            fold_accuracy = accuracy_score(y_fold_val, y_fold_val_pred)

            fold_auc_scores.append(fold_auc)
            fold_accuracy_scores.append(fold_accuracy)

        # Calculate average AUC and accuracy for the current number of folds
        avg_auc = sum(fold_auc_scores) / num_folds
        avg_accuracy = sum(fold_accuracy_scores) / num_folds

        # Print average metrics for cross-validation
        print("Average AUC:", avg_auc)
        print("Average Accuracy:", avg_accuracy)
        print("---------------------------------------------")
        
        # Train the model on the entire training data
        model.fit(X_resampled, y_resampled)

        # Predict on the test data
        y_pred = model.predict(X_test_scaled)

        # Calculate Test AUC and Test Accuracy
        test_auc = roc_auc_score(y_test, y_pred)
        test_accuracy = accuracy_score(y_test, y_pred)

        # Print evaluation metrics for the test data
        print("Test AUC:", test_auc)
        print("Test Accuracy:", test_accuracy)
        print("---------------------------------------------")

Iteration 1: Using features Agency, WorkflowStatus
Number of Folds: 5
Average AUC: 0.7440473444484998
Average Accuracy: 0.744036074295407
---------------------------------------------
Test AUC: 0.7476467049756738
Test Accuracy: 0.675674816392713
---------------------------------------------
Number of Folds: 10
Average AUC: 0.7440431490419122
Average Accuracy: 0.7440360744225357
---------------------------------------------
Test AUC: 0.7476467049756738
Test Accuracy: 0.675674816392713
---------------------------------------------
Number of Folds: 20
Average AUC: 0.7440439397237586
Average Accuracy: 0.7440361328144551
---------------------------------------------
Test AUC: 0.7476467049756738
Test Accuracy: 0.675674816392713
---------------------------------------------
Iteration 2: Using features Agency, WorkflowStatus, UWDecision
Number of Folds: 5
Average AUC: 0.7441189985384835
Average Accuracy: 0.7441075600502398
---------------------------------------------
Test AUC: 0.7473698238190

#### 70/30 Split

In [12]:
# Iterate through feature sets
for i, features in enumerate(feature_sets, start=1):
    print(f"Iteration {i}: Using features {', '.join(features)}")
    
    # Extract the selected features and target variable
    X = df[features]
    y = df["PolicyIssued"]

    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

    # Scaling features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Applying SMOTE to the training data
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

    # Convert the resampled arrays back to a dataframe
    resampled_df = pd.DataFrame(X_resampled, columns=features)
    resampled_df["PolicyIssued"] = y_resampled

    # Train the model on the entire training data
    model = LogisticRegression()
    model.fit(X_resampled, y_resampled)

    # Predict on the test data
    y_pred_test = model.predict(X_test_scaled)

    # Calculate Training AUC and Test AUC
    train_auc = roc_auc_score(y_resampled, model.predict(X_resampled))
    test_auc = roc_auc_score(y_test, y_pred_test)

    # Calculate Training Accuracy and Test Accuracy
    train_accuracy = accuracy_score(y_resampled, model.predict(X_resampled))
    test_accuracy = accuracy_score(y_test, y_pred_test)

    # Print evaluation metrics for training and test data
    print("Training AUC:", train_auc)
    print("Test AUC:", test_auc)
    print("Training Accuracy:", train_accuracy)
    print("Test Accuracy:", test_accuracy)
    print("---------------------------------------------")

Iteration 1: Using features Agency, WorkflowStatus
Training AUC: 0.745384094675661
Test AUC: 0.7467092470272587
Training Accuracy: 0.7453840946756611
Test Accuracy: 0.6749613175353442
---------------------------------------------
Iteration 2: Using features Agency, WorkflowStatus, UWDecision
Training AUC: 0.7454016012698117
Test AUC: 0.746476850042306
Training Accuracy: 0.7454016012698116
Test Accuracy: 0.6749189257932555
---------------------------------------------
Iteration 3: Using features Agency, WorkflowStatus, UWDecision, CommDateProvided
Training AUC: 0.7448472257883803
Test AUC: 0.7460120560724005
Training Accuracy: 0.7448472257883803
Test Accuracy: 0.6748341423090782
---------------------------------------------


#### Cross Validation

In [13]:
# List of k values for k-fold cross-validation
num_folds_list = [5, 10, 20]

# Iterate through feature sets
for i, features in enumerate(feature_sets, start=1):
    print(f"Iteration {i}: Using features {', '.join(features)}")
    
    # Extract the selected features and target variable
    X = df[features]
    y = df["PolicyIssued"]

    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

    # Scaling features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Applying SMOTE to the training data
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

    # Convert the resampled arrays back to a dataframe
    resampled_df = pd.DataFrame(X_resampled, columns=features)
    resampled_df["PolicyIssued"] = y_resampled

    # Iterate through different numbers of folds
    for num_folds in num_folds_list:
        print(f"Number of Folds: {num_folds}")
        
        # Initialise k-fold cross-validation
        kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

        # Lists to store evaluation metrics for each fold
        fold_auc_scores = []
        fold_accuracy_scores = []

        # Perform k-fold cross-validation
        for train_index, val_index in kf.split(X_resampled):
            X_fold_train, y_fold_train = X_resampled[train_index], y_resampled[train_index]
            X_fold_val, y_fold_val = X_resampled[val_index], y_resampled[val_index]

            # Train the model on the fold training data
            model = LogisticRegression()
            model.fit(X_fold_train, y_fold_train)

            # Predict on the fold validation data
            y_fold_val_pred = model.predict(X_fold_val)

            # Calculate AUC and accuracy for the fold
            fold_auc = roc_auc_score(y_fold_val, y_fold_val_pred)
            fold_accuracy = accuracy_score(y_fold_val, y_fold_val_pred)

            fold_auc_scores.append(fold_auc)
            fold_accuracy_scores.append(fold_accuracy)

        # Calculate average AUC and accuracy for the current number of folds
        avg_auc = sum(fold_auc_scores) / num_folds
        avg_accuracy = sum(fold_accuracy_scores) / num_folds

        # Print average metrics for cross-validation
        print("Average AUC:", avg_auc)
        print("Average Accuracy:", avg_accuracy)
        print("---------------------------------------------")
        
        # Train the model on the entire training data
        model.fit(X_resampled, y_resampled)

        # Predict on the test data
        y_pred = model.predict(X_test_scaled)

        # Calculate Test AUC and Test Accuracy
        test_auc = roc_auc_score(y_test, y_pred)
        test_accuracy = accuracy_score(y_test, y_pred)

        # Print evaluation metrics for the test data
        print("Test AUC:", test_auc)
        print("Test Accuracy:", test_accuracy)
        print("---------------------------------------------")

Iteration 1: Using features Agency, WorkflowStatus
Number of Folds: 5
Average AUC: 0.745363738921695
Average Accuracy: 0.7453665967306013
---------------------------------------------
Test AUC: 0.7467092470272587
Test Accuracy: 0.6749613175353442
---------------------------------------------
Number of Folds: 10
Average AUC: 0.7453655051333681
Average Accuracy: 0.7453666171590109
---------------------------------------------
Test AUC: 0.7467092470272587
Test Accuracy: 0.6749613175353442
---------------------------------------------
Number of Folds: 20
Average AUC: 0.7453678632516977
Average Accuracy: 0.7453782656720058
---------------------------------------------
Test AUC: 0.7467092470272587
Test Accuracy: 0.6749613175353442
---------------------------------------------
Iteration 2: Using features Agency, WorkflowStatus, UWDecision
Number of Folds: 5
Average AUC: 0.7453988024330469
Average Accuracy: 0.7454016098848529
---------------------------------------------
Test AUC: 0.7464768500

### Stochastic Gradient Descent

#### 80/20 Split

In [14]:
# Iterate through feature sets
for i, features in enumerate(feature_sets, start=1):
    print(f"Iteration {i}: Using features {', '.join(features)}")
    
    # Extract the selected features and target variable
    X = df[features]
    y = df["PolicyIssued"]

    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Scaling features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Applying SMOTE to the training data
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

    # Convert the resampled arrays back to a dataframe
    resampled_df = pd.DataFrame(X_resampled, columns=features)
    resampled_df["PolicyIssued"] = y_resampled

    # Train the model on the entire training data using SGDClassifier
    model = SGDClassifier(loss='log', random_state=42)
    model.fit(X_resampled, y_resampled)

    # Predict on the test data
    y_pred_test = model.predict(X_test_scaled)

    # Calculate Training AUC and Test AUC
    train_auc = roc_auc_score(y_resampled, model.predict(X_resampled))
    test_auc = roc_auc_score(y_test, y_pred_test)

    # Calculate Training Accuracy and Test Accuracy
    train_accuracy = accuracy_score(y_resampled, model.predict(X_resampled))
    test_accuracy = accuracy_score(y_test, y_pred_test)

    # Print evaluation metrics for training and test data
    print("Training AUC:", train_auc)
    print("Test AUC:", test_auc)
    print("Training Accuracy:", train_accuracy)
    print("Test Accuracy:", test_accuracy)
    print("---------------------------------------------")

Iteration 1: Using features Agency, WorkflowStatus
Training AUC: 0.7440360695247237
Test AUC: 0.7476467049756738
Training Accuracy: 0.7440360695247238
Test Accuracy: 0.675674816392713
---------------------------------------------
Iteration 2: Using features Agency, WorkflowStatus, UWDecision
Training AUC: 0.7440973427830315
Test AUC: 0.7472981083858563
Training Accuracy: 0.7440973427830314
Test Accuracy: 0.6756112294534703
---------------------------------------------
Iteration 3: Using features Agency, WorkflowStatus, UWDecision, CommDateProvided
Training AUC: 0.7437245971283266
Test AUC: 0.74718508209781
Training Accuracy: 0.7437245971283266
Test Accuracy: 0.675674816392713
---------------------------------------------


#### Cross Validation

In [18]:
# List of k values for k-fold cross-validation
num_folds_list = [5, 10, 20]

# Iterate through feature sets
for i, features in enumerate(feature_sets, start=1):
    print(f"Iteration {i}: Using features {', '.join(features)}")
    
    # Extract the selected features and target variable
    X = df[features]
    y = df["PolicyIssued"]

    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Scaling features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Applying SMOTE to the training data
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

    # Convert the resampled arrays back to a dataframe
    resampled_df = pd.DataFrame(X_resampled, columns=features)
    resampled_df["PolicyIssued"] = y_resampled

    # Train the model on the entire training data using SGDClassifier
    model = SGDClassifier(loss='log', random_state=42)
    model.fit(X_resampled, y_resampled)

    # Predict on the test data
    y_pred_test = model.predict(X_test_scaled)

    # Calculate Training AUC and Test AUC
    train_auc = roc_auc_score(y_resampled, model.predict(X_resampled))
    test_auc = roc_auc_score(y_test, y_pred_test)

    # Calculate Training Accuracy and Test Accuracy
    train_accuracy = accuracy_score(y_resampled, model.predict(X_resampled))
    test_accuracy = accuracy_score(y_test, y_pred_test)

    # Print evaluation metrics for training and test data
    print("Training AUC:", train_auc)
    print("Test AUC:", test_auc)
    print("Training Accuracy:", train_accuracy)
    print("Test Accuracy:", test_accuracy)
    print("---------------------------------------------")
    
    # Iterate through different numbers of folds
    for num_folds in num_folds_list:
        print(f"Number of Folds: {num_folds}")
        
        # Initialise k-fold cross-validation
        kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

        # Lists to store evaluation metrics for each fold
        fold_train_auc_scores = []
        fold_test_auc_scores = []
        fold_train_accuracy_scores = []
        fold_test_accuracy_scores = []

        # Perform k-fold cross-validation
        for train_index, val_index in kf.split(X_resampled):
            X_fold_train, y_fold_train = X_resampled[train_index], y_resampled[train_index]
            X_fold_val, y_fold_val = X_resampled[val_index], y_resampled[val_index]

            # Train the model on the fold training data using SGDClassifier
            model = SGDClassifier(loss='log', random_state=42)
            model.fit(X_fold_train, y_fold_train)

            # Predict on the fold validation data
            y_fold_val_pred = model.predict(X_fold_val)
            y_fold_test_pred = model.predict(X_test_scaled)

            # Calculate Training AUC, Test AUC, Training Accuracy, and Test Accuracy for the fold
            fold_train_auc = roc_auc_score(y_fold_train, model.predict(X_fold_train))
            fold_test_auc = roc_auc_score(y_test, y_fold_test_pred)
            fold_train_accuracy = accuracy_score(y_fold_train, model.predict(X_fold_train))
            fold_test_accuracy = accuracy_score(y_test, y_fold_test_pred)

            fold_train_auc_scores.append(fold_train_auc)
            fold_test_auc_scores.append(fold_test_auc)
            fold_train_accuracy_scores.append(fold_train_accuracy)
            fold_test_accuracy_scores.append(fold_test_accuracy)

        # Calculate average metrics for the current iteration's folds
        avg_train_auc = sum(fold_train_auc_scores) / num_folds
        avg_test_auc = sum(fold_test_auc_scores) / num_folds
        avg_train_accuracy = sum(fold_train_accuracy_scores) / num_folds
        avg_test_accuracy = sum(fold_test_accuracy_scores) / num_folds

        # Print average metrics for cross-validation
        print("Average Training AUC ({} folds):".format(num_folds), avg_train_auc)
        print("Average Test AUC ({} folds):".format(num_folds), avg_test_auc)
        print("Average Training Accuracy ({} folds):".format(num_folds), avg_train_accuracy)
        print("Average Test Accuracy ({} folds):".format(num_folds), avg_test_accuracy)
        print("---------------------------------------------")

Iteration 1: Using features Agency, WorkflowStatus
Training AUC: 0.7440360695247237
Test AUC: 0.7476467049756738
Training Accuracy: 0.7440360695247238
Test Accuracy: 0.675674816392713
---------------------------------------------
Number of Folds: 5
Average Training AUC (5 folds): 0.7439782305336696
Average Test AUC (5 folds): 0.7475665422314114
Average Training Accuracy (5 folds): 0.7439773496724106
Average Test Accuracy (5 folds): 0.675725685944107
---------------------------------------------
Number of Folds: 10
Average Training AUC (10 folds): 0.7439852124482219
Average Test AUC (10 folds): 0.7475378560581469
Average Training Accuracy (10 folds): 0.7439850085494765
Average Test Accuracy (10 folds): 0.6757129685562585
---------------------------------------------
Number of Folds: 20
Average Training AUC (20 folds): 0.7440051985509251
Average Test AUC (20 folds): 0.7475476616005533
Average Training Accuracy (20 folds): 0.7440051643025376
Average Test Accuracy (20 folds): 0.67569230280

#### 70/30 Split

In [19]:
# Iterate through feature sets
for i, features in enumerate(feature_sets, start=1):
    print(f"Iteration {i}: Using features {', '.join(features)}")
    
    # Extract the selected features and target variable
    X = df[features]
    y = df["PolicyIssued"]

    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

    # Scaling features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Applying SMOTE to the training data
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

    # Convert the resampled arrays back to a dataframe
    resampled_df = pd.DataFrame(X_resampled, columns=features)
    resampled_df["PolicyIssued"] = y_resampled

    # Train the model on the entire training data using SGDClassifier
    model = SGDClassifier(loss='log', random_state=42)
    model.fit(X_resampled, y_resampled)

    # Predict on the test data
    y_pred_test = model.predict(X_test_scaled)

    # Calculate Training AUC and Test AUC
    train_auc = roc_auc_score(y_resampled, model.predict(X_resampled))
    test_auc = roc_auc_score(y_test, y_pred_test)

    # Calculate Training Accuracy and Test Accuracy
    train_accuracy = accuracy_score(y_resampled, model.predict(X_resampled))
    test_accuracy = accuracy_score(y_test, y_pred_test)

    # Print evaluation metrics for training and test data
    print("Training AUC:", train_auc)
    print("Test AUC:", test_auc)
    print("Training Accuracy:", train_accuracy)
    print("Test Accuracy:", test_accuracy)
    print("---------------------------------------------")

Iteration 1: Using features Agency, WorkflowStatus
Training AUC: 0.745156508951705
Test AUC: 0.7464831972321369
Training Accuracy: 0.7451565089517052
Test Accuracy: 0.6750884927616101
---------------------------------------------
Iteration 2: Using features Agency, WorkflowStatus, UWDecision
Training AUC: 0.7453899302070447
Test AUC: 0.7464904662292291
Training Accuracy: 0.7453899302070447
Test Accuracy: 0.6749401216642998
---------------------------------------------
Iteration 3: Using features Agency, WorkflowStatus, UWDecision, CommDateProvided
Training AUC: 0.7448355547256134
Test AUC: 0.7459778619705492
Training Accuracy: 0.7448355547256134
Test Accuracy: 0.6748341423090782
---------------------------------------------


#### Cross Validation

In [22]:
# List of k values for k-fold cross-validation
num_folds_list = [5, 10, 20]

# Iterate through feature sets
for i, features in enumerate(feature_sets, start=1):
    print(f"Iteration {i}: Using features {', '.join(features)}")
    
    # Extract the selected features and target variable
    X = df[features]
    y = df["PolicyIssued"]

    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

    # Scaling features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Applying SMOTE to the training data
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

    # Convert the resampled arrays back to a dataframe
    resampled_df = pd.DataFrame(X_resampled, columns=features)
    resampled_df["PolicyIssued"] = y_resampled

    # Train the model on the entire training data using SGDClassifier
    model = SGDClassifier(loss='log', random_state=42)
    model.fit(X_resampled, y_resampled)

    # Predict on the test data
    y_pred_test = model.predict(X_test_scaled)

    # Calculate Training AUC and Test AUC
    train_auc = roc_auc_score(y_resampled, model.predict(X_resampled))
    test_auc = roc_auc_score(y_test, y_pred_test)

    # Calculate Training Accuracy and Test Accuracy
    train_accuracy = accuracy_score(y_resampled, model.predict(X_resampled))
    test_accuracy = accuracy_score(y_test, y_pred_test)

    # Print evaluation metrics for training and test data
    print("Training AUC:", train_auc)
    print("Test AUC:", test_auc)
    print("Training Accuracy:", train_accuracy)
    print("Test Accuracy:", test_accuracy)
    print("---------------------------------------------")
    
    # Iterate through different numbers of folds
    for num_folds in num_folds_list:
        print(f"Number of Folds: {num_folds}")
        
        # Initialise k-fold cross-validation
        kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

        # Lists to store evaluation metrics for each fold
        fold_train_auc_scores = []
        fold_test_auc_scores = []
        fold_train_accuracy_scores = []
        fold_test_accuracy_scores = []

        # Perform k-fold cross-validation
        for train_index, val_index in kf.split(X_resampled):
            X_fold_train, y_fold_train = X_resampled[train_index], y_resampled[train_index]
            X_fold_val, y_fold_val = X_resampled[val_index], y_resampled[val_index]

            # Train the model on the fold training data using SGDClassifier
            model = SGDClassifier(loss='log', random_state=42)
            model.fit(X_fold_train, y_fold_train)

            # Predict on the fold validation data
            y_fold_val_pred = model.predict(X_fold_val)
            y_fold_test_pred = model.predict(X_test_scaled)

            # Calculate Training AUC, Test AUC, Training Accuracy, and Test Accuracy for the fold
            fold_train_auc = roc_auc_score(y_fold_train, model.predict(X_fold_train))
            fold_test_auc = roc_auc_score(y_test, y_fold_test_pred)
            fold_train_accuracy = accuracy_score(y_fold_train, model.predict(X_fold_train))
            fold_test_accuracy = accuracy_score(y_test, y_fold_test_pred)

            fold_train_auc_scores.append(fold_train_auc)
            fold_test_auc_scores.append(fold_test_auc)
            fold_train_accuracy_scores.append(fold_train_accuracy)
            fold_test_accuracy_scores.append(fold_test_accuracy)

        # Calculate average metrics for the current iteration's folds
        avg_train_auc = sum(fold_train_auc_scores) / num_folds
        avg_test_auc = sum(fold_test_auc_scores) / num_folds
        avg_train_accuracy = sum(fold_train_accuracy_scores) / num_folds
        avg_test_accuracy = sum(fold_test_accuracy_scores) / num_folds

        # Print average metrics for cross-validation
        print("Average Training AUC ({} folds):".format(num_folds), avg_train_auc)
        print("Average Test AUC ({} folds):".format(num_folds), avg_test_auc)
        print("Average Training Accuracy ({} folds):".format(num_folds), avg_train_accuracy)
        print("Average Test Accuracy ({} folds):".format(num_folds), avg_test_accuracy)
        print("---------------------------------------------")

Iteration 1: Using features Agency, WorkflowStatus
Training AUC: 0.745156508951705
Test AUC: 0.7464831972321369
Training Accuracy: 0.7451565089517052
Test Accuracy: 0.6750884927616101
---------------------------------------------
Number of Folds: 5
Average Training AUC (5 folds): 0.7452993650718579
Average Test AUC (5 folds): 0.7466107188508735
Average Training Accuracy (5 folds): 0.7452994801260862
Average Test Accuracy (5 folds): 0.6749782742321796
---------------------------------------------
Number of Folds: 10
Average Training AUC (10 folds): 0.7452649250891217
Average Test AUC (10 folds): 0.7466065110871618
Average Training Accuracy (10 folds): 0.7452647909543477
Average Test Accuracy (10 folds): 0.675014307212955
---------------------------------------------
Number of Folds: 20
Average Training AUC (20 folds): 0.7452636283854577
Average Test AUC (20 folds): 0.7465859638991424
Average Training Accuracy (20 folds): 0.7452636984514582
Average Test Accuracy (20 folds): 0.67502490514

### Decision Tree

#### 80/20 Split

In [24]:
# Iterate through feature sets
for i, features in enumerate(feature_sets, start=1):
    print(f"Iteration {i}: Using features {', '.join(features)}")
    
    # Extract the selected features and target variable
    X = df[features]
    y = df["PolicyIssued"]

    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Scaling features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Applying SMOTE to the training data
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

    # Convert the resampled arrays back to a dataframe
    resampled_df = pd.DataFrame(X_resampled, columns=features)
    resampled_df["PolicyIssued"] = y_resampled

    # Train the model on the entire training data using DecisionTreeClassifier
    model = DecisionTreeClassifier(random_state=42)
    model.fit(X_resampled, y_resampled)

    # Predict on the test data
    y_pred_test = model.predict(X_test_scaled)

    # Calculate Training AUC and Test AUC
    train_auc = roc_auc_score(y_resampled, model.predict(X_resampled))
    test_auc = roc_auc_score(y_test, y_pred_test)

    # Calculate Training Accuracy and Test Accuracy
    train_accuracy = accuracy_score(y_resampled, model.predict(X_resampled))
    test_accuracy = accuracy_score(y_test, y_pred_test)

    # Print evaluation metrics for training and test data
    print("Training AUC:", train_auc)
    print("Test AUC:", test_auc)
    print("Training Accuracy:", train_accuracy)
    print("Test Accuracy:", test_accuracy)
    print("---------------------------------------------")

Iteration 1: Using features Agency, WorkflowStatus
Training AUC: 0.7730949122771186
Test AUC: 0.7495734512155656
Training Accuracy: 0.7730949122771186
Test Accuracy: 0.7256223571678377
---------------------------------------------
Iteration 2: Using features Agency, WorkflowStatus, UWDecision
Training AUC: 0.7885408794755009
Test AUC: 0.7485873427712145
Training Accuracy: 0.7885408794755009
Test Accuracy: 0.7295965408705052
---------------------------------------------
Iteration 3: Using features Agency, WorkflowStatus, UWDecision, CommDateProvided
Training AUC: 0.7924368374828946
Test AUC: 0.7497656571303801
Training Accuracy: 0.7924368374828945
Test Accuracy: 0.7300734429148253
---------------------------------------------


#### Cross-Validation

In [23]:
# List of k values for k-fold cross-validation
num_folds_list = [5, 10, 20]

# Iterate through feature sets
for i, features in enumerate(feature_sets, start=1):
    print(f"Iteration {i}: Using features {', '.join(features)}")
    
    # Extract the selected features and target variable
    X = df[features]
    y = df["PolicyIssued"]

    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Scaling features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Applying SMOTE to the training data
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

    # Convert the resampled arrays back to a dataframe
    resampled_df = pd.DataFrame(X_resampled, columns=features)
    resampled_df["PolicyIssued"] = y_resampled

    # Train the model on the entire training data using DecisionTreeClassifier
    model = DecisionTreeClassifier(random_state=42)
    model.fit(X_resampled, y_resampled)

    # Predict on the test data
    y_pred_test = model.predict(X_test_scaled)

    # Calculate Training AUC and Test AUC
    train_auc = roc_auc_score(y_resampled, model.predict(X_resampled))
    test_auc = roc_auc_score(y_test, y_pred_test)

    # Calculate Training Accuracy and Test Accuracy
    train_accuracy = accuracy_score(y_resampled, model.predict(X_resampled))
    test_accuracy = accuracy_score(y_test, y_pred_test)

    # Print evaluation metrics for training and test data
    print("Training AUC:", train_auc)
    print("Test AUC:", test_auc)
    print("Training Accuracy:", train_accuracy)
    print("Test Accuracy:", test_accuracy)
    print("---------------------------------------------")
    
    # Iterate through different numbers of folds
    for num_folds in num_folds_list:
        print(f"Number of Folds: {num_folds}")
        
        # Initialise k-fold cross-validation
        kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

        # Lists to store evaluation metrics for each fold
        fold_train_auc_scores = []
        fold_test_auc_scores = []
        fold_train_accuracy_scores = []
        fold_test_accuracy_scores = []

        # Perform k-fold cross-validation
        for train_index, val_index in kf.split(X_resampled):
            X_fold_train, y_fold_train = X_resampled[train_index], y_resampled[train_index]
            X_fold_val, y_fold_val = X_resampled[val_index], y_resampled[val_index]

            # Train the model on the fold training data using DecisionTreeClassifier
            model = DecisionTreeClassifier(random_state=42)
            model.fit(X_fold_train, y_fold_train)

            # Predict on the fold validation data
            y_fold_val_pred = model.predict(X_fold_val)
            y_fold_test_pred = model.predict(X_test_scaled)

            # Calculate Training AUC, Test AUC, Training Accuracy, and Test Accuracy for the fold
            fold_train_auc = roc_auc_score(y_fold_train, model.predict(X_fold_train))
            fold_test_auc = roc_auc_score(y_test, y_fold_test_pred)
            fold_train_accuracy = accuracy_score(y_fold_train, model.predict(X_fold_train))
            fold_test_accuracy = accuracy_score(y_test, y_fold_test_pred)

            fold_train_auc_scores.append(fold_train_auc)
            fold_test_auc_scores.append(fold_test_auc)
            fold_train_accuracy_scores.append(fold_train_accuracy)
            fold_test_accuracy_scores.append(fold_test_accuracy)

        # Calculate average metrics for the current iteration's folds
        avg_train_auc = sum(fold_train_auc_scores) / num_folds
        avg_test_auc = sum(fold_test_auc_scores) / num_folds
        avg_train_accuracy = sum(fold_train_accuracy_scores) / num_folds
        avg_test_accuracy = sum(fold_test_accuracy_scores) / num_folds

        # Print average metrics for cross-validation
        print("Average Training AUC ({} folds):".format(num_folds), avg_train_auc)
        print("Average Test AUC ({} folds):".format(num_folds), avg_test_auc)
        print("Average Training Accuracy ({} folds):".format(num_folds), avg_train_accuracy)
        print("Average Test Accuracy ({} folds):".format(num_folds), avg_test_accuracy)
        print("---------------------------------------------")

Iteration 1: Using features Agency, WorkflowStatus
Training AUC: 0.7730949122771186
Test AUC: 0.7495734512155656
Training Accuracy: 0.7730949122771186
Test Accuracy: 0.7256223571678377
---------------------------------------------
Number of Folds: 5
Average Training AUC (5 folds): 0.7736783721854661
Average Test AUC (5 folds): 0.7497515566007457
Average Training Accuracy (5 folds): 0.7736770083827401
Average Test Accuracy (5 folds): 0.7257558897402473
---------------------------------------------
Number of Folds: 10
Average Training AUC (10 folds): 0.773361431259021
Average Test AUC (10 folds): 0.7494804584364758
Average Training Accuracy (10 folds): 0.7733615643401077
Average Test Accuracy (10 folds): 0.7262041776619083
---------------------------------------------
Number of Folds: 20
Average Training AUC (20 folds): 0.7732195899584628
Average Test AUC (20 folds): 0.7494345347509536
Average Training Accuracy (20 folds): 0.773219340022146
Average Test Accuracy (20 folds): 0.72559374304

#### 70/30 Split

In [25]:
# Iterate through feature sets
for i, features in enumerate(feature_sets, start=1):
    print(f"Iteration {i}: Using features {', '.join(features)}")
    
    # Extract the selected features and target variable
    X = df[features]
    y = df["PolicyIssued"]

    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

    # Scaling features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Applying SMOTE to the training data
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

    # Convert the resampled arrays back to a dataframe
    resampled_df = pd.DataFrame(X_resampled, columns=features)
    resampled_df["PolicyIssued"] = y_resampled

    # Train the model on the entire training data using DecisionTreeClassifier
    model = DecisionTreeClassifier(random_state=42)
    model.fit(X_resampled, y_resampled)

    # Predict on the test data
    y_pred_test = model.predict(X_test_scaled)

    # Calculate Training AUC and Test AUC
    train_auc = roc_auc_score(y_resampled, model.predict(X_resampled))
    test_auc = roc_auc_score(y_test, y_pred_test)

    # Calculate Training Accuracy and Test Accuracy
    train_accuracy = accuracy_score(y_resampled, model.predict(X_resampled))
    test_accuracy = accuracy_score(y_test, y_pred_test)

    # Print evaluation metrics for training and test data
    print("Training AUC:", train_auc)
    print("Test AUC:", test_auc)
    print("Training Accuracy:", train_accuracy)
    print("Test Accuracy:", test_accuracy)
    print("---------------------------------------------")

Iteration 1: Using features Agency, WorkflowStatus
Training AUC: 0.7761898648490931
Test AUC: 0.7438931870365839
Training Accuracy: 0.7761898648490931
Test Accuracy: 0.7239661713898132
---------------------------------------------
Iteration 2: Using features Agency, WorkflowStatus, UWDecision
Training AUC: 0.7925585303797763
Test AUC: 0.7435583040931643
Training Accuracy: 0.7925585303797764
Test Accuracy: 0.7286080671485194
---------------------------------------------
Iteration 3: Using features Agency, WorkflowStatus, UWDecision, CommDateProvided
Training AUC: 0.7963983100301113
Test AUC: 0.7446766750353746
Training Accuracy: 0.7963983100301113
Test Accuracy: 0.7297102524428242
---------------------------------------------


#### Cross Validation

In [26]:
# List of k values for k-fold cross-validation
num_folds_list = [5, 10, 20]

# Iterate through feature sets
for i, features in enumerate(feature_sets, start=1):
    print(f"Iteration {i}: Using features {', '.join(features)}")
    
    # Extract the selected features and target variable
    X = df[features]
    y = df["PolicyIssued"]

    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

    # Scaling features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Applying SMOTE to the training data
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

    # Convert the resampled arrays back to a dataframe
    resampled_df = pd.DataFrame(X_resampled, columns=features)
    resampled_df["PolicyIssued"] = y_resampled

    # Train the model on the entire training data using DecisionTreeClassifier
    model = DecisionTreeClassifier(random_state=42)
    model.fit(X_resampled, y_resampled)

    # Predict on the test data
    y_pred_test = model.predict(X_test_scaled)

    # Calculate Training AUC and Test AUC
    train_auc = roc_auc_score(y_resampled, model.predict(X_resampled))
    test_auc = roc_auc_score(y_test, y_pred_test)

    # Calculate Training Accuracy and Test Accuracy
    train_accuracy = accuracy_score(y_resampled, model.predict(X_resampled))
    test_accuracy = accuracy_score(y_test, y_pred_test)

    # Print evaluation metrics for training and test data
    print("Training AUC:", train_auc)
    print("Test AUC:", test_auc)
    print("Training Accuracy:", train_accuracy)
    print("Test Accuracy:", test_accuracy)
    print("---------------------------------------------")
    
    # Iterate through different numbers of folds
    for num_folds in num_folds_list:
        print(f"Number of Folds: {num_folds}")
        
        # Initialise k-fold cross-validation
        kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

        # Lists to store evaluation metrics for each fold
        fold_train_auc_scores = []
        fold_test_auc_scores = []
        fold_train_accuracy_scores = []
        fold_test_accuracy_scores = []

        # Perform k-fold cross-validation
        for train_index, val_index in kf.split(X_resampled):
            X_fold_train, y_fold_train = X_resampled[train_index], y_resampled[train_index]
            X_fold_val, y_fold_val = X_resampled[val_index], y_resampled[val_index]

            # Train the model on the fold training data using DecisionTreeClassifier
            model = DecisionTreeClassifier(random_state=42)
            model.fit(X_fold_train, y_fold_train)

            # Predict on the fold validation data
            y_fold_val_pred = model.predict(X_fold_val)
            y_fold_test_pred = model.predict(X_test_scaled)

            # Calculate Training AUC, Test AUC, Training Accuracy, and Test Accuracy for the fold
            fold_train_auc = roc_auc_score(y_fold_train, model.predict(X_fold_train))
            fold_test_auc = roc_auc_score(y_test, y_fold_test_pred)
            fold_train_accuracy = accuracy_score(y_fold_train, model.predict(X_fold_train))
            fold_test_accuracy = accuracy_score(y_test, y_fold_test_pred)

            fold_train_auc_scores.append(fold_train_auc)
            fold_test_auc_scores.append(fold_test_auc)
            fold_train_accuracy_scores.append(fold_train_accuracy)
            fold_test_accuracy_scores.append(fold_test_accuracy)

        # Calculate average metrics for the current iteration's folds
        avg_train_auc = sum(fold_train_auc_scores) / num_folds
        avg_test_auc = sum(fold_test_auc_scores) / num_folds
        avg_train_accuracy = sum(fold_train_accuracy_scores) / num_folds
        avg_test_accuracy = sum(fold_test_accuracy_scores) / num_folds

        # Print average metrics for cross-validation
        print("Average Training AUC ({} folds):".format(num_folds), avg_train_auc)
        print("Average Test AUC ({} folds):".format(num_folds), avg_test_auc)
        print("Average Training Accuracy ({} folds):".format(num_folds), avg_train_accuracy)
        print("Average Test Accuracy ({} folds):".format(num_folds), avg_test_accuracy)
        print("---------------------------------------------")

Iteration 1: Using features Agency, WorkflowStatus
Training AUC: 0.7761898648490931
Test AUC: 0.7438931870365839
Training Accuracy: 0.7761898648490931
Test Accuracy: 0.7239661713898132
---------------------------------------------
Number of Folds: 5
Average Training AUC (5 folds): 0.776751181860476
Average Test AUC (5 folds): 0.7446420580479323
Average Training Accuracy (5 folds): 0.7767515343874221
Average Test Accuracy (5 folds): 0.7241950867970919
---------------------------------------------
Number of Folds: 10
Average Training AUC (10 folds): 0.7764468304920825
Average Test AUC (10 folds): 0.7442643356744164
Average Training Accuracy (10 folds): 0.7764466284395992
Average Test Accuracy (10 folds): 0.7245066661014434
---------------------------------------------
Number of Folds: 20
Average Training AUC (20 folds): 0.7763124038464747
Average Test AUC (20 folds): 0.7442672792732121
Average Training Accuracy (20 folds): 0.7763124110532068
Average Test Accuracy (20 folds): 0.7243169630

### Random Forest

#### 80/20 Split

In [27]:
# Iterate through feature sets
for i, features in enumerate(feature_sets, start=1):
    print(f"Iteration {i}: Using features {', '.join(features)}")
    
    # Extract the selected features and target variable
    X = df[features]
    y = df["PolicyIssued"]

    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Scaling features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Applying SMOTE to the training data
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

    # Convert the resampled arrays back to a dataframe
    resampled_df = pd.DataFrame(X_resampled, columns=features)
    resampled_df["PolicyIssued"] = y_resampled

    # Train the model on the entire training data using RandomForestClassifier
    model = RandomForestClassifier(random_state=42)
    model.fit(X_resampled, y_resampled)

    # Predict on the test data
    y_pred_test = model.predict(X_test_scaled)

    # Calculate Training AUC and Test AUC
    train_auc = roc_auc_score(y_resampled, model.predict(X_resampled))
    test_auc = roc_auc_score(y_test, y_pred_test)

    # Calculate Training Accuracy and Test Accuracy
    train_accuracy = accuracy_score(y_resampled, model.predict(X_resampled))
    test_accuracy = accuracy_score(y_test, y_pred_test)

    # Print evaluation metrics for training and test data
    print("Training AUC:", train_auc)
    print("Test AUC:", test_auc)
    print("Training Accuracy:", train_accuracy)
    print("Test Accuracy:", test_accuracy)
    print("---------------------------------------------")

Iteration 1: Using features Agency, WorkflowStatus
Training AUC: 0.7730540634382468
Test AUC: 0.7486750558343944
Training Accuracy: 0.7730540634382468
Test Accuracy: 0.7276571392236034
---------------------------------------------
Iteration 2: Using features Agency, WorkflowStatus, UWDecision
Training AUC: 0.7885204550560652
Test AUC: 0.7477021679182022
Training Accuracy: 0.788520455056065
Test Accuracy: 0.7306139318983881
---------------------------------------------
Iteration 3: Using features Agency, WorkflowStatus, UWDecision, CommDateProvided
Training AUC: 0.7923959886440227
Test AUC: 0.748910424005334
Training Accuracy: 0.7923959886440228
Test Accuracy: 0.7313769751693002
---------------------------------------------


#### Cross Validation

In [29]:
# Iterate through feature sets
for i, features in enumerate(feature_sets, start=1):
    print(f"Iteration {i}: Using features {', '.join(features)}")
    
    # Extract the selected features and target variable
    X = df[features]
    y = df["PolicyIssued"]

    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Scaling features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Applying SMOTE to the training data
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

    # Convert the resampled arrays back to a dataframe
    resampled_df = pd.DataFrame(X_resampled, columns=features)
    resampled_df["PolicyIssued"] = y_resampled

    # Train the model on the entire training data using RandomForestClassifier
    model = RandomForestClassifier(random_state=42)
    model.fit(X_resampled, y_resampled)

    # Predict on the test data
    y_pred_test = model.predict(X_test_scaled)

    # Calculate Training AUC and Test AUC
    train_auc = roc_auc_score(y_resampled, model.predict(X_resampled))
    test_auc = roc_auc_score(y_test, y_pred_test)

    # Calculate Training Accuracy and Test Accuracy
    train_accuracy = accuracy_score(y_resampled, model.predict(X_resampled))
    test_accuracy = accuracy_score(y_test, y_pred_test)

    # Print evaluation metrics for training and test data
    print("Training AUC:", train_auc)
    print("Test AUC:", test_auc)
    print("Training Accuracy:", train_accuracy)
    print("Test Accuracy:", test_accuracy)
    print("---------------------------------------------")

    # Iterate through different numbers of folds
    for num_folds in num_folds_list:
        print(f"Number of Folds: {num_folds}")
        
        # Initialise k-fold cross-validation
        kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

        # Lists to store evaluation metrics for each fold
        fold_train_auc_scores = []
        fold_test_auc_scores = []
        fold_train_accuracy_scores = []
        fold_test_accuracy_scores = []

        # Perform k-fold cross-validation
        for train_index, val_index in kf.split(X_resampled):
            X_fold_train, y_fold_train = X_resampled[train_index], y_resampled[train_index]
            X_fold_val, y_fold_val = X_resampled[val_index], y_resampled[val_index]

            # Train the model on the fold training data using RandomForestClassifier
            model = RandomForestClassifier(random_state=42)
            model.fit(X_fold_train, y_fold_train)

            # Predict on the fold validation data
            y_fold_val_pred = model.predict(X_fold_val)
            y_fold_test_pred = model.predict(X_test_scaled)

            # Calculate Training AUC, Test AUC, Training Accuracy, and Test Accuracy for the fold
            fold_train_auc = roc_auc_score(y_fold_train, model.predict(X_fold_train))
            fold_test_auc = roc_auc_score(y_test, y_fold_test_pred)
            fold_train_accuracy = accuracy_score(y_fold_train, model.predict(X_fold_train))
            fold_test_accuracy = accuracy_score(y_test, y_fold_test_pred)

            fold_train_auc_scores.append(fold_train_auc)
            fold_test_auc_scores.append(fold_test_auc)
            fold_train_accuracy_scores.append(fold_train_accuracy)
            fold_test_accuracy_scores.append(fold_test_accuracy)

        # Calculate average metrics for the current iteration's folds
        avg_train_auc = sum(fold_train_auc_scores) / num_folds
        avg_test_auc = sum(fold_test_auc_scores) / num_folds
        avg_train_accuracy = sum(fold_train_accuracy_scores) / num_folds
        avg_test_accuracy = sum(fold_test_accuracy_scores) / num_folds

        # Print average metrics for cross-validation
        print("Average Training AUC ({} folds):".format(num_folds), avg_train_auc)
        print("Average Test AUC ({} folds):".format(num_folds), avg_test_auc)
        print("Average Training Accuracy ({} folds):".format(num_folds), avg_train_accuracy)
        print("Average Test Accuracy ({} folds):".format(num_folds), avg_test_accuracy)
        print("---------------------------------------------")

Iteration 1: Using features Agency, WorkflowStatus
Training AUC: 0.7730540634382468
Test AUC: 0.7486750558343944
Training Accuracy: 0.7730540634382468
Test Accuracy: 0.7276571392236034
---------------------------------------------
Number of Folds: 5
Average Training AUC (5 folds): 0.7736698335565881
Average Test AUC (5 folds): 0.7492634405226688
Average Training Accuracy (5 folds): 0.7736680727204205
Average Test Accuracy (5 folds): 0.7271039328521922
---------------------------------------------
Number of Folds: 10
Average Training AUC (10 folds): 0.7733518710228787
Average Test AUC (10 folds): 0.7489419766281452
Average Training Accuracy (10 folds): 0.7733513521310335
Average Test Accuracy (10 folds): 0.7272342860776397
---------------------------------------------
Number of Folds: 20
Average Training AUC (20 folds): 0.7732048756805616
Average Test AUC (20 folds): 0.7488780575862186
Average Training Accuracy (20 folds): 0.7732050966735463
Average Test Accuracy (20 folds): 0.726991066

#### 70/30 Split

In [28]:
# Iterate through feature sets
for i, features in enumerate(feature_sets, start=1):
    print(f"Iteration {i}: Using features {', '.join(features)}")
    
    # Extract the selected features and target variable
    X = df[features]
    y = df["PolicyIssued"]

    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

    # Scaling features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Applying SMOTE to the training data
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

    # Convert the resampled arrays back to a dataframe
    resampled_df = pd.DataFrame(X_resampled, columns=features)
    resampled_df["PolicyIssued"] = y_resampled

    # Train the model on the entire training data using RandomForestClassifier
    model = RandomForestClassifier(random_state=42)
    model.fit(X_resampled, y_resampled)

    # Predict on the test data
    y_pred_test = model.predict(X_test_scaled)

    # Calculate Training AUC and Test AUC
    train_auc = roc_auc_score(y_resampled, model.predict(X_resampled))
    test_auc = roc_auc_score(y_test, y_pred_test)

    # Calculate Training Accuracy and Test Accuracy
    train_accuracy = accuracy_score(y_resampled, model.predict(X_resampled))
    test_accuracy = accuracy_score(y_test, y_pred_test)

    # Print evaluation metrics for training and test data
    print("Training AUC:", train_auc)
    print("Test AUC:", test_auc)
    print("Training Accuracy:", train_accuracy)
    print("Test Accuracy:", test_accuracy)
    print("---------------------------------------------")

Iteration 1: Using features Agency, WorkflowStatus
Training AUC: 0.7761665227235592
Test AUC: 0.7430080294255877
Training Accuracy: 0.7761665227235592
Test Accuracy: 0.7249835731999407
---------------------------------------------
Iteration 2: Using features Agency, WorkflowStatus, UWDecision
Training AUC: 0.7925293527228588
Test AUC: 0.742759654448101
Training Accuracy: 0.792529352722859
Test Accuracy: 0.7304521079293753
---------------------------------------------
Iteration 3: Using features Agency, WorkflowStatus, UWDecision, CommDateProvided
Training AUC: 0.7963749679045774
Test AUC: 0.7438111735320451
Training Accuracy: 0.7963749679045774
Test Accuracy: 0.731024396447572
---------------------------------------------


#### Cross Validation

In [30]:
# Iterate through feature sets
for i, features in enumerate(feature_sets, start=1):
    print(f"Iteration {i}: Using features {', '.join(features)}")
    
    # Extract the selected features and target variable
    X = df[features]
    y = df["PolicyIssued"]

    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

    # Scaling features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Applying SMOTE to the training data
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

    # Convert the resampled arrays back to a dataframe
    resampled_df = pd.DataFrame(X_resampled, columns=features)
    resampled_df["PolicyIssued"] = y_resampled

    # Train the model on the entire training data using RandomForestClassifier
    model = RandomForestClassifier(random_state=42)
    model.fit(X_resampled, y_resampled)

    # Predict on the test data
    y_pred_test = model.predict(X_test_scaled)

    # Calculate Training AUC and Test AUC
    train_auc = roc_auc_score(y_resampled, model.predict(X_resampled))
    test_auc = roc_auc_score(y_test, y_pred_test)

    # Calculate Training Accuracy and Test Accuracy
    train_accuracy = accuracy_score(y_resampled, model.predict(X_resampled))
    test_accuracy = accuracy_score(y_test, y_pred_test)

    # Print evaluation metrics for training and test data
    print("Training AUC:", train_auc)
    print("Test AUC:", test_auc)
    print("Training Accuracy:", train_accuracy)
    print("Test Accuracy:", test_accuracy)
    print("---------------------------------------------")

    # Iterate through different numbers of folds
    for num_folds in num_folds_list:
        print(f"Number of Folds: {num_folds}")
        
        # Initialise k-fold cross-validation
        kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

        # Lists to store evaluation metrics for each fold
        fold_train_auc_scores = []
        fold_test_auc_scores = []
        fold_train_accuracy_scores = []
        fold_test_accuracy_scores = []

        # Perform k-fold cross-validation
        for train_index, val_index in kf.split(X_resampled):
            X_fold_train, y_fold_train = X_resampled[train_index], y_resampled[train_index]
            X_fold_val, y_fold_val = X_resampled[val_index], y_resampled[val_index]

            # Train the model on the fold training data using RandomForestClassifier
            model = RandomForestClassifier(random_state=42)
            model.fit(X_fold_train, y_fold_train)

            # Predict on the fold validation data
            y_fold_val_pred = model.predict(X_fold_val)
            y_fold_test_pred = model.predict(X_test_scaled)

            # Calculate Training AUC, Test AUC, Training Accuracy, and Test Accuracy for the fold
            fold_train_auc = roc_auc_score(y_fold_train, model.predict(X_fold_train))
            fold_test_auc = roc_auc_score(y_test, y_fold_test_pred)
            fold_train_accuracy = accuracy_score(y_fold_train, model.predict(X_fold_train))
            fold_test_accuracy = accuracy_score(y_test, y_fold_test_pred)

            fold_train_auc_scores.append(fold_train_auc)
            fold_test_auc_scores.append(fold_test_auc)
            fold_train_accuracy_scores.append(fold_train_accuracy)
            fold_test_accuracy_scores.append(fold_test_accuracy)

        # Calculate average metrics for the current iteration's folds
        avg_train_auc = sum(fold_train_auc_scores) / num_folds
        avg_test_auc = sum(fold_test_auc_scores) / num_folds
        avg_train_accuracy = sum(fold_train_accuracy_scores) / num_folds
        avg_test_accuracy = sum(fold_test_accuracy_scores) / num_folds

        # Print average metrics for cross-validation
        print("Average Training AUC ({} folds):".format(num_folds), avg_train_auc)
        print("Average Test AUC ({} folds):".format(num_folds), avg_test_auc)
        print("Average Training Accuracy ({} folds):".format(num_folds), avg_train_accuracy)
        print("Average Test Accuracy ({} folds):".format(num_folds), avg_test_accuracy)
        print("---------------------------------------------")

Iteration 1: Using features Agency, WorkflowStatus
Training AUC: 0.7761665227235592
Test AUC: 0.7430080294255877
Training Accuracy: 0.7761665227235592
Test Accuracy: 0.7249835731999407
---------------------------------------------
Number of Folds: 5
Average Training AUC (5 folds): 0.7767366541922871
Average Test AUC (5 folds): 0.7437841731723995
Average Training Accuracy (5 folds): 0.7767384044439376
Average Test Accuracy (5 folds): 0.7257127111638653
---------------------------------------------
Number of Folds: 10
Average Training AUC (10 folds): 0.7764327196636431
Average Test AUC (10 folds): 0.7436426675552263
Average Training Accuracy (10 folds): 0.7764323638022833
Average Test Accuracy (10 folds): 0.725220966955637
---------------------------------------------
Number of Folds: 20
Average Training AUC (20 folds): 0.7762995840704344
Average Test AUC (20 folds): 0.7436611214074899
Average Training Accuracy (20 folds): 0.7762995114507252
Average Test Accuracy (20 folds): 0.7249888721

### LightGBM

#### 80/20 Split

In [32]:
# Iterate through feature sets
for i, features in enumerate(feature_sets, start=1):
    print(f"Iteration {i}: Using features {', '.join(features)}")
    
    # Extract the selected features and target variable
    X = df[features]
    y = df["PolicyIssued"]

    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Scaling features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Applying SMOTE to the training data
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

    # Convert the resampled arrays back to a dataframe
    resampled_df = pd.DataFrame(X_resampled, columns=features)
    resampled_df["PolicyIssued"] = y_resampled

    # Train the model on the entire training data using LGBMClassifier
    model = lgb.LGBMClassifier(random_state=42)
    model.fit(X_resampled, y_resampled)

    # Predict on the test data
    y_pred_test = model.predict(X_test_scaled)

    # Calculate Training AUC and Test AUC
    train_auc = roc_auc_score(y_resampled, model.predict(X_resampled))
    test_auc = roc_auc_score(y_test, y_pred_test)

    # Calculate Training Accuracy and Test Accuracy
    train_accuracy = accuracy_score(y_resampled, model.predict(X_resampled))
    test_accuracy = accuracy_score(y_test, y_pred_test)

    # Print evaluation metrics for training and test data
    print("Training AUC:", train_auc)
    print("Test AUC:", test_auc)
    print("Training Accuracy:", train_accuracy)
    print("Test Accuracy:", test_accuracy)
    print("---------------------------------------------")

Iteration 1: Using features Agency, WorkflowStatus
Training AUC: 0.7484630624374502
Test AUC: 0.7512247253557579
Training Accuracy: 0.7484630624374502
Test Accuracy: 0.694019648364226
---------------------------------------------
Iteration 2: Using features Agency, WorkflowStatus, UWDecision
Training AUC: 0.7519045771123956
Test AUC: 0.7537992774754331
Training Accuracy: 0.7519045771123956
Test Accuracy: 0.6941150287730901
---------------------------------------------
Iteration 3: Using features Agency, WorkflowStatus, UWDecision, CommDateProvided
Training AUC: 0.7537683053859194
Test AUC: 0.7553705371006395
Training Accuracy: 0.7537683053859194
Test Accuracy: 0.6970400279782533
---------------------------------------------


#### Cross Validation

In [34]:
# List of k values for k-fold cross-validation
num_folds_list = [5, 10, 20]

# Iterate through feature sets
for i, features in enumerate(feature_sets, start=1):
    print(f"Iteration {i}: Using features {', '.join(features)}")
    
    # Extract the selected features and target variable
    X = df[features]
    y = df["PolicyIssued"]

    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Scaling features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Applying SMOTE to the training data
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

    # Convert the resampled arrays back to a dataframe
    resampled_df = pd.DataFrame(X_resampled, columns=features)
    resampled_df["PolicyIssued"] = y_resampled

    # Train the model on the entire training data using LGBMClassifier
    model = lgb.LGBMClassifier(random_state=42)
    model.fit(X_resampled, y_resampled)

    # Predict on the test data
    y_pred_test = model.predict(X_test_scaled)

    # Calculate Training AUC and Test AUC
    train_auc = roc_auc_score(y_resampled, model.predict(X_resampled))
    test_auc = roc_auc_score(y_test, y_pred_test)

    # Calculate Training Accuracy and Test Accuracy
    train_accuracy = accuracy_score(y_resampled, model.predict(X_resampled))
    test_accuracy = accuracy_score(y_test, y_pred_test)

    # Print evaluation metrics for training and test data
    print("Training AUC:", train_auc)
    print("Test AUC:", test_auc)
    print("Training Accuracy:", train_accuracy)
    print("Test Accuracy:", test_accuracy)
    print("---------------------------------------------")

    # Iterate through different numbers of folds
    for num_folds in num_folds_list:
        print(f"Number of Folds: {num_folds}")
        
        # Initialise k-fold cross-validation
        kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

        # Lists to store evaluation metrics for each fold
        fold_train_auc_scores = []
        fold_test_auc_scores = []
        fold_train_accuracy_scores = []
        fold_test_accuracy_scores = []

        # Perform k-fold cross-validation
        for train_index, val_index in kf.split(X_resampled):
            X_fold_train, y_fold_train = X_resampled[train_index], y_resampled[train_index]
            X_fold_val, y_fold_val = X_resampled[val_index], y_resampled[val_index]

            # Train the model on the fold training data using LGBMClassifier
            model = lgb.LGBMClassifier(random_state=42)
            model.fit(X_fold_train, y_fold_train)

            # Predict on the fold validation data
            y_fold_val_pred = model.predict(X_fold_val)
            y_fold_test_pred = model.predict(X_test_scaled)

            # Calculate Training AUC, Test AUC, Training Accuracy, and Test Accuracy for the fold
            fold_train_auc = roc_auc_score(y_fold_train, model.predict(X_fold_train))
            fold_test_auc = roc_auc_score(y_test, y_fold_test_pred)
            fold_train_accuracy = accuracy_score(y_fold_train, model.predict(X_fold_train))
            fold_test_accuracy = accuracy_score(y_test, y_fold_test_pred)

            fold_train_auc_scores.append(fold_train_auc)
            fold_test_auc_scores.append(fold_test_auc)
            fold_train_accuracy_scores.append(fold_train_accuracy)
            fold_test_accuracy_scores.append(fold_test_accuracy)

        # Calculate average metrics for the current iteration's folds
        avg_train_auc = sum(fold_train_auc_scores) / num_folds
        avg_test_auc = sum(fold_test_auc_scores) / num_folds
        avg_train_accuracy = sum(fold_train_accuracy_scores) / num_folds
        avg_test_accuracy = sum(fold_test_accuracy_scores) / num_folds

        # Print average metrics for cross-validation
        print("Average Training AUC ({} folds):".format(num_folds), avg_train_auc)
        print("Average Test AUC ({} folds):".format(num_folds), avg_test_auc)
        print("Average Training Accuracy ({} folds):".format(num_folds), avg_train_accuracy)
        print("Average Test Accuracy ({} folds):".format(num_folds), avg_test_accuracy)
        print("---------------------------------------------")

Iteration 1: Using features Agency, WorkflowStatus
Training AUC: 0.7484630624374502
Test AUC: 0.7512247253557579
Training Accuracy: 0.7484630624374502
Test Accuracy: 0.694019648364226
---------------------------------------------
Number of Folds: 5
Average Training AUC (5 folds): 0.7486228439857252
Average Test AUC (5 folds): 0.7513575680818239
Average Training Accuracy (5 folds): 0.7486213518347254
Average Test Accuracy (5 folds): 0.6934918767685118
---------------------------------------------
Number of Folds: 10
Average Training AUC (10 folds): 0.7485928828549258
Average Test AUC (10 folds): 0.7515352106166939
Average Training Accuracy (10 folds): 0.748592417095154
Average Test Accuracy (10 folds): 0.6937843766890281
---------------------------------------------
Number of Folds: 20
Average Training AUC (20 folds): 0.7486304763338627
Average Test AUC (20 folds): 0.7516819726870041
Average Training Accuracy (20 folds): 0.7486302203931194
Average Test Accuracy (20 folds): 0.69393698534

#### 70/30 Split

In [35]:
# Iterate through feature sets
for i, features in enumerate(feature_sets, start=1):
    print(f"Iteration {i}: Using features {', '.join(features)}")
    
    # Extract the selected features and target variable
    X = df[features]
    y = df["PolicyIssued"]

    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

    # Scaling features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Applying SMOTE to the training data
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

    # Convert the resampled arrays back to a dataframe
    resampled_df = pd.DataFrame(X_resampled, columns=features)
    resampled_df["PolicyIssued"] = y_resampled

    # Train the model on the entire training data using LGBMClassifier
    model = lgb.LGBMClassifier(random_state=42)
    model.fit(X_resampled, y_resampled)

    # Predict on the test data
    y_pred_test = model.predict(X_test_scaled)

    # Calculate Training AUC and Test AUC
    train_auc = roc_auc_score(y_resampled, model.predict(X_resampled))
    test_auc = roc_auc_score(y_test, y_pred_test)

    # Calculate Training Accuracy and Test Accuracy
    train_accuracy = accuracy_score(y_resampled, model.predict(X_resampled))
    test_accuracy = accuracy_score(y_test, y_pred_test)

    # Print evaluation metrics for training and test data
    print("Training AUC:", train_auc)
    print("Test AUC:", test_auc)
    print("Training Accuracy:", train_accuracy)
    print("Test Accuracy:", test_accuracy)
    print("---------------------------------------------")

Iteration 1: Using features Agency, WorkflowStatus
Training AUC: 0.7490488083844914
Test AUC: 0.7503466034363131
Training Accuracy: 0.7490488083844915
Test Accuracy: 0.6870641599016512
---------------------------------------------
Iteration 2: Using features Agency, WorkflowStatus, UWDecision
Training AUC: 0.753565509675311
Test AUC: 0.7514926103685433
Training Accuracy: 0.753565509675311
Test Accuracy: 0.6927870450836177
---------------------------------------------
Iteration 3: Using features Agency, WorkflowStatus, UWDecision, CommDateProvided
Training AUC: 0.7553220046217409
Test AUC: 0.7540282938270958
Training Accuracy: 0.7553220046217408
Test Accuracy: 0.6956696835456453
---------------------------------------------


#### Cross Validation

In [36]:
# List of k values for k-fold cross-validation
num_folds_list = [5, 10, 20]

# Iterate through feature sets
for i, features in enumerate(feature_sets, start=1):
    print(f"Iteration {i}: Using features {', '.join(features)}")
    
    # Extract the selected features and target variable
    X = df[features]
    y = df["PolicyIssued"]

    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

    # Scaling features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Applying SMOTE to the training data
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

    # Convert the resampled arrays back to a dataframe
    resampled_df = pd.DataFrame(X_resampled, columns=features)
    resampled_df["PolicyIssued"] = y_resampled

    # Train the model on the entire training data using LGBMClassifier
    model = lgb.LGBMClassifier(random_state=42)
    model.fit(X_resampled, y_resampled)

    # Predict on the test data
    y_pred_test = model.predict(X_test_scaled)

    # Calculate Training AUC and Test AUC
    train_auc = roc_auc_score(y_resampled, model.predict(X_resampled))
    test_auc = roc_auc_score(y_test, y_pred_test)

    # Calculate Training Accuracy and Test Accuracy
    train_accuracy = accuracy_score(y_resampled, model.predict(X_resampled))
    test_accuracy = accuracy_score(y_test, y_pred_test)

    # Print evaluation metrics for training and test data
    print("Training AUC:", train_auc)
    print("Test AUC:", test_auc)
    print("Training Accuracy:", train_accuracy)
    print("Test Accuracy:", test_accuracy)
    print("---------------------------------------------")

    # Iterate through different numbers of folds
    for num_folds in num_folds_list:
        print(f"Number of Folds: {num_folds}")
        
        # Initialise k-fold cross-validation
        kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

        # Lists to store evaluation metrics for each fold
        fold_train_auc_scores = []
        fold_test_auc_scores = []
        fold_train_accuracy_scores = []
        fold_test_accuracy_scores = []

        # Perform k-fold cross-validation
        for train_index, val_index in kf.split(X_resampled):
            X_fold_train, y_fold_train = X_resampled[train_index], y_resampled[train_index]
            X_fold_val, y_fold_val = X_resampled[val_index], y_resampled[val_index]

            # Train the model on the fold training data using LGBMClassifier
            model = lgb.LGBMClassifier(random_state=42)
            model.fit(X_fold_train, y_fold_train)

            # Predict on the fold validation data
            y_fold_val_pred = model.predict(X_fold_val)
            y_fold_test_pred = model.predict(X_test_scaled)

            # Calculate Training AUC, Test AUC, Training Accuracy, and Test Accuracy for the fold
            fold_train_auc = roc_auc_score(y_fold_train, model.predict(X_fold_train))
            fold_test_auc = roc_auc_score(y_test, y_fold_test_pred)
            fold_train_accuracy = accuracy_score(y_fold_train, model.predict(X_fold_train))
            fold_test_accuracy = accuracy_score(y_test, y_fold_test_pred)

            fold_train_auc_scores.append(fold_train_auc)
            fold_test_auc_scores.append(fold_test_auc)
            fold_train_accuracy_scores.append(fold_train_accuracy)
            fold_test_accuracy_scores.append(fold_test_accuracy)

        # Calculate average metrics for the current iteration's folds
        avg_train_auc = sum(fold_train_auc_scores) / num_folds
        avg_test_auc = sum(fold_test_auc_scores) / num_folds
        avg_train_accuracy = sum(fold_train_accuracy_scores) / num_folds
        avg_test_accuracy = sum(fold_test_accuracy_scores) / num_folds

        # Print average metrics for cross-validation
        print("Average Training AUC ({} folds):".format(num_folds), avg_train_auc)
        print("Average Test AUC ({} folds):".format(num_folds), avg_test_auc)
        print("Average Training Accuracy ({} folds):".format(num_folds), avg_train_accuracy)
        print("Average Test Accuracy ({} folds):".format(num_folds), avg_test_accuracy)
        print("---------------------------------------------")

Iteration 1: Using features Agency, WorkflowStatus
Training AUC: 0.7490488083844914
Test AUC: 0.7503466034363131
Training Accuracy: 0.7490488083844915
Test Accuracy: 0.6870641599016512
---------------------------------------------
Number of Folds: 5
Average Training AUC (5 folds): 0.7495222158590036
Average Test AUC (5 folds): 0.7501561960741124
Average Training Accuracy (5 folds): 0.7495287808237527
Average Test Accuracy (5 folds): 0.6914093134657369
---------------------------------------------
Number of Folds: 10
Average Training AUC (10 folds): 0.7493608041817259
Average Test AUC (10 folds): 0.74991442321564
Average Training Accuracy (10 folds): 0.7493626306063883
Average Test Accuracy (10 folds): 0.6898832107505458
---------------------------------------------
Number of Folds: 20
Average Training AUC (20 folds): 0.749280750705894
Average Test AUC (20 folds): 0.7499847866648174
Average Training Accuracy (20 folds): 0.749281308223155
Average Test Accuracy (20 folds): 0.6903813137200

## Neural Network

#### 8020 Split

Loop over epochs

In [38]:
# List of epochs to iterate over
epochs_list = [10, 20, 30]

# Iterate through feature sets
for i, features in enumerate(feature_sets, start=1):
    print(f"Iteration {i}: Using features {', '.join(features)}")
    
    # Extract the selected features and target variable
    X = df[features]
    y = df["PolicyIssued"]

    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Scaling features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Applying SMOTE to the training data
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

    # Convert the resampled arrays back to a dataframe
    resampled_df = pd.DataFrame(X_resampled, columns=features)
    resampled_df["PolicyIssued"] = y_resampled

    # Train the model on the entire training data using a neural network
    model = Sequential()
    model.add(Dense(units=64, activation='relu', input_dim=len(features)))
    model.add(Dense(units=1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    # Loop over epochs
    for epochs in epochs_list:
        print(f"Epochs: {epochs}")
        
        model.fit(X_resampled, y_resampled, epochs=epochs, batch_size=32, verbose=0)

        # Predict on the test data
        y_pred_train = np.round(model.predict(X_resampled)).astype(int)
        y_pred_test = np.round(model.predict(X_test_scaled)).astype(int)

        # Calculate Training AUC and Test AUC
        train_auc = roc_auc_score(y_resampled, model.predict(X_resampled))
        test_auc = roc_auc_score(y_test, y_pred_test)

        # Calculate Training Accuracy and Test Accuracy
        train_accuracy = accuracy_score(y_resampled, y_pred_train)
        test_accuracy = accuracy_score(y_test, y_pred_test)

        # Print evaluation metrics for training and test data
        print("Training AUC:", train_auc)
        print("Test AUC:", test_auc)
        print("Training Accuracy:", train_accuracy)
        print("Test Accuracy:", test_accuracy)
        print("---------------------------------------------")

Iteration 1: Using features Agency, WorkflowStatus
Epochs: 10
Training AUC: 0.7784957515286566
Test AUC: 0.7476158375470874
Training Accuracy: 0.7440667061538776
Test Accuracy: 0.6757066098623343
---------------------------------------------
Epochs: 20
Training AUC: 0.78142820316126
Test AUC: 0.7474974591861525
Training Accuracy: 0.7438777802740957
Test Accuracy: 0.6740851429116459
---------------------------------------------
Epochs: 30
Training AUC: 0.7820888560185559
Test AUC: 0.7474823872567538
Training Accuracy: 0.7444139212842875
Test Accuracy: 0.6757384033319556
---------------------------------------------
Iteration 2: Using features Agency, WorkflowStatus, UWDecision
Epochs: 10
Training AUC: 0.7849537866859659
Test AUC: 0.7485037045484217
Training Accuracy: 0.7443475419211208
Test Accuracy: 0.6913807903856548
---------------------------------------------
Epochs: 20
Training AUC: 0.7875087629388271
Test AUC: 0.7472127681628731
Training Accuracy: 0.744168828251057
Test Accuracy:

#### 70/30 Split

In [39]:
# Iterate through feature sets
for i, features in enumerate(feature_sets, start=1):
    print(f"Iteration {i}: Using features {', '.join(features)}")
    
    # Extract the selected features and target variable
    X = df[features]
    y = df["PolicyIssued"]

    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

    # Scaling features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Applying SMOTE to the training data
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

    # Convert the resampled arrays back to a dataframe
    resampled_df = pd.DataFrame(X_resampled, columns=features)
    resampled_df["PolicyIssued"] = y_resampled

    # Train the model on the entire training data using a neural network
    model = Sequential()
    model.add(Dense(units=64, activation='relu', input_dim=len(features)))
    model.add(Dense(units=1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    # Loop over epochs
    for epochs in epochs_list:
        print(f"Epochs: {epochs}")
        
        model.fit(X_resampled, y_resampled, epochs=epochs, batch_size=32, verbose=0)

        # Predict on the test data
        y_pred_train = np.round(model.predict(X_resampled)).astype(int)
        y_pred_test = np.round(model.predict(X_test_scaled)).astype(int)

        # Calculate Training AUC and Test AUC
        train_auc = roc_auc_score(y_resampled, model.predict(X_resampled))
        test_auc = roc_auc_score(y_test, y_pred_test)

        # Calculate Training Accuracy and Test Accuracy
        train_accuracy = accuracy_score(y_resampled, y_pred_train)
        test_accuracy = accuracy_score(y_test, y_pred_test)

        # Print evaluation metrics for training and test data
        print("Training AUC:", train_auc)
        print("Test AUC:", test_auc)
        print("Training Accuracy:", train_accuracy)
        print("Test Accuracy:", test_accuracy)
        print("---------------------------------------------")

Iteration 1: Using features Agency, WorkflowStatus
Epochs: 10
Training AUC: 0.7785195319511153
Test AUC: 0.7465243530619929
Training Accuracy: 0.7451973576713895
Test Accuracy: 0.6750461010195214
---------------------------------------------
Epochs: 20
Training AUC: 0.7816407806920531
Test AUC: 0.7467155942170896
Training Accuracy: 0.745489134240564
Test Accuracy: 0.6751308845036986
---------------------------------------------
Epochs: 30
Training AUC: 0.7825015056341141
Test AUC: 0.7468590250834121
Training Accuracy: 0.7455766672113162
Test Accuracy: 0.6751944721168316
---------------------------------------------
Iteration 2: Using features Agency, WorkflowStatus, UWDecision
Epochs: 10
Training AUC: 0.7857355644551405
Test AUC: 0.7466275499057208
Training Accuracy: 0.7455358184916319
Test Accuracy: 0.6748341423090782
---------------------------------------------
Epochs: 20
Training AUC: 0.7882537044689049
Test AUC: 0.7475437143527823
Training Accuracy: 0.7453374104245932
Test Accurac