In [5]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import LabelEncoder

import lightgbm as lgb

from imblearn.over_sampling import SMOTE

from keras.models import Sequential
from keras.layers import Dense

import warnings
from sklearn.exceptions import ConvergenceWarning

# Ignore ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [6]:
df = pd.read_excel('Dataset.xlsx')

## Clean Data

In [7]:
# Get the list of columns to convert to categorical
categorical_columns = df.select_dtypes(include='int64').columns.tolist()

# Convert the selected columns to categorical
df[categorical_columns] = df[categorical_columns].astype('category')

numeric_data = df.select_dtypes(include=[np.number])

# Calculate MAD for each column
mad = numeric_data.mad()

# Choose a threshold multiplier
k = 3

# Calculate the threshold value
threshold = k * mad

# Identify outliers
outliers = (np.abs(numeric_data - numeric_data.median()) > threshold)

# Apply logarithm to the specified columns
outlier_columns = ['CommissionSacrificePercentage', 'BonusCommissionPercentage']
for column in outlier_columns:
    df[column] = np.log1p(df[column])
    
df = df.drop(columns=['PropDate'])


## Feature Engineering

In [8]:
# Convert categorical columns to strings
df['WorkflowStatus'] = df['WorkflowStatus'].astype(str)
df['UWDecision'] = df['UWDecision'].astype(str)

df['Product'] = df['Product'].astype(str)
df['ProductGroup'] = df['ProductGroup'].astype(str)
df['ProductType'] = df['ProductType'].astype(str)

df['CommissionSacrificeType'] = df['CommissionSacrificeType'].astype(str)
df['RenewalSacrificeType'] = df['RenewalSacrificeType'].astype(str)

df['CommDateProvided'] = df['CommDateProvided'].astype(str)
df['FreeCover'] = df['FreeCover'].astype(str)

# Combined Features
df['Combined_Status1'] = df['WorkflowStatus'] + '_' + df['UWDecision']
df['Combined_Status2'] = df['Product'] + '_' + df['ProductGroup'] + '_' + df['ProductType']
df['Combined_Status3'] = df['CommissionSacrificeType'] + '_' + df['RenewalSacrificeType']
df['Combined_Status4'] = df['CommDateProvided'] + '_' + df['FreeCover']

# Initialise LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the combined status
df['WorkflowStatus_UWDecision'] = label_encoder.fit_transform(df['Combined_Status1'])
df['Product_ProductGroup_ProductType'] = label_encoder.fit_transform(df['Combined_Status2'])
df['CommissionSacrificeType_RenewalSacrificeType'] = label_encoder.fit_transform(df['Combined_Status3'])
df['CommDateProvided_FreeCover'] = label_encoder.fit_transform(df['Combined_Status4'])


# Drop unnecessary columns
df.drop(['WorkflowStatus', 'UWDecision', 'Combined_Status1'], axis=1, inplace=True)
df.drop(['Product', 'ProductGroup', 'ProductType', 'Combined_Status2'], axis=1, inplace=True)
df.drop(['CommissionSacrificeType', 'RenewalSacrificeType', 'Combined_Status3'], axis=1, inplace=True)
df.drop(['CommDateProvided', 'FreeCover', 'Combined_Status4'], axis=1, inplace=True)

categorical_columns = df.select_dtypes(include='int32').columns.tolist()
df[categorical_columns] = df[categorical_columns].astype('category')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157263 entries, 0 to 157262
Data columns (total 17 columns):
 #   Column                                        Non-Null Count   Dtype   
---  ------                                        --------------   -----   
 0   Agency                                        157263 non-null  category
 1   Indexation                                    157263 non-null  category
 2   NoOfLives                                     157263 non-null  category
 3   PaymentFreq                                   157263 non-null  category
 4   ComissionSacrifice                            157263 non-null  category
 5   CommissionSacrificePercentage                 157263 non-null  float64 
 6   CommissionTerms                               157263 non-null  category
 7   Discount                                      157263 non-null  category
 8   BonusCommission                               157263 non-null  category
 9   BonusCommissionPercentage            

# Top-n Features

## Logistic Regression

#### 80/20 Split

In [10]:
# Select specific columns for each iteration
feature_sets = [
    ['Agency', 'WorkflowStatus_UWDecision'],
    ['Agency', 'WorkflowStatus_UWDecision', 'CommDateProvided_FreeCover'],
    ['Agency', 'WorkflowStatus_UWDecision', 'CommDateProvided_FreeCover', 'Product_ProductGroup_ProductType'],
    ['Agency', 'WorkflowStatus_UWDecision', 'CommDateProvided_FreeCover', 'Product_ProductGroup_ProductType','CommissionSacrificePercentage'],
    ['Agency', 'WorkflowStatus_UWDecision', 'CommDateProvided_FreeCover', 'Product_ProductGroup_ProductType','CommissionSacrificePercentage','BonusCommissionPercentage'],
    ['Agency', 'WorkflowStatus_UWDecision', 'CommDateProvided_FreeCover', 'Product_ProductGroup_ProductType','CommissionSacrificePercentage','BonusCommissionPercentage','SignedDecReceived']
]

# Iterate through feature sets
for i, features in enumerate(feature_sets, start=1):
    print(f"Iteration {i}: Using features {', '.join(features)}")
    
    # Extract the selected features and target variable
    X = df[features]
    y = df["PolicyIssued"]

    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Scaling features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Applying SMOTE to the training data
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

    # Convert the resampled arrays back to a dataframe
    resampled_df = pd.DataFrame(X_resampled, columns=features)
    resampled_df["PolicyIssued"] = y_resampled

    # Train the model on the entire training data
    model = LogisticRegression()
    model.fit(X_resampled, y_resampled)

    # Predict on the test data
    y_pred_test = model.predict(X_test_scaled)

    # Calculate Training AUC and Test AUC
    train_auc = roc_auc_score(y_resampled, model.predict(X_resampled))
    test_auc = roc_auc_score(y_test, y_pred_test)

    # Calculate Training Accuracy and Test Accuracy
    train_accuracy = accuracy_score(y_resampled, model.predict(X_resampled))
    test_accuracy = accuracy_score(y_test, y_pred_test)

    # Print evaluation metrics for training and test data
    print("Training AUC:", train_auc)
    print("Test AUC:", test_auc)
    print("Training Accuracy:", train_accuracy)
    print("Test Accuracy:", test_accuracy)
    print("---------------------------------------------")

Iteration 1: Using features Agency, WorkflowStatus_UWDecision
Training AUC: 0.7439441596372622
Test AUC: 0.7476153746967774
Training Accuracy: 0.7439441596372623
Test Accuracy: 0.6758655772104409
---------------------------------------------
Iteration 2: Using features Agency, WorkflowStatus_UWDecision, CommDateProvided_FreeCover
Training AUC: 0.7459253283225424
Test AUC: 0.7494802366784474
Training Accuracy: 0.7459253283225424
Test Accuracy: 0.6801259021397005
---------------------------------------------
Iteration 3: Using features Agency, WorkflowStatus_UWDecision, CommDateProvided_FreeCover, Product_ProductGroup_ProductType
Training AUC: 0.7434131247319294
Test AUC: 0.7469247200101298
Training Accuracy: 0.7434131247319294
Test Accuracy: 0.680539217244778
---------------------------------------------
Iteration 4: Using features Agency, WorkflowStatus_UWDecision, CommDateProvided_FreeCover, Product_ProductGroup_ProductType, CommissionSacrificePercentage
Training AUC: 0.74557300708727

#### Cross Validation

In [11]:

# List of k values for k-fold cross-validation
num_folds_list = [5, 10, 20]

# Iterate through feature sets
for i, features in enumerate(feature_sets, start=1):
    print(f"Iteration {i}: Using features {', '.join(features)}")
    
    # Extract the selected features and target variable
    X = df[features]
    y = df["PolicyIssued"]

    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Scaling features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Applying SMOTE to the training data
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

    # Convert the resampled arrays back to a dataframe
    resampled_df = pd.DataFrame(X_resampled, columns=features)
    resampled_df["PolicyIssued"] = y_resampled

    # Iterate through different numbers of folds
    for num_folds in num_folds_list:
        print(f"Number of Folds: {num_folds}")
        
        # Initialise k-fold cross-validation
        kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

        # Lists to store evaluation metrics for each fold
        fold_auc_scores = []
        fold_accuracy_scores = []

        # Perform k-fold cross-validation
        for train_index, val_index in kf.split(X_resampled):
            X_fold_train, y_fold_train = X_resampled[train_index], y_resampled[train_index]
            X_fold_val, y_fold_val = X_resampled[val_index], y_resampled[val_index]

            # Train the model on the fold training data
            model = LogisticRegression()
            model.fit(X_fold_train, y_fold_train)

            # Predict on the fold validation data
            y_fold_val_pred = model.predict(X_fold_val)

            # Calculate AUC and accuracy for the fold
            fold_auc = roc_auc_score(y_fold_val, y_fold_val_pred)
            fold_accuracy = accuracy_score(y_fold_val, y_fold_val_pred)

            fold_auc_scores.append(fold_auc)
            fold_accuracy_scores.append(fold_accuracy)

        # Calculate average AUC and accuracy for the current number of folds
        avg_auc = sum(fold_auc_scores) / num_folds
        avg_accuracy = sum(fold_accuracy_scores) / num_folds

        # Print average metrics for cross-validation
        print("Average AUC:", avg_auc)
        print("Average Accuracy:", avg_accuracy)
        print("---------------------------------------------")
        
        # Train the model on the entire training data
        model.fit(X_resampled, y_resampled)

        # Predict on the test data
        y_pred = model.predict(X_test_scaled)

        # Calculate Test AUC and Test Accuracy
        test_auc = roc_auc_score(y_test, y_pred)
        test_accuracy = accuracy_score(y_test, y_pred)

        # Print evaluation metrics for the test data
        print("Test AUC:", test_auc)
        print("Test Accuracy:", test_accuracy)
        print("---------------------------------------------")

Iteration 1: Using features Agency, WorkflowStatus_UWDecision
Number of Folds: 5
Average AUC: 0.7439607778623281
Average Accuracy: 0.7439492701738503
---------------------------------------------
Test AUC: 0.7476153746967774
Test Accuracy: 0.6758655772104409
---------------------------------------------
Number of Folds: 10
Average AUC: 0.7439567060261033
Average Accuracy: 0.7439492712134637
---------------------------------------------
Test AUC: 0.7476153746967774
Test Accuracy: 0.6758655772104409
---------------------------------------------
Number of Folds: 20
Average AUC: 0.743958086062096
Average Accuracy: 0.743949330908799
---------------------------------------------
Test AUC: 0.7476153746967774
Test Accuracy: 0.6758655772104409
---------------------------------------------
Iteration 2: Using features Agency, WorkflowStatus_UWDecision, CommDateProvided_FreeCover
Number of Folds: 5
Average AUC: 0.7459358774341734
Average Accuracy: 0.7459253315290604
-------------------------------

#### 70/30 Split

In [12]:
# Iterate through feature sets
for i, features in enumerate(feature_sets, start=1):
    print(f"Iteration {i}: Using features {', '.join(features)}")
    
    # Extract the selected features and target variable
    X = df[features]
    y = df["PolicyIssued"]

    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

    # Scaling features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Applying SMOTE to the training data
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

    # Convert the resampled arrays back to a dataframe
    resampled_df = pd.DataFrame(X_resampled, columns=features)
    resampled_df["PolicyIssued"] = y_resampled

    # Train the model on the entire training data
    model = LogisticRegression()
    model.fit(X_resampled, y_resampled)

    # Predict on the test data
    y_pred_test = model.predict(X_test_scaled)

    # Calculate Training AUC and Test AUC
    train_auc = roc_auc_score(y_resampled, model.predict(X_resampled))
    test_auc = roc_auc_score(y_test, y_pred_test)

    # Calculate Training Accuracy and Test Accuracy
    train_accuracy = accuracy_score(y_resampled, model.predict(X_resampled))
    test_accuracy = accuracy_score(y_test, y_pred_test)

    # Print evaluation metrics for training and test data
    print("Training AUC:", train_auc)
    print("Test AUC:", test_auc)
    print("Training Accuracy:", train_accuracy)
    print("Test Accuracy:", test_accuracy)
    print("---------------------------------------------")

Iteration 1: Using features Agency, WorkflowStatus_UWDecision
Training AUC: 0.7451973576713895
Test AUC: 0.7466472060133872
Training Accuracy: 0.7451973576713895
Test Accuracy: 0.6751308845036986
---------------------------------------------
Iteration 2: Using features Agency, WorkflowStatus_UWDecision, CommDateProvided_FreeCover
Training AUC: 0.7471814383417754
Test AUC: 0.7486929085507982
Training Accuracy: 0.7471814383417754
Test Accuracy: 0.6795396256809173
---------------------------------------------
Iteration 3: Using features Agency, WorkflowStatus_UWDecision, CommDateProvided_FreeCover, Product_ProductGroup_ProductType
Training AUC: 0.7406981629747205
Test AUC: 0.7422477285249909
Training Accuracy: 0.7406981629747205
Test Accuracy: 0.6805782233620891
---------------------------------------------
Iteration 4: Using features Agency, WorkflowStatus_UWDecision, CommDateProvided_FreeCover, Product_ProductGroup_ProductType, CommissionSacrificePercentage
Training AUC: 0.7451448378889

#### Cross Validation

In [13]:
# List of k values for k-fold cross-validation
num_folds_list = [5, 10, 20]

# Iterate through feature sets
for i, features in enumerate(feature_sets, start=1):
    print(f"Iteration {i}: Using features {', '.join(features)}")
    
    # Extract the selected features and target variable
    X = df[features]
    y = df["PolicyIssued"]

    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

    # Scaling features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Applying SMOTE to the training data
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

    # Convert the resampled arrays back to a dataframe
    resampled_df = pd.DataFrame(X_resampled, columns=features)
    resampled_df["PolicyIssued"] = y_resampled

    # Iterate through different numbers of folds
    for num_folds in num_folds_list:
        print(f"Number of Folds: {num_folds}")
        
        # Initialise k-fold cross-validation
        kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

        # Lists to store evaluation metrics for each fold
        fold_auc_scores = []
        fold_accuracy_scores = []

        # Perform k-fold cross-validation
        for train_index, val_index in kf.split(X_resampled):
            X_fold_train, y_fold_train = X_resampled[train_index], y_resampled[train_index]
            X_fold_val, y_fold_val = X_resampled[val_index], y_resampled[val_index]

            # Train the model on the fold training data
            model = LogisticRegression()
            model.fit(X_fold_train, y_fold_train)

            # Predict on the fold validation data
            y_fold_val_pred = model.predict(X_fold_val)

            # Calculate AUC and accuracy for the fold
            fold_auc = roc_auc_score(y_fold_val, y_fold_val_pred)
            fold_accuracy = accuracy_score(y_fold_val, y_fold_val_pred)

            fold_auc_scores.append(fold_auc)
            fold_accuracy_scores.append(fold_accuracy)

        # Calculate average AUC and accuracy for the current number of folds
        avg_auc = sum(fold_auc_scores) / num_folds
        avg_accuracy = sum(fold_accuracy_scores) / num_folds

        # Print average metrics for cross-validation
        print("Average AUC:", avg_auc)
        print("Average Accuracy:", avg_accuracy)
        print("---------------------------------------------")
        
        # Train the model on the entire training data
        model.fit(X_resampled, y_resampled)

        # Predict on the test data
        y_pred = model.predict(X_test_scaled)

        # Calculate Test AUC and Test Accuracy
        test_auc = roc_auc_score(y_test, y_pred)
        test_accuracy = accuracy_score(y_test, y_pred)

        # Print evaluation metrics for the test data
        print("Test AUC:", test_auc)
        print("Test Accuracy:", test_accuracy)
        print("---------------------------------------------")

Iteration 1: Using features Agency, WorkflowStatus_UWDecision
Number of Folds: 5
Average AUC: 0.7451942961415332
Average Accuracy: 0.7451973662864031
---------------------------------------------
Test AUC: 0.7466472060133872
Test Accuracy: 0.6751308845036986
---------------------------------------------
Number of Folds: 10
Average AUC: 0.745195903338684
Average Accuracy: 0.7451973889281452
---------------------------------------------
Test AUC: 0.7466472060133872
Test Accuracy: 0.6751308845036986
---------------------------------------------
Number of Folds: 20
Average AUC: 0.7451858817533348
Average Accuracy: 0.7451973640625191
---------------------------------------------
Test AUC: 0.7466472060133872
Test Accuracy: 0.6751308845036986
---------------------------------------------
Iteration 2: Using features Agency, WorkflowStatus_UWDecision, CommDateProvided_FreeCover
Number of Folds: 5
Average AUC: 0.7471784284954918
Average Accuracy: 0.7471814476381397
------------------------------

### Stochastic Gradient Descent

#### 80/20 Split

In [14]:
# Iterate through feature sets
for i, features in enumerate(feature_sets, start=1):
    print(f"Iteration {i}: Using features {', '.join(features)}")
    
    # Extract the selected features and target variable
    X = df[features]
    y = df["PolicyIssued"]

    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Scaling features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Applying SMOTE to the training data
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

    # Convert the resampled arrays back to a dataframe
    resampled_df = pd.DataFrame(X_resampled, columns=features)
    resampled_df["PolicyIssued"] = y_resampled

    # Train the model on the entire training data using SGDClassifier
    model = SGDClassifier(loss='log', random_state=42)
    model.fit(X_resampled, y_resampled)

    # Predict on the test data
    y_pred_test = model.predict(X_test_scaled)

    # Calculate Training AUC and Test AUC
    train_auc = roc_auc_score(y_resampled, model.predict(X_resampled))
    test_auc = roc_auc_score(y_test, y_pred_test)

    # Calculate Training Accuracy and Test Accuracy
    train_accuracy = accuracy_score(y_resampled, model.predict(X_resampled))
    test_accuracy = accuracy_score(y_test, y_pred_test)

    # Print evaluation metrics for training and test data
    print("Training AUC:", train_auc)
    print("Test AUC:", test_auc)
    print("Training Accuracy:", train_accuracy)
    print("Test Accuracy:", test_accuracy)
    print("---------------------------------------------")

Iteration 1: Using features Agency, WorkflowStatus_UWDecision
Training AUC: 0.7439441596372622
Test AUC: 0.7476153746967774
Training Accuracy: 0.7439441596372623
Test Accuracy: 0.6758655772104409
---------------------------------------------
Iteration 2: Using features Agency, WorkflowStatus_UWDecision, CommDateProvided_FreeCover
Training AUC: 0.7459661771614142
Test AUC: 0.7494702561024588
Training Accuracy: 0.7459661771614142
Test Accuracy: 0.6800305217308364
---------------------------------------------
Iteration 3: Using features Agency, WorkflowStatus_UWDecision, CommDateProvided_FreeCover, Product_ProductGroup_ProductType
Training AUC: 0.7449194256653255
Test AUC: 0.7484026537797891
Training Accuracy: 0.7449194256653254
Test Accuracy: 0.6802848694878072
---------------------------------------------
Iteration 4: Using features Agency, WorkflowStatus_UWDecision, CommDateProvided_FreeCover, Product_ProductGroup_ProductType, CommissionSacrificePercentage
Training AUC: 0.7414013194174

#### Cross Validation

In [15]:
# List of k values for k-fold cross-validation
num_folds_list = [5, 10, 20]

# Iterate through feature sets
for i, features in enumerate(feature_sets, start=1):
    print(f"Iteration {i}: Using features {', '.join(features)}")
    
    # Extract the selected features and target variable
    X = df[features]
    y = df["PolicyIssued"]

    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Scaling features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Applying SMOTE to the training data
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

    # Convert the resampled arrays back to a dataframe
    resampled_df = pd.DataFrame(X_resampled, columns=features)
    resampled_df["PolicyIssued"] = y_resampled

    # Train the model on the entire training data using SGDClassifier
    model = SGDClassifier(loss='log', random_state=42)
    model.fit(X_resampled, y_resampled)

    # Predict on the test data
    y_pred_test = model.predict(X_test_scaled)

    # Calculate Training AUC and Test AUC
    train_auc = roc_auc_score(y_resampled, model.predict(X_resampled))
    test_auc = roc_auc_score(y_test, y_pred_test)

    # Calculate Training Accuracy and Test Accuracy
    train_accuracy = accuracy_score(y_resampled, model.predict(X_resampled))
    test_accuracy = accuracy_score(y_test, y_pred_test)

    # Print evaluation metrics for training and test data
    print("Training AUC:", train_auc)
    print("Test AUC:", test_auc)
    print("Training Accuracy:", train_accuracy)
    print("Test Accuracy:", test_accuracy)
    print("---------------------------------------------")
    
    # Iterate through different numbers of folds
    for num_folds in num_folds_list:
        print(f"Number of Folds: {num_folds}")
        
        # Initialise k-fold cross-validation
        kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

        # Lists to store evaluation metrics for each fold
        fold_train_auc_scores = []
        fold_test_auc_scores = []
        fold_train_accuracy_scores = []
        fold_test_accuracy_scores = []

        # Perform k-fold cross-validation
        for train_index, val_index in kf.split(X_resampled):
            X_fold_train, y_fold_train = X_resampled[train_index], y_resampled[train_index]
            X_fold_val, y_fold_val = X_resampled[val_index], y_resampled[val_index]

            # Train the model on the fold training data using SGDClassifier
            model = SGDClassifier(loss='log', random_state=42)
            model.fit(X_fold_train, y_fold_train)

            # Predict on the fold validation data
            y_fold_val_pred = model.predict(X_fold_val)
            y_fold_test_pred = model.predict(X_test_scaled)

            # Calculate Training AUC, Test AUC, Training Accuracy, and Test Accuracy for the fold
            fold_train_auc = roc_auc_score(y_fold_train, model.predict(X_fold_train))
            fold_test_auc = roc_auc_score(y_test, y_fold_test_pred)
            fold_train_accuracy = accuracy_score(y_fold_train, model.predict(X_fold_train))
            fold_test_accuracy = accuracy_score(y_test, y_fold_test_pred)

            fold_train_auc_scores.append(fold_train_auc)
            fold_test_auc_scores.append(fold_test_auc)
            fold_train_accuracy_scores.append(fold_train_accuracy)
            fold_test_accuracy_scores.append(fold_test_accuracy)

        # Calculate average metrics for the current iteration's folds
        avg_train_auc = sum(fold_train_auc_scores) / num_folds
        avg_test_auc = sum(fold_test_auc_scores) / num_folds
        avg_train_accuracy = sum(fold_train_accuracy_scores) / num_folds
        avg_test_accuracy = sum(fold_test_accuracy_scores) / num_folds

        # Print average metrics for cross-validation
        print("Average Training AUC ({} folds):".format(num_folds), avg_train_auc)
        print("Average Test AUC ({} folds):".format(num_folds), avg_test_auc)
        print("Average Training Accuracy ({} folds):".format(num_folds), avg_train_accuracy)
        print("Average Test Accuracy ({} folds):".format(num_folds), avg_test_accuracy)
        print("---------------------------------------------")

Iteration 1: Using features Agency, WorkflowStatus_UWDecision
Training AUC: 0.7439441596372622
Test AUC: 0.7476153746967774
Training Accuracy: 0.7439441596372623
Test Accuracy: 0.6758655772104409
---------------------------------------------
Number of Folds: 5
Average Training AUC (5 folds): 0.743932136668168
Average Test AUC (5 folds): 0.7475723454368806
Average Training Accuracy (5 folds): 0.743931394653722
Average Test Accuracy (5 folds): 0.6758465011286681
---------------------------------------------
Number of Folds: 10
Average Training AUC (10 folds): 0.742540189707563
Average Test AUC (10 folds): 0.7461322048597501
Average Training Accuracy (10 folds): 0.7425405434342828
Average Test Accuracy (10 folds): 0.6772136203223857
---------------------------------------------
Number of Folds: 20
Average Training AUC (20 folds): 0.7433994854811516
Average Test AUC (20 folds): 0.7470236356620001
Average Training Accuracy (20 folds): 0.7433986134154545
Average Test Accuracy (20 folds): 0.6

#### 70/30 Split

In [16]:
# Iterate through feature sets
for i, features in enumerate(feature_sets, start=1):
    print(f"Iteration {i}: Using features {', '.join(features)}")
    
    # Extract the selected features and target variable
    X = df[features]
    y = df["PolicyIssued"]

    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

    # Scaling features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Applying SMOTE to the training data
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

    # Convert the resampled arrays back to a dataframe
    resampled_df = pd.DataFrame(X_resampled, columns=features)
    resampled_df["PolicyIssued"] = y_resampled

    # Train the model on the entire training data using SGDClassifier
    model = SGDClassifier(loss='log', random_state=42)
    model.fit(X_resampled, y_resampled)

    # Predict on the test data
    y_pred_test = model.predict(X_test_scaled)

    # Calculate Training AUC and Test AUC
    train_auc = roc_auc_score(y_resampled, model.predict(X_resampled))
    test_auc = roc_auc_score(y_test, y_pred_test)

    # Calculate Training Accuracy and Test Accuracy
    train_accuracy = accuracy_score(y_resampled, model.predict(X_resampled))
    test_accuracy = accuracy_score(y_test, y_pred_test)

    # Print evaluation metrics for training and test data
    print("Training AUC:", train_auc)
    print("Test AUC:", test_auc)
    print("Training Accuracy:", train_accuracy)
    print("Test Accuracy:", test_accuracy)
    print("---------------------------------------------")

Iteration 1: Using features Agency, WorkflowStatus_UWDecision
Training AUC: 0.7451915221400061
Test AUC: 0.7464831972321369
Training Accuracy: 0.7451915221400061
Test Accuracy: 0.6750884927616101
---------------------------------------------
Iteration 2: Using features Agency, WorkflowStatus_UWDecision, CommDateProvided_FreeCover
Training AUC: 0.7471697672790084
Test AUC: 0.7487201409246443
Training Accuracy: 0.7471697672790084
Test Accuracy: 0.679582017423006
---------------------------------------------
Iteration 3: Using features Agency, WorkflowStatus_UWDecision, CommDateProvided_FreeCover, Product_ProductGroup_ProductType
Training AUC: 0.7321724516234449
Test AUC: 0.7344017139386356
Training Accuracy: 0.7321724516234448
Test Accuracy: 0.6835880370503826
---------------------------------------------
Iteration 4: Using features Agency, WorkflowStatus_UWDecision, CommDateProvided_FreeCover, Product_ProductGroup_ProductType, CommissionSacrificePercentage
Training AUC: 0.73661912653766

#### Cross Validation

In [17]:
# List of k values for k-fold cross-validation
num_folds_list = [5, 10, 20]

# Iterate through feature sets
for i, features in enumerate(feature_sets, start=1):
    print(f"Iteration {i}: Using features {', '.join(features)}")
    
    # Extract the selected features and target variable
    X = df[features]
    y = df["PolicyIssued"]

    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

    # Scaling features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Applying SMOTE to the training data
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

    # Convert the resampled arrays back to a dataframe
    resampled_df = pd.DataFrame(X_resampled, columns=features)
    resampled_df["PolicyIssued"] = y_resampled

    # Train the model on the entire training data using SGDClassifier
    model = SGDClassifier(loss='log', random_state=42)
    model.fit(X_resampled, y_resampled)

    # Predict on the test data
    y_pred_test = model.predict(X_test_scaled)

    # Calculate Training AUC and Test AUC
    train_auc = roc_auc_score(y_resampled, model.predict(X_resampled))
    test_auc = roc_auc_score(y_test, y_pred_test)

    # Calculate Training Accuracy and Test Accuracy
    train_accuracy = accuracy_score(y_resampled, model.predict(X_resampled))
    test_accuracy = accuracy_score(y_test, y_pred_test)

    # Print evaluation metrics for training and test data
    print("Training AUC:", train_auc)
    print("Test AUC:", test_auc)
    print("Training Accuracy:", train_accuracy)
    print("Test Accuracy:", test_accuracy)
    print("---------------------------------------------")
    
    # Iterate through different numbers of folds
    for num_folds in num_folds_list:
        print(f"Number of Folds: {num_folds}")
        
        # Initialise k-fold cross-validation
        kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

        # Lists to store evaluation metrics for each fold
        fold_train_auc_scores = []
        fold_test_auc_scores = []
        fold_train_accuracy_scores = []
        fold_test_accuracy_scores = []

        # Perform k-fold cross-validation
        for train_index, val_index in kf.split(X_resampled):
            X_fold_train, y_fold_train = X_resampled[train_index], y_resampled[train_index]
            X_fold_val, y_fold_val = X_resampled[val_index], y_resampled[val_index]

            # Train the model on the fold training data using SGDClassifier
            model = SGDClassifier(loss='log', random_state=42)
            model.fit(X_fold_train, y_fold_train)

            # Predict on the fold validation data
            y_fold_val_pred = model.predict(X_fold_val)
            y_fold_test_pred = model.predict(X_test_scaled)

            # Calculate Training AUC, Test AUC, Training Accuracy, and Test Accuracy for the fold
            fold_train_auc = roc_auc_score(y_fold_train, model.predict(X_fold_train))
            fold_test_auc = roc_auc_score(y_test, y_fold_test_pred)
            fold_train_accuracy = accuracy_score(y_fold_train, model.predict(X_fold_train))
            fold_test_accuracy = accuracy_score(y_test, y_fold_test_pred)

            fold_train_auc_scores.append(fold_train_auc)
            fold_test_auc_scores.append(fold_test_auc)
            fold_train_accuracy_scores.append(fold_train_accuracy)
            fold_test_accuracy_scores.append(fold_test_accuracy)

        # Calculate average metrics for the current iteration's folds
        avg_train_auc = sum(fold_train_auc_scores) / num_folds
        avg_test_auc = sum(fold_test_auc_scores) / num_folds
        avg_train_accuracy = sum(fold_train_accuracy_scores) / num_folds
        avg_test_accuracy = sum(fold_test_accuracy_scores) / num_folds

        # Print average metrics for cross-validation
        print("Average Training AUC ({} folds):".format(num_folds), avg_train_auc)
        print("Average Test AUC ({} folds):".format(num_folds), avg_test_auc)
        print("Average Training Accuracy ({} folds):".format(num_folds), avg_train_accuracy)
        print("Average Test Accuracy ({} folds):".format(num_folds), avg_test_accuracy)
        print("---------------------------------------------")

Iteration 1: Using features Agency, WorkflowStatus_UWDecision
Training AUC: 0.7451915221400061
Test AUC: 0.7464831972321369
Training Accuracy: 0.7451915221400061
Test Accuracy: 0.6750884927616101
---------------------------------------------
Number of Folds: 5
Average Training AUC (5 folds): 0.7390157425824853
Average Test AUC (5 folds): 0.7412209240311606
Average Training Accuracy (5 folds): 0.7390248560880666
Average Test Accuracy (5 folds): 0.6795014731130375
---------------------------------------------
Number of Folds: 10
Average Training AUC (10 folds): 0.7438147539841916
Average Test AUC (10 folds): 0.7454144352746497
Average Training Accuracy (10 folds): 0.7438143406138545
Average Test Accuracy (10 folds): 0.6764535068568642
---------------------------------------------
Number of Folds: 20
Average Training AUC (20 folds): 0.7442956986605849
Average Test AUC (20 folds): 0.7456275501478904
Average Training Accuracy (20 folds): 0.7442956150448106
Average Test Accuracy (20 folds): 

### Decision Tree

#### 80/20 Split

In [18]:
# Iterate through feature sets
for i, features in enumerate(feature_sets, start=1):
    print(f"Iteration {i}: Using features {', '.join(features)}")
    
    # Extract the selected features and target variable
    X = df[features]
    y = df["PolicyIssued"]

    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Scaling features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Applying SMOTE to the training data
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

    # Convert the resampled arrays back to a dataframe
    resampled_df = pd.DataFrame(X_resampled, columns=features)
    resampled_df["PolicyIssued"] = y_resampled

    # Train the model on the entire training data using DecisionTreeClassifier
    model = DecisionTreeClassifier(random_state=42)
    model.fit(X_resampled, y_resampled)

    # Predict on the test data
    y_pred_test = model.predict(X_test_scaled)

    # Calculate Training AUC and Test AUC
    train_auc = roc_auc_score(y_resampled, model.predict(X_resampled))
    test_auc = roc_auc_score(y_test, y_pred_test)

    # Calculate Training Accuracy and Test Accuracy
    train_accuracy = accuracy_score(y_resampled, model.predict(X_resampled))
    test_accuracy = accuracy_score(y_test, y_pred_test)

    # Print evaluation metrics for training and test data
    print("Training AUC:", train_auc)
    print("Test AUC:", test_auc)
    print("Training Accuracy:", train_accuracy)
    print("Test Accuracy:", test_accuracy)
    print("---------------------------------------------")

Iteration 1: Using features Agency, WorkflowStatus_UWDecision
Training AUC: 0.7885408794755009
Test AUC: 0.7485773621952256
Training Accuracy: 0.7885408794755009
Test Accuracy: 0.7295011604616412
---------------------------------------------
Iteration 2: Using features Agency, WorkflowStatus_UWDecision, CommDateProvided_FreeCover
Training AUC: 0.7924674741120483
Test AUC: 0.749970360003565
Training Accuracy: 0.7924674741120483
Test Accuracy: 0.730232410262932
---------------------------------------------
Iteration 3: Using features Agency, WorkflowStatus_UWDecision, CommDateProvided_FreeCover, Product_ProductGroup_ProductType
Training AUC: 0.8157717366883847
Test AUC: 0.7372691791868846
Training Accuracy: 0.8157717366883847
Test Accuracy: 0.7377674625631895
---------------------------------------------
Iteration 4: Using features Agency, WorkflowStatus_UWDecision, CommDateProvided_FreeCover, Product_ProductGroup_ProductType, CommissionSacrificePercentage
Training AUC: 0.832463593472355

#### Cross-Validation

In [19]:
# List of k values for k-fold cross-validation
num_folds_list = [5, 10, 20]

# Iterate through feature sets
for i, features in enumerate(feature_sets, start=1):
    print(f"Iteration {i}: Using features {', '.join(features)}")
    
    # Extract the selected features and target variable
    X = df[features]
    y = df["PolicyIssued"]

    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Scaling features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Applying SMOTE to the training data
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

    # Convert the resampled arrays back to a dataframe
    resampled_df = pd.DataFrame(X_resampled, columns=features)
    resampled_df["PolicyIssued"] = y_resampled

    # Train the model on the entire training data using DecisionTreeClassifier
    model = DecisionTreeClassifier(random_state=42)
    model.fit(X_resampled, y_resampled)

    # Predict on the test data
    y_pred_test = model.predict(X_test_scaled)

    # Calculate Training AUC and Test AUC
    train_auc = roc_auc_score(y_resampled, model.predict(X_resampled))
    test_auc = roc_auc_score(y_test, y_pred_test)

    # Calculate Training Accuracy and Test Accuracy
    train_accuracy = accuracy_score(y_resampled, model.predict(X_resampled))
    test_accuracy = accuracy_score(y_test, y_pred_test)

    # Print evaluation metrics for training and test data
    print("Training AUC:", train_auc)
    print("Test AUC:", test_auc)
    print("Training Accuracy:", train_accuracy)
    print("Test Accuracy:", test_accuracy)
    print("---------------------------------------------")
    
    # Iterate through different numbers of folds
    for num_folds in num_folds_list:
        print(f"Number of Folds: {num_folds}")
        
        # Initialise k-fold cross-validation
        kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

        # Lists to store evaluation metrics for each fold
        fold_train_auc_scores = []
        fold_test_auc_scores = []
        fold_train_accuracy_scores = []
        fold_test_accuracy_scores = []

        # Perform k-fold cross-validation
        for train_index, val_index in kf.split(X_resampled):
            X_fold_train, y_fold_train = X_resampled[train_index], y_resampled[train_index]
            X_fold_val, y_fold_val = X_resampled[val_index], y_resampled[val_index]

            # Train the model on the fold training data using DecisionTreeClassifier
            model = DecisionTreeClassifier(random_state=42)
            model.fit(X_fold_train, y_fold_train)

            # Predict on the fold validation data
            y_fold_val_pred = model.predict(X_fold_val)
            y_fold_test_pred = model.predict(X_test_scaled)

            # Calculate Training AUC, Test AUC, Training Accuracy, and Test Accuracy for the fold
            fold_train_auc = roc_auc_score(y_fold_train, model.predict(X_fold_train))
            fold_test_auc = roc_auc_score(y_test, y_fold_test_pred)
            fold_train_accuracy = accuracy_score(y_fold_train, model.predict(X_fold_train))
            fold_test_accuracy = accuracy_score(y_test, y_fold_test_pred)

            fold_train_auc_scores.append(fold_train_auc)
            fold_test_auc_scores.append(fold_test_auc)
            fold_train_accuracy_scores.append(fold_train_accuracy)
            fold_test_accuracy_scores.append(fold_test_accuracy)

        # Calculate average metrics for the current iteration's folds
        avg_train_auc = sum(fold_train_auc_scores) / num_folds
        avg_test_auc = sum(fold_test_auc_scores) / num_folds
        avg_train_accuracy = sum(fold_train_accuracy_scores) / num_folds
        avg_test_accuracy = sum(fold_test_accuracy_scores) / num_folds

        # Print average metrics for cross-validation
        print("Average Training AUC ({} folds):".format(num_folds), avg_train_auc)
        print("Average Test AUC ({} folds):".format(num_folds), avg_test_auc)
        print("Average Training Accuracy ({} folds):".format(num_folds), avg_train_accuracy)
        print("Average Test Accuracy ({} folds):".format(num_folds), avg_test_accuracy)
        print("---------------------------------------------")

Iteration 1: Using features Agency, WorkflowStatus_UWDecision
Training AUC: 0.7885408794755009
Test AUC: 0.7485773621952256
Training Accuracy: 0.7885408794755009
Test Accuracy: 0.7295011604616412
---------------------------------------------
Number of Folds: 5
Average Training AUC (5 folds): 0.7894537148505918
Average Test AUC (5 folds): 0.7483430357304268
Average Training Accuracy (5 folds): 0.7894535955107211
Average Test Accuracy (5 folds): 0.7282421390646362
---------------------------------------------
Number of Folds: 10
Average Training AUC (10 folds): 0.7889545377405642
Average Test AUC (10 folds): 0.7480283730989712
Average Training Accuracy (10 folds): 0.7889544737395555
Average Test Accuracy (10 folds): 0.7285028455155311
---------------------------------------------
Number of Folds: 20
Average Training AUC (20 folds): 0.7887415750429241
Average Test AUC (20 folds): 0.7483579299897544
Average Training Accuracy (20 folds): 0.7887416300268568
Average Test Accuracy (20 folds): 

#### 70/30 Split

In [20]:
# Iterate through feature sets
for i, features in enumerate(feature_sets, start=1):
    print(f"Iteration {i}: Using features {', '.join(features)}")
    
    # Extract the selected features and target variable
    X = df[features]
    y = df["PolicyIssued"]

    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

    # Scaling features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Applying SMOTE to the training data
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

    # Convert the resampled arrays back to a dataframe
    resampled_df = pd.DataFrame(X_resampled, columns=features)
    resampled_df["PolicyIssued"] = y_resampled

    # Train the model on the entire training data using DecisionTreeClassifier
    model = DecisionTreeClassifier(random_state=42)
    model.fit(X_resampled, y_resampled)

    # Predict on the test data
    y_pred_test = model.predict(X_test_scaled)

    # Calculate Training AUC and Test AUC
    train_auc = roc_auc_score(y_resampled, model.predict(X_resampled))
    test_auc = roc_auc_score(y_test, y_pred_test)

    # Calculate Training Accuracy and Test Accuracy
    train_accuracy = accuracy_score(y_resampled, model.predict(X_resampled))
    test_accuracy = accuracy_score(y_test, y_pred_test)

    # Print evaluation metrics for training and test data
    print("Training AUC:", train_auc)
    print("Test AUC:", test_auc)
    print("Training Accuracy:", train_accuracy)
    print("Test Accuracy:", test_accuracy)
    print("---------------------------------------------")

Iteration 1: Using features Agency, WorkflowStatus_UWDecision
Training AUC: 0.7925585303797763
Test AUC: 0.7435855364670104
Training Accuracy: 0.7925585303797764
Test Accuracy: 0.7286504588906081
---------------------------------------------
Iteration 2: Using features Agency, WorkflowStatus_UWDecision, CommDateProvided_FreeCover
Training AUC: 0.7964391587497958
Test AUC: 0.7446225175567698
Training Accuracy: 0.7964391587497958
Test Accuracy: 0.7295194896034253
---------------------------------------------
Iteration 3: Using features Agency, WorkflowStatus_UWDecision, CommDateProvided_FreeCover, Product_ProductGroup_ProductType
Training AUC: 0.8198162974720478
Test AUC: 0.7345028979094447
Training Accuracy: 0.8198162974720478
Test Accuracy: 0.7359630344008987
---------------------------------------------
Iteration 4: Using features Agency, WorkflowStatus_UWDecision, CommDateProvided_FreeCover, Product_ProductGroup_ProductType, CommissionSacrificePercentage
Training AUC: 0.8370486216474

#### Cross Validation

In [21]:
# List of k values for k-fold cross-validation
num_folds_list = [5, 10, 20]

# Iterate through feature sets
for i, features in enumerate(feature_sets, start=1):
    print(f"Iteration {i}: Using features {', '.join(features)}")
    
    # Extract the selected features and target variable
    X = df[features]
    y = df["PolicyIssued"]

    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

    # Scaling features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Applying SMOTE to the training data
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

    # Convert the resampled arrays back to a dataframe
    resampled_df = pd.DataFrame(X_resampled, columns=features)
    resampled_df["PolicyIssued"] = y_resampled

    # Train the model on the entire training data using DecisionTreeClassifier
    model = DecisionTreeClassifier(random_state=42)
    model.fit(X_resampled, y_resampled)

    # Predict on the test data
    y_pred_test = model.predict(X_test_scaled)

    # Calculate Training AUC and Test AUC
    train_auc = roc_auc_score(y_resampled, model.predict(X_resampled))
    test_auc = roc_auc_score(y_test, y_pred_test)

    # Calculate Training Accuracy and Test Accuracy
    train_accuracy = accuracy_score(y_resampled, model.predict(X_resampled))
    test_accuracy = accuracy_score(y_test, y_pred_test)

    # Print evaluation metrics for training and test data
    print("Training AUC:", train_auc)
    print("Test AUC:", test_auc)
    print("Training Accuracy:", train_accuracy)
    print("Test Accuracy:", test_accuracy)
    print("---------------------------------------------")
    
    # Iterate through different numbers of folds
    for num_folds in num_folds_list:
        print(f"Number of Folds: {num_folds}")
        
        # Initialise k-fold cross-validation
        kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

        # Lists to store evaluation metrics for each fold
        fold_train_auc_scores = []
        fold_test_auc_scores = []
        fold_train_accuracy_scores = []
        fold_test_accuracy_scores = []

        # Perform k-fold cross-validation
        for train_index, val_index in kf.split(X_resampled):
            X_fold_train, y_fold_train = X_resampled[train_index], y_resampled[train_index]
            X_fold_val, y_fold_val = X_resampled[val_index], y_resampled[val_index]

            # Train the model on the fold training data using DecisionTreeClassifier
            model = DecisionTreeClassifier(random_state=42)
            model.fit(X_fold_train, y_fold_train)

            # Predict on the fold validation data
            y_fold_val_pred = model.predict(X_fold_val)
            y_fold_test_pred = model.predict(X_test_scaled)

            # Calculate Training AUC, Test AUC, Training Accuracy, and Test Accuracy for the fold
            fold_train_auc = roc_auc_score(y_fold_train, model.predict(X_fold_train))
            fold_test_auc = roc_auc_score(y_test, y_fold_test_pred)
            fold_train_accuracy = accuracy_score(y_fold_train, model.predict(X_fold_train))
            fold_test_accuracy = accuracy_score(y_test, y_fold_test_pred)

            fold_train_auc_scores.append(fold_train_auc)
            fold_test_auc_scores.append(fold_test_auc)
            fold_train_accuracy_scores.append(fold_train_accuracy)
            fold_test_accuracy_scores.append(fold_test_accuracy)

        # Calculate average metrics for the current iteration's folds
        avg_train_auc = sum(fold_train_auc_scores) / num_folds
        avg_test_auc = sum(fold_test_auc_scores) / num_folds
        avg_train_accuracy = sum(fold_train_accuracy_scores) / num_folds
        avg_test_accuracy = sum(fold_test_accuracy_scores) / num_folds

        # Print average metrics for cross-validation
        print("Average Training AUC ({} folds):".format(num_folds), avg_train_auc)
        print("Average Test AUC ({} folds):".format(num_folds), avg_test_auc)
        print("Average Training Accuracy ({} folds):".format(num_folds), avg_train_accuracy)
        print("Average Test Accuracy ({} folds):".format(num_folds), avg_test_accuracy)
        print("---------------------------------------------")

Iteration 1: Using features Agency, WorkflowStatus_UWDecision
Training AUC: 0.7925585303797763
Test AUC: 0.7435855364670104
Training Accuracy: 0.7925585303797764
Test Accuracy: 0.7286504588906081
---------------------------------------------
Number of Folds: 5
Average Training AUC (5 folds): 0.7934430067624512
Average Test AUC (5 folds): 0.7433676582068248
Average Training Accuracy (5 folds): 0.7934440718717212
Average Test Accuracy (5 folds): 0.72737446745374
---------------------------------------------
Number of Folds: 10
Average Training AUC (10 folds): 0.7929573977816702
Average Test AUC (10 folds): 0.7435474936896435
Average Training Accuracy (10 folds): 0.7929579400114797
Average Test Accuracy (10 folds): 0.7280908878950381
---------------------------------------------
Number of Folds: 20
Average Training AUC (20 folds): 0.7927513315006736
Average Test AUC (20 folds): 0.7434178835525052
Average Training Accuracy (20 folds): 0.792751410045147
Average Test Accuracy (20 folds): 0.7

### Random Forest

#### 80/20 Split

In [22]:
# Iterate through feature sets
for i, features in enumerate(feature_sets, start=1):
    print(f"Iteration {i}: Using features {', '.join(features)}")
    
    # Extract the selected features and target variable
    X = df[features]
    y = df["PolicyIssued"]

    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Scaling features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Applying SMOTE to the training data
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

    # Convert the resampled arrays back to a dataframe
    resampled_df = pd.DataFrame(X_resampled, columns=features)
    resampled_df["PolicyIssued"] = y_resampled

    # Train the model on the entire training data using RandomForestClassifier
    model = RandomForestClassifier(random_state=42)
    model.fit(X_resampled, y_resampled)

    # Predict on the test data
    y_pred_test = model.predict(X_test_scaled)

    # Calculate Training AUC and Test AUC
    train_auc = roc_auc_score(y_resampled, model.predict(X_resampled))
    test_auc = roc_auc_score(y_test, y_pred_test)

    # Calculate Training Accuracy and Test Accuracy
    train_accuracy = accuracy_score(y_resampled, model.predict(X_resampled))
    test_accuracy = accuracy_score(y_test, y_pred_test)

    # Print evaluation metrics for training and test data
    print("Training AUC:", train_auc)
    print("Test AUC:", test_auc)
    print("Training Accuracy:", train_accuracy)
    print("Test Accuracy:", test_accuracy)
    print("---------------------------------------------")

Iteration 1: Using features Agency, WorkflowStatus_UWDecision
Training AUC: 0.7885204550560649
Test AUC: 0.7477225919204897
Training Accuracy: 0.788520455056065
Test Accuracy: 0.7306457253680094
---------------------------------------------
Iteration 2: Using features Agency, WorkflowStatus_UWDecision, CommDateProvided_FreeCover
Training AUC: 0.7924317313780356
Test AUC: 0.7488182845698854
Training Accuracy: 0.7924317313780356
Test Accuracy: 0.7313133882300575
---------------------------------------------
Iteration 3: Using features Agency, WorkflowStatus_UWDecision, CommDateProvided_FreeCover, Product_ProductGroup_ProductType
Training AUC: 0.8157513122689487
Test AUC: 0.7369431187200735
Training Accuracy: 0.8157513122689487
Test Accuracy: 0.7405334944202461
---------------------------------------------
Iteration 4: Using features Agency, WorkflowStatus_UWDecision, CommDateProvided_FreeCover, Product_ProductGroup_ProductType, CommissionSacrificePercentage
Training AUC: 0.83244316905291

#### Cross Validation

In [23]:
# Iterate through feature sets
for i, features in enumerate(feature_sets, start=1):
    print(f"Iteration {i}: Using features {', '.join(features)}")
    
    # Extract the selected features and target variable
    X = df[features]
    y = df["PolicyIssued"]

    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Scaling features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Applying SMOTE to the training data
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

    # Convert the resampled arrays back to a dataframe
    resampled_df = pd.DataFrame(X_resampled, columns=features)
    resampled_df["PolicyIssued"] = y_resampled

    # Train the model on the entire training data using RandomForestClassifier
    model = RandomForestClassifier(random_state=42)
    model.fit(X_resampled, y_resampled)

    # Predict on the test data
    y_pred_test = model.predict(X_test_scaled)

    # Calculate Training AUC and Test AUC
    train_auc = roc_auc_score(y_resampled, model.predict(X_resampled))
    test_auc = roc_auc_score(y_test, y_pred_test)

    # Calculate Training Accuracy and Test Accuracy
    train_accuracy = accuracy_score(y_resampled, model.predict(X_resampled))
    test_accuracy = accuracy_score(y_test, y_pred_test)

    # Print evaluation metrics for training and test data
    print("Training AUC:", train_auc)
    print("Test AUC:", test_auc)
    print("Training Accuracy:", train_accuracy)
    print("Test Accuracy:", test_accuracy)
    print("---------------------------------------------")

    # Iterate through different numbers of folds
    for num_folds in num_folds_list:
        print(f"Number of Folds: {num_folds}")
        
        # Initialise k-fold cross-validation
        kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

        # Lists to store evaluation metrics for each fold
        fold_train_auc_scores = []
        fold_test_auc_scores = []
        fold_train_accuracy_scores = []
        fold_test_accuracy_scores = []

        # Perform k-fold cross-validation
        for train_index, val_index in kf.split(X_resampled):
            X_fold_train, y_fold_train = X_resampled[train_index], y_resampled[train_index]
            X_fold_val, y_fold_val = X_resampled[val_index], y_resampled[val_index]

            # Train the model on the fold training data using RandomForestClassifier
            model = RandomForestClassifier(random_state=42)
            model.fit(X_fold_train, y_fold_train)

            # Predict on the fold validation data
            y_fold_val_pred = model.predict(X_fold_val)
            y_fold_test_pred = model.predict(X_test_scaled)

            # Calculate Training AUC, Test AUC, Training Accuracy, and Test Accuracy for the fold
            fold_train_auc = roc_auc_score(y_fold_train, model.predict(X_fold_train))
            fold_test_auc = roc_auc_score(y_test, y_fold_test_pred)
            fold_train_accuracy = accuracy_score(y_fold_train, model.predict(X_fold_train))
            fold_test_accuracy = accuracy_score(y_test, y_fold_test_pred)

            fold_train_auc_scores.append(fold_train_auc)
            fold_test_auc_scores.append(fold_test_auc)
            fold_train_accuracy_scores.append(fold_train_accuracy)
            fold_test_accuracy_scores.append(fold_test_accuracy)

        # Calculate average metrics for the current iteration's folds
        avg_train_auc = sum(fold_train_auc_scores) / num_folds
        avg_test_auc = sum(fold_test_auc_scores) / num_folds
        avg_train_accuracy = sum(fold_train_accuracy_scores) / num_folds
        avg_test_accuracy = sum(fold_test_accuracy_scores) / num_folds

        # Print average metrics for cross-validation
        print("Average Training AUC ({} folds):".format(num_folds), avg_train_auc)
        print("Average Test AUC ({} folds):".format(num_folds), avg_test_auc)
        print("Average Training Accuracy ({} folds):".format(num_folds), avg_train_accuracy)
        print("Average Test Accuracy ({} folds):".format(num_folds), avg_test_accuracy)
        print("---------------------------------------------")

Iteration 1: Using features Agency, WorkflowStatus_UWDecision
Training AUC: 0.7885204550560649
Test AUC: 0.7477225919204897
Training Accuracy: 0.788520455056065
Test Accuracy: 0.7306457253680094
---------------------------------------------
Number of Folds: 5
Average Training AUC (5 folds): 0.7894372291008462
Average Test AUC (5 folds): 0.7476448359978395
Average Training Accuracy (5 folds): 0.789437000681336
Average Test Accuracy (5 folds): 0.7298699647092487
---------------------------------------------
Number of Folds: 10
Average Training AUC (10 folds): 0.7889284112831405
Average Test AUC (10 folds): 0.7474144209111038
Average Training Accuracy (10 folds): 0.788928375868345
Average Test Accuracy (10 folds): 0.7300861603026739
---------------------------------------------
Number of Folds: 20
Average Training AUC (20 folds): 0.7887229983295444
Average Test AUC (20 folds): 0.7477813427114072
Average Training Accuracy (20 folds): 0.7887230868007702
Average Test Accuracy (20 folds): 0.7

#### 70/30 Split

In [24]:
# Iterate through feature sets
for i, features in enumerate(feature_sets, start=1):
    print(f"Iteration {i}: Using features {', '.join(features)}")
    
    # Extract the selected features and target variable
    X = df[features]
    y = df["PolicyIssued"]

    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

    # Scaling features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Applying SMOTE to the training data
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

    # Convert the resampled arrays back to a dataframe
    resampled_df = pd.DataFrame(X_resampled, columns=features)
    resampled_df["PolicyIssued"] = y_resampled

    # Train the model on the entire training data using RandomForestClassifier
    model = RandomForestClassifier(random_state=42)
    model.fit(X_resampled, y_resampled)

    # Predict on the test data
    y_pred_test = model.predict(X_test_scaled)

    # Calculate Training AUC and Test AUC
    train_auc = roc_auc_score(y_resampled, model.predict(X_resampled))
    test_auc = roc_auc_score(y_test, y_pred_test)

    # Calculate Training Accuracy and Test Accuracy
    train_accuracy = accuracy_score(y_resampled, model.predict(X_resampled))
    test_accuracy = accuracy_score(y_test, y_pred_test)

    # Print evaluation metrics for training and test data
    print("Training AUC:", train_auc)
    print("Test AUC:", test_auc)
    print("Training Accuracy:", train_accuracy)
    print("Test Accuracy:", test_accuracy)
    print("---------------------------------------------")

Iteration 1: Using features Agency, WorkflowStatus_UWDecision
Training AUC: 0.7925293527228588
Test AUC: 0.7427118441593269
Training Accuracy: 0.792529352722859
Test Accuracy: 0.730430912058331
---------------------------------------------
Iteration 2: Using features Agency, WorkflowStatus_UWDecision, CommDateProvided_FreeCover
Training AUC: 0.7964158166242619
Test AUC: 0.743777286699281
Training Accuracy: 0.7964158166242618
Test Accuracy: 0.7309184170923504
---------------------------------------------
Iteration 3: Using features Agency, WorkflowStatus_UWDecision, CommDateProvided_FreeCover, Product_ProductGroup_ProductType
Training AUC: 0.8197929553465138
Test AUC: 0.733335276680111
Training Accuracy: 0.8197929553465139
Test Accuracy: 0.7400538375124526
---------------------------------------------
Iteration 4: Using features Agency, WorkflowStatus_UWDecision, CommDateProvided_FreeCover, Product_ProductGroup_ProductType, CommissionSacrificePercentage
Training AUC: 0.8370311150533368


#### Cross Validation

In [None]:
# Iterate through feature sets
for i, features in enumerate(feature_sets, start=1):
    print(f"Iteration {i}: Using features {', '.join(features)}")
    
    # Extract the selected features and target variable
    X = df[features]
    y = df["PolicyIssued"]

    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

    # Scaling features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Applying SMOTE to the training data
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

    # Convert the resampled arrays back to a dataframe
    resampled_df = pd.DataFrame(X_resampled, columns=features)
    resampled_df["PolicyIssued"] = y_resampled

    # Train the model on the entire training data using RandomForestClassifier
    model = RandomForestClassifier(random_state=42)
    model.fit(X_resampled, y_resampled)

    # Predict on the test data
    y_pred_test = model.predict(X_test_scaled)

    # Calculate Training AUC and Test AUC
    train_auc = roc_auc_score(y_resampled, model.predict(X_resampled))
    test_auc = roc_auc_score(y_test, y_pred_test)

    # Calculate Training Accuracy and Test Accuracy
    train_accuracy = accuracy_score(y_resampled, model.predict(X_resampled))
    test_accuracy = accuracy_score(y_test, y_pred_test)

    # Print evaluation metrics for training and test data
    print("Training AUC:", train_auc)
    print("Test AUC:", test_auc)
    print("Training Accuracy:", train_accuracy)
    print("Test Accuracy:", test_accuracy)
    print("---------------------------------------------")

    # Iterate through different numbers of folds
    for num_folds in num_folds_list:
        print(f"Number of Folds: {num_folds}")
        
        # Initialise k-fold cross-validation
        kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

        # Lists to store evaluation metrics for each fold
        fold_train_auc_scores = []
        fold_test_auc_scores = []
        fold_train_accuracy_scores = []
        fold_test_accuracy_scores = []

        # Perform k-fold cross-validation
        for train_index, val_index in kf.split(X_resampled):
            X_fold_train, y_fold_train = X_resampled[train_index], y_resampled[train_index]
            X_fold_val, y_fold_val = X_resampled[val_index], y_resampled[val_index]

            # Train the model on the fold training data using RandomForestClassifier
            model = RandomForestClassifier(random_state=42)
            model.fit(X_fold_train, y_fold_train)

            # Predict on the fold validation data
            y_fold_val_pred = model.predict(X_fold_val)
            y_fold_test_pred = model.predict(X_test_scaled)

            # Calculate Training AUC, Test AUC, Training Accuracy, and Test Accuracy for the fold
            fold_train_auc = roc_auc_score(y_fold_train, model.predict(X_fold_train))
            fold_test_auc = roc_auc_score(y_test, y_fold_test_pred)
            fold_train_accuracy = accuracy_score(y_fold_train, model.predict(X_fold_train))
            fold_test_accuracy = accuracy_score(y_test, y_fold_test_pred)

            fold_train_auc_scores.append(fold_train_auc)
            fold_test_auc_scores.append(fold_test_auc)
            fold_train_accuracy_scores.append(fold_train_accuracy)
            fold_test_accuracy_scores.append(fold_test_accuracy)

        # Calculate average metrics for the current iteration's folds
        avg_train_auc = sum(fold_train_auc_scores) / num_folds
        avg_test_auc = sum(fold_test_auc_scores) / num_folds
        avg_train_accuracy = sum(fold_train_accuracy_scores) / num_folds
        avg_test_accuracy = sum(fold_test_accuracy_scores) / num_folds

        # Print average metrics for cross-validation
        print("Average Training AUC ({} folds):".format(num_folds), avg_train_auc)
        print("Average Test AUC ({} folds):".format(num_folds), avg_test_auc)
        print("Average Training Accuracy ({} folds):".format(num_folds), avg_train_accuracy)
        print("Average Test Accuracy ({} folds):".format(num_folds), avg_test_accuracy)
        print("---------------------------------------------")

Iteration 1: Using features Agency, WorkflowStatus_UWDecision
Training AUC: 0.7925293527228588
Test AUC: 0.7427118441593269
Training Accuracy: 0.792529352722859
Test Accuracy: 0.730430912058331
---------------------------------------------
Number of Folds: 5
Average Training AUC (5 folds): 0.7934330107667
Average Test AUC (5 folds): 0.742440586227963
Average Training Accuracy (5 folds): 0.7934338596769017
Average Test Accuracy (5 folds): 0.7291676381440895
---------------------------------------------
Number of Folds: 10
Average Training AUC (10 folds): 0.7929355203335834
Average Test AUC (10 folds): 0.7426240360285191
Average Training Accuracy (10 folds): 0.7929358946690157
Average Test Accuracy (10 folds): 0.7295809576294537
---------------------------------------------
Number of Folds: 20
Average Training AUC (20 folds): 0.79273139753433
Average Test AUC (20 folds): 0.7426065702546889
Average Training Accuracy (20 folds): 0.7927314463813776
Average Test Accuracy (20 folds): 0.729708

### LightGBM

#### 80/20 Split

In [None]:
# Iterate through feature sets
for i, features in enumerate(feature_sets, start=1):
    print(f"Iteration {i}: Using features {', '.join(features)}")
    
    # Extract the selected features and target variable
    X = df[features]
    y = df["PolicyIssued"]

    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Scaling features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Applying SMOTE to the training data
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

    # Convert the resampled arrays back to a dataframe
    resampled_df = pd.DataFrame(X_resampled, columns=features)
    resampled_df["PolicyIssued"] = y_resampled

    # Train the model on the entire training data using LGBMClassifier
    model = lgb.LGBMClassifier(random_state=42)
    model.fit(X_resampled, y_resampled)

    # Predict on the test data
    y_pred_test = model.predict(X_test_scaled)

    # Calculate Training AUC and Test AUC
    train_auc = roc_auc_score(y_resampled, model.predict(X_resampled))
    test_auc = roc_auc_score(y_test, y_pred_test)

    # Calculate Training Accuracy and Test Accuracy
    train_accuracy = accuracy_score(y_resampled, model.predict(X_resampled))
    test_accuracy = accuracy_score(y_test, y_pred_test)

    # Print evaluation metrics for training and test data
    print("Training AUC:", train_auc)
    print("Test AUC:", test_auc)
    print("Training Accuracy:", train_accuracy)
    print("Test Accuracy:", test_accuracy)
    print("---------------------------------------------")

#### Cross Validation

In [None]:
# List of k values for k-fold cross-validation
num_folds_list = [5, 10, 20]

# Iterate through feature sets
for i, features in enumerate(feature_sets, start=1):
    print(f"Iteration {i}: Using features {', '.join(features)}")
    
    # Extract the selected features and target variable
    X = df[features]
    y = df["PolicyIssued"]

    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Scaling features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Applying SMOTE to the training data
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

    # Convert the resampled arrays back to a dataframe
    resampled_df = pd.DataFrame(X_resampled, columns=features)
    resampled_df["PolicyIssued"] = y_resampled

    # Train the model on the entire training data using LGBMClassifier
    model = lgb.LGBMClassifier(random_state=42)
    model.fit(X_resampled, y_resampled)

    # Predict on the test data
    y_pred_test = model.predict(X_test_scaled)

    # Calculate Training AUC and Test AUC
    train_auc = roc_auc_score(y_resampled, model.predict(X_resampled))
    test_auc = roc_auc_score(y_test, y_pred_test)

    # Calculate Training Accuracy and Test Accuracy
    train_accuracy = accuracy_score(y_resampled, model.predict(X_resampled))
    test_accuracy = accuracy_score(y_test, y_pred_test)

    # Print evaluation metrics for training and test data
    print("Training AUC:", train_auc)
    print("Test AUC:", test_auc)
    print("Training Accuracy:", train_accuracy)
    print("Test Accuracy:", test_accuracy)
    print("---------------------------------------------")

    # Iterate through different numbers of folds
    for num_folds in num_folds_list:
        print(f"Number of Folds: {num_folds}")
        
        # Initialise k-fold cross-validation
        kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

        # Lists to store evaluation metrics for each fold
        fold_train_auc_scores = []
        fold_test_auc_scores = []
        fold_train_accuracy_scores = []
        fold_test_accuracy_scores = []

        # Perform k-fold cross-validation
        for train_index, val_index in kf.split(X_resampled):
            X_fold_train, y_fold_train = X_resampled[train_index], y_resampled[train_index]
            X_fold_val, y_fold_val = X_resampled[val_index], y_resampled[val_index]

            # Train the model on the fold training data using LGBMClassifier
            model = lgb.LGBMClassifier(random_state=42)
            model.fit(X_fold_train, y_fold_train)

            # Predict on the fold validation data
            y_fold_val_pred = model.predict(X_fold_val)
            y_fold_test_pred = model.predict(X_test_scaled)

            # Calculate Training AUC, Test AUC, Training Accuracy, and Test Accuracy for the fold
            fold_train_auc = roc_auc_score(y_fold_train, model.predict(X_fold_train))
            fold_test_auc = roc_auc_score(y_test, y_fold_test_pred)
            fold_train_accuracy = accuracy_score(y_fold_train, model.predict(X_fold_train))
            fold_test_accuracy = accuracy_score(y_test, y_fold_test_pred)

            fold_train_auc_scores.append(fold_train_auc)
            fold_test_auc_scores.append(fold_test_auc)
            fold_train_accuracy_scores.append(fold_train_accuracy)
            fold_test_accuracy_scores.append(fold_test_accuracy)

        # Calculate average metrics for the current iteration's folds
        avg_train_auc = sum(fold_train_auc_scores) / num_folds
        avg_test_auc = sum(fold_test_auc_scores) / num_folds
        avg_train_accuracy = sum(fold_train_accuracy_scores) / num_folds
        avg_test_accuracy = sum(fold_test_accuracy_scores) / num_folds

        # Print average metrics for cross-validation
        print("Average Training AUC ({} folds):".format(num_folds), avg_train_auc)
        print("Average Test AUC ({} folds):".format(num_folds), avg_test_auc)
        print("Average Training Accuracy ({} folds):".format(num_folds), avg_train_accuracy)
        print("Average Test Accuracy ({} folds):".format(num_folds), avg_test_accuracy)
        print("---------------------------------------------")

#### 70/30 Split

In [None]:
# Iterate through feature sets
for i, features in enumerate(feature_sets, start=1):
    print(f"Iteration {i}: Using features {', '.join(features)}")
    
    # Extract the selected features and target variable
    X = df[features]
    y = df["PolicyIssued"]

    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

    # Scaling features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Applying SMOTE to the training data
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

    # Convert the resampled arrays back to a dataframe
    resampled_df = pd.DataFrame(X_resampled, columns=features)
    resampled_df["PolicyIssued"] = y_resampled

    # Train the model on the entire training data using LGBMClassifier
    model = lgb.LGBMClassifier(random_state=42)
    model.fit(X_resampled, y_resampled)

    # Predict on the test data
    y_pred_test = model.predict(X_test_scaled)

    # Calculate Training AUC and Test AUC
    train_auc = roc_auc_score(y_resampled, model.predict(X_resampled))
    test_auc = roc_auc_score(y_test, y_pred_test)

    # Calculate Training Accuracy and Test Accuracy
    train_accuracy = accuracy_score(y_resampled, model.predict(X_resampled))
    test_accuracy = accuracy_score(y_test, y_pred_test)

    # Print evaluation metrics for training and test data
    print("Training AUC:", train_auc)
    print("Test AUC:", test_auc)
    print("Training Accuracy:", train_accuracy)
    print("Test Accuracy:", test_accuracy)
    print("---------------------------------------------")

#### Cross Validation

In [None]:
# List of k values for k-fold cross-validation
num_folds_list = [5, 10, 20]

# Iterate through feature sets
for i, features in enumerate(feature_sets, start=1):
    print(f"Iteration {i}: Using features {', '.join(features)}")
    
    # Extract the selected features and target variable
    X = df[features]
    y = df["PolicyIssued"]

    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

    # Scaling features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Applying SMOTE to the training data
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

    # Convert the resampled arrays back to a dataframe
    resampled_df = pd.DataFrame(X_resampled, columns=features)
    resampled_df["PolicyIssued"] = y_resampled

    # Train the model on the entire training data using LGBMClassifier
    model = lgb.LGBMClassifier(random_state=42)
    model.fit(X_resampled, y_resampled)

    # Predict on the test data
    y_pred_test = model.predict(X_test_scaled)

    # Calculate Training AUC and Test AUC
    train_auc = roc_auc_score(y_resampled, model.predict(X_resampled))
    test_auc = roc_auc_score(y_test, y_pred_test)

    # Calculate Training Accuracy and Test Accuracy
    train_accuracy = accuracy_score(y_resampled, model.predict(X_resampled))
    test_accuracy = accuracy_score(y_test, y_pred_test)

    # Print evaluation metrics for training and test data
    print("Training AUC:", train_auc)
    print("Test AUC:", test_auc)
    print("Training Accuracy:", train_accuracy)
    print("Test Accuracy:", test_accuracy)
    print("---------------------------------------------")

    # Iterate through different numbers of folds
    for num_folds in num_folds_list:
        print(f"Number of Folds: {num_folds}")
        
        # Initialise k-fold cross-validation
        kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

        # Lists to store evaluation metrics for each fold
        fold_train_auc_scores = []
        fold_test_auc_scores = []
        fold_train_accuracy_scores = []
        fold_test_accuracy_scores = []

        # Perform k-fold cross-validation
        for train_index, val_index in kf.split(X_resampled):
            X_fold_train, y_fold_train = X_resampled[train_index], y_resampled[train_index]
            X_fold_val, y_fold_val = X_resampled[val_index], y_resampled[val_index]

            # Train the model on the fold training data using LGBMClassifier
            model = lgb.LGBMClassifier(random_state=42)
            model.fit(X_fold_train, y_fold_train)

            # Predict on the fold validation data
            y_fold_val_pred = model.predict(X_fold_val)
            y_fold_test_pred = model.predict(X_test_scaled)

            # Calculate Training AUC, Test AUC, Training Accuracy, and Test Accuracy for the fold
            fold_train_auc = roc_auc_score(y_fold_train, model.predict(X_fold_train))
            fold_test_auc = roc_auc_score(y_test, y_fold_test_pred)
            fold_train_accuracy = accuracy_score(y_fold_train, model.predict(X_fold_train))
            fold_test_accuracy = accuracy_score(y_test, y_fold_test_pred)

            fold_train_auc_scores.append(fold_train_auc)
            fold_test_auc_scores.append(fold_test_auc)
            fold_train_accuracy_scores.append(fold_train_accuracy)
            fold_test_accuracy_scores.append(fold_test_accuracy)

        # Calculate average metrics for the current iteration's folds
        avg_train_auc = sum(fold_train_auc_scores) / num_folds
        avg_test_auc = sum(fold_test_auc_scores) / num_folds
        avg_train_accuracy = sum(fold_train_accuracy_scores) / num_folds
        avg_test_accuracy = sum(fold_test_accuracy_scores) / num_folds

        # Print average metrics for cross-validation
        print("Average Training AUC ({} folds):".format(num_folds), avg_train_auc)
        print("Average Test AUC ({} folds):".format(num_folds), avg_test_auc)
        print("Average Training Accuracy ({} folds):".format(num_folds), avg_train_accuracy)
        print("Average Test Accuracy ({} folds):".format(num_folds), avg_test_accuracy)
        print("---------------------------------------------")

## Neural Network

#### 8020 Split

Loop over epochs

In [None]:
# List of epochs to iterate over
epochs_list = [10, 20, 30]

# Iterate through feature sets
for i, features in enumerate(feature_sets, start=1):
    print(f"Iteration {i}: Using features {', '.join(features)}")
    
    # Extract the selected features and target variable
    X = df[features]
    y = df["PolicyIssued"]

    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Scaling features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Applying SMOTE to the training data
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

    # Convert the resampled arrays back to a dataframe
    resampled_df = pd.DataFrame(X_resampled, columns=features)
    resampled_df["PolicyIssued"] = y_resampled

    # Train the model on the entire training data using a neural network
    model = Sequential()
    model.add(Dense(units=64, activation='relu', input_dim=len(features)))
    model.add(Dense(units=1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    # Loop over epochs
    for epochs in epochs_list:
        print(f"Epochs: {epochs}")
        
        model.fit(X_resampled, y_resampled, epochs=epochs, batch_size=32, verbose=0)

        # Predict on the test data
        y_pred_train = np.round(model.predict(X_resampled)).astype(int)
        y_pred_test = np.round(model.predict(X_test_scaled)).astype(int)

        # Calculate Training AUC and Test AUC
        train_auc = roc_auc_score(y_resampled, model.predict(X_resampled))
        test_auc = roc_auc_score(y_test, y_pred_test)

        # Calculate Training Accuracy and Test Accuracy
        train_accuracy = accuracy_score(y_resampled, y_pred_train)
        test_accuracy = accuracy_score(y_test, y_pred_test)

        # Print evaluation metrics for training and test data
        print("Training AUC:", train_auc)
        print("Test AUC:", test_auc)
        print("Training Accuracy:", train_accuracy)
        print("Test Accuracy:", test_accuracy)
        print("---------------------------------------------")

#### 70/30 Split

In [None]:
# Iterate through feature sets
for i, features in enumerate(feature_sets, start=1):
    print(f"Iteration {i}: Using features {', '.join(features)}")
    
    # Extract the selected features and target variable
    X = df[features]
    y = df["PolicyIssued"]

    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

    # Scaling features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Applying SMOTE to the training data
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

    # Convert the resampled arrays back to a dataframe
    resampled_df = pd.DataFrame(X_resampled, columns=features)
    resampled_df["PolicyIssued"] = y_resampled

    # Train the model on the entire training data using a neural network
    model = Sequential()
    model.add(Dense(units=64, activation='relu', input_dim=len(features)))
    model.add(Dense(units=1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    # Loop over epochs
    for epochs in epochs_list:
        print(f"Epochs: {epochs}")
        
        model.fit(X_resampled, y_resampled, epochs=epochs, batch_size=32, verbose=0)

        # Predict on the test data
        y_pred_train = np.round(model.predict(X_resampled)).astype(int)
        y_pred_test = np.round(model.predict(X_test_scaled)).astype(int)

        # Calculate Training AUC and Test AUC
        train_auc = roc_auc_score(y_resampled, model.predict(X_resampled))
        test_auc = roc_auc_score(y_test, y_pred_test)

        # Calculate Training Accuracy and Test Accuracy
        train_accuracy = accuracy_score(y_resampled, y_pred_train)
        test_accuracy = accuracy_score(y_test, y_pred_test)

        # Print evaluation metrics for training and test data
        print("Training AUC:", train_auc)
        print("Test AUC:", test_auc)
        print("Training Accuracy:", train_accuracy)
        print("Test Accuracy:", test_accuracy)
        print("---------------------------------------------")