In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
from imblearn.under_sampling import RandomUnderSampler

import lightgbm as lgb

from imblearn.over_sampling import SMOTE

from keras.models import Sequential
from keras.layers import Dense

import warnings
from sklearn.exceptions import ConvergenceWarning

# Ignore ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [2]:
df = pd.read_excel('Dataset.xlsx')

## Clean Data

In [3]:
# Get the list of columns to convert to categorical
categorical_columns = df.select_dtypes(include='int64').columns.tolist()

# Convert the selected columns to categorical
df[categorical_columns] = df[categorical_columns].astype('category')

numeric_data = df.select_dtypes(include=[np.number])

# Calculate MAD for each column
mad = numeric_data.mad()

# Choose a threshold multiplier
k = 3

# Calculate the threshold value
threshold = k * mad

# Identify outliers
outliers = (np.abs(numeric_data - numeric_data.median()) > threshold)

# Apply logarithm to the specified columns
outlier_columns = ['CommissionSacrificePercentage', 'BonusCommissionPercentage']
for column in outlier_columns:
    df[column] = np.log1p(df[column])
    
df = df.drop(columns=['PropDate'])

### Undersampling

In [4]:
# Splitting the dataset into features (X) and target variable (y)
X = df.drop("PolicyIssued", axis=1)
y = df["PolicyIssued"]

# Splitting the data into training and testing sets
X_train_80, X_test_80, y_train_80, y_test_80 = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

X_train_70, X_test_70, y_train_70, y_test_70 = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Applying RandomUnderSampler to the training data
under_sampler = RandomUnderSampler(random_state=42)
X_resampled_80, y_resampled_80 = under_sampler.fit_resample(X_train_80, y_train_80)

X_resampled_70, y_resampled_70 = under_sampler.fit_resample(X_train_70, y_train_70)

# Convert the resampled arrays back to a dataframe
resampled_df_80 = pd.DataFrame(X_resampled_80, columns=X_train_80.columns)
resampled_df_80["PolicyIssued"] = y_resampled_80

resampled_df_70 = pd.DataFrame(X_resampled_70, columns=X_train_70.columns)
resampled_df_70["PolicyIssued"] = y_resampled_70

# Check the class distribution in the resampled dataset
print("Class distribution in resampled dataset:")
print(resampled_df_80["PolicyIssued"].value_counts())
print(resampled_df_70["PolicyIssued"].value_counts())

Class distribution in resampled dataset:
0    27888
1    27888
Name: PolicyIssued, dtype: int64
0    24402
1    24402
Name: PolicyIssued, dtype: int64


# All Features

## Logistic Regression

#### 80/20 Split

In [6]:
# Splitting the dataset into features (X) and target variable (y)
X_train = resampled_df_80.drop("PolicyIssued", axis=1)
y_train = resampled_df_80["PolicyIssued"]

# Initialise and train logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test_80)

# Calculate Training AUC and Test AUC
train_auc = roc_auc_score(y_train, y_pred_train)
test_auc = roc_auc_score(y_test_80, y_pred_test)

# Calculate Training Accuracy and Test Accuracy
train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test_80, y_pred_test)

# Evaluate the model
confusion_mat_train = confusion_matrix(y_train, y_pred_train)
class_report_train = classification_report(y_train, y_pred_train)

confusion_mat_test = confusion_matrix(y_test_80, y_pred_test)
class_report_test = classification_report(y_test_80, y_pred_test)

print("Training Confusion Matrix:\n", confusion_mat_train)
print("\nTraining Classification Report:\n", class_report_train)
print("\nTraining AUC:", train_auc)
print("Training Accuracy:", train_accuracy)
print("---------------------------------------------")

print("Test Confusion Matrix:\n", confusion_mat_test)
print("\nTest Classification Report:\n", class_report_test)
print("\nTest AUC:", test_auc)
print("Test Accuracy:", test_accuracy)
print("---------------------------------------------")

Training Confusion Matrix:
 [[24225  3663]
 [10523 17365]]

Training Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.87      0.77     27888
           1       0.83      0.62      0.71     27888

    accuracy                           0.75     55776
   macro avg       0.76      0.75      0.74     55776
weighted avg       0.76      0.75      0.74     55776


Training AUC: 0.7456612162937464
Training Accuracy: 0.7456612162937464
---------------------------------------------
Test Confusion Matrix:
 [[ 6084   888]
 [ 9255 15226]]

Test Classification Report:
               precision    recall  f1-score   support

           0       0.40      0.87      0.55      6972
           1       0.94      0.62      0.75     24481

    accuracy                           0.68     31453
   macro avg       0.67      0.75      0.65     31453
weighted avg       0.82      0.68      0.70     31453


Test AUC: 0.747292554182136
Test Accuracy: 0.6775

#### Cross Validation

In [7]:
# Splitting the dataset into features (X) and target variable (y)
X_train = resampled_df_80.drop("PolicyIssued", axis=1)
y_train = resampled_df_80["PolicyIssued"]

# List of k values for k-fold cross-validation
num_folds_list = [5, 10, 20]

# Initialise logistic regression model
model = LogisticRegression()

# Lists to store evaluation metrics for each fold
auc_scores = []
accuracy_scores = []

for num_folds in num_folds_list:
    print(f"Number of Folds: {num_folds}")
    
    # Initialise k-fold cross-validation
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

    # Lists to store evaluation metrics for each fold
    fold_auc_scores = []
    fold_accuracy_scores = []

    # Perform k-fold cross-validation
    for train_index, val_index in kf.split(X_train):
        X_fold_train, y_fold_train = X_train.iloc[train_index], y_train.iloc[train_index]
        X_fold_val, y_fold_val = X_train.iloc[val_index], y_train.iloc[val_index]

        # Train the model on the fold training data
        model.fit(X_fold_train, y_fold_train)

        # Predict on the fold validation data
        y_fold_val_pred = model.predict(X_fold_val)

        # Calculate AUC and accuracy for the fold
        fold_auc = roc_auc_score(y_fold_val, y_fold_val_pred)
        fold_accuracy = accuracy_score(y_fold_val, y_fold_val_pred)

        fold_auc_scores.append(fold_auc)
        fold_accuracy_scores.append(fold_accuracy)

    # Calculate average AUC and accuracy for the current number of folds
    avg_auc = sum(fold_auc_scores) / num_folds
    avg_accuracy = sum(fold_accuracy_scores) / num_folds

    # Print average metrics for cross-validation
    print("Average AUC:", avg_auc)
    print("Average Accuracy:", avg_accuracy)
    print("---------------------------------------------")
    
    # Train the model on the entire training data
    model.fit(X_train, y_train)

    # Predict on the test data
    y_pred = model.predict(X_test_80)

    # Calculate AUC
    auc = roc_auc_score(y_test_80, y_pred)

    # Calculate accuracy
    accuracy = accuracy_score(y_test_80, y_pred)

    # Print evaluation metrics for the test data
    print("Test AUC:", auc)
    print("Test Accuracy:", accuracy)
    print("---------------------------------------------")


Number of Folds: 5
Average AUC: 0.7458834867109965
Average Accuracy: 0.7459302626264834
---------------------------------------------
Test AUC: 0.747292554182136
Test Accuracy: 0.6775188376307506
---------------------------------------------
Number of Folds: 10
Average AUC: 0.745786560106483
Average Accuracy: 0.745876574722039
---------------------------------------------
Test AUC: 0.747292554182136
Test Accuracy: 0.6775188376307506
---------------------------------------------
Number of Folds: 20
Average AUC: 0.7456950875026024
Average Accuracy: 0.7458226955352887
---------------------------------------------
Test AUC: 0.747292554182136
Test Accuracy: 0.6775188376307506
---------------------------------------------


#### 70/30 Split

In [9]:
# Splitting the dataset into features (X) and target variable (y)
X_train = resampled_df_70.drop("PolicyIssued", axis=1)
y_train = resampled_df_70["PolicyIssued"]

# Initialise and train logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test_70)

# Calculate Training AUC and Test AUC
train_auc = roc_auc_score(y_train, y_pred_train)
test_auc = roc_auc_score(y_test_70, y_pred_test)

# Calculate Training Accuracy and Test Accuracy
train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test_70, y_pred_test)

# Evaluate the model
confusion_mat_train = confusion_matrix(y_train, y_pred_train)
class_report_train = classification_report(y_train, y_pred_train)

confusion_mat_test = confusion_matrix(y_test_70, y_pred_test)
class_report_test = classification_report(y_test_70, y_pred_test)

print("Training Confusion Matrix:\n", confusion_mat_train)
print("\nTraining Classification Report:\n", class_report_train)
print("\nTraining AUC:", train_auc)
print("Training Accuracy:", train_accuracy)
print("---------------------------------------------")

print("Test Confusion Matrix:\n", confusion_mat_test)
print("\nTest Classification Report:\n", class_report_test)
print("\nTest AUC:", test_auc)
print("Test Accuracy:", test_accuracy)
print("---------------------------------------------")

Training Confusion Matrix:
 [[21291  3111]
 [ 9339 15063]]

Training Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.87      0.77     24402
           1       0.83      0.62      0.71     24402

    accuracy                           0.74     48804
   macro avg       0.76      0.74      0.74     48804
weighted avg       0.76      0.74      0.74     48804


Training AUC: 0.7448979591836734
Training Accuracy: 0.7448979591836735
---------------------------------------------
Test Confusion Matrix:
 [[ 9152  1306]
 [14024 22697]]

Test Classification Report:
               precision    recall  f1-score   support

           0       0.39      0.88      0.54     10458
           1       0.95      0.62      0.75     36721

    accuracy                           0.68     47179
   macro avg       0.67      0.75      0.65     47179
weighted avg       0.82      0.68      0.70     47179


Test AUC: 0.7466063574526182
Test Accuracy: 0.675

#### Cross Validation

In [10]:
# Splitting the dataset into features (X) and target variable (y)
X_train = resampled_df_80.drop("PolicyIssued", axis=1)
y_train = resampled_df_80["PolicyIssued"]

# List of k values for k-fold cross-validation
num_folds_list = [5, 10, 20]

# Initialise logistic regression model
model = LogisticRegression()

# Lists to store evaluation metrics for each fold
auc_scores = []
accuracy_scores = []

for num_folds in num_folds_list:
    print(f"Number of Folds: {num_folds}")
    
    # Initialise k-fold cross-validation
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

    # Lists to store evaluation metrics for each fold
    fold_auc_scores = []
    fold_accuracy_scores = []

    # Perform k-fold cross-validation
    for train_index, val_index in kf.split(X_train):
        X_fold_train, y_fold_train = X_train.iloc[train_index], y_train.iloc[train_index]
        X_fold_val, y_fold_val = X_train.iloc[val_index], y_train.iloc[val_index]

        # Train the model on the fold training data
        model.fit(X_fold_train, y_fold_train)

        # Predict on the fold validation data
        y_fold_val_pred = model.predict(X_fold_val)

        # Calculate AUC and accuracy for the fold
        fold_auc = roc_auc_score(y_fold_val, y_fold_val_pred)
        fold_accuracy = accuracy_score(y_fold_val, y_fold_val_pred)

        fold_auc_scores.append(fold_auc)
        fold_accuracy_scores.append(fold_accuracy)

    # Calculate average AUC and accuracy for the current number of folds
    avg_auc = sum(fold_auc_scores) / num_folds
    avg_accuracy = sum(fold_accuracy_scores) / num_folds

    # Print average metrics for cross-validation
    print("Average AUC:", avg_auc)
    print("Average Accuracy:", avg_accuracy)
    print("---------------------------------------------")
    
    # Train the model on the entire training data
    model.fit(X_train, y_train)

    # Predict on the test data
    y_pred = model.predict(X_test_70)

    # Calculate AUC
    auc = roc_auc_score(y_test_70, y_pred)

    # Calculate accuracy
    accuracy = accuracy_score(y_test_70, y_pred)

    # Print evaluation metrics for the test data
    print("Test AUC:", auc)
    print("Test Accuracy:", accuracy)
    print("---------------------------------------------")


Number of Folds: 5
Average AUC: 0.7458834867109965
Average Accuracy: 0.7459302626264834
---------------------------------------------
Test AUC: 0.7463756986211882
Test Accuracy: 0.6767841624451557
---------------------------------------------
Number of Folds: 10
Average AUC: 0.745786560106483
Average Accuracy: 0.745876574722039
---------------------------------------------
Test AUC: 0.7463756986211882
Test Accuracy: 0.6767841624451557
---------------------------------------------
Number of Folds: 20
Average AUC: 0.7456950875026024
Average Accuracy: 0.7458226955352887
---------------------------------------------
Test AUC: 0.7463756986211882
Test Accuracy: 0.6767841624451557
---------------------------------------------


### Stochastic Gradient Descent

#### 80/20 Split

In [11]:
# Splitting the dataset into features (X) and target variable (y)
X_train = resampled_df_80.drop("PolicyIssued", axis=1)
y_train = resampled_df_80["PolicyIssued"]

# Initialise SGD classifier model
model = SGDClassifier(loss='log', random_state=42)

# Fit the model to the training data
model.fit(X_train, y_train)

# Predict on the test data
y_test_pred = model.predict(X_test_80)
y_train_pred = model.predict(X_train)

# Calculate AUC and accuracy for test and training data
train_auc = roc_auc_score(y_train, y_train_pred)
test_auc = roc_auc_score(y_test_80, y_test_pred)
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test_80, y_test_pred)

# Print metrics for training and test data
print("Training AUC:", train_auc)
print("Test AUC:", test_auc)
print("Training Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)
print("=============================================")


Training AUC: 0.7256167527251864
Test AUC: 0.7263481148036567
Training Accuracy: 0.7256167527251864
Test Accuracy: 0.6342161320064859


#### Cross Validation

In [12]:
# Splitting the dataset into features (X) and target variable (y)
X_train = resampled_df_80.drop("PolicyIssued", axis=1)
y_train = resampled_df_80["PolicyIssued"]

# List of k values for k-fold cross-validation
num_folds_list = [5, 10, 20]

# Initialise SGD classifier model
model = SGDClassifier(loss='log', random_state=42)

# Lists to store evaluation metrics for each fold
auc_scores = []
accuracy_scores = []

for num_folds in num_folds_list:
    print(f"Number of Folds: {num_folds}")

    # Initialise k-fold cross-validation
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

    # Lists to store evaluation metrics for each fold
    fold_auc_scores = []
    fold_accuracy_scores = []

    # Perform k-fold cross-validation
    for train_index, val_index in kf.split(X_train):
        X_fold_train, y_fold_train = X_train.iloc[train_index], y_train.iloc[train_index]
        X_fold_val, y_fold_val = X_train.iloc[val_index], y_train.iloc[val_index]

        # Train the model on the fold training data
        model.fit(X_fold_train, y_fold_train)

        # Predict on the fold validation data
        y_fold_val_pred = model.predict(X_fold_val)

        # Calculate AUC and accuracy for the fold
        fold_auc = roc_auc_score(y_fold_val, y_fold_val_pred)
        fold_accuracy = accuracy_score(y_fold_val, y_fold_val_pred)

        fold_auc_scores.append(fold_auc)
        fold_accuracy_scores.append(fold_accuracy)

    # Calculate average AUC and accuracy for the current number of folds
    avg_auc = sum(fold_auc_scores) / num_folds
    avg_accuracy = sum(fold_accuracy_scores) / num_folds

    # Print average metrics for cross-validation
    print("Average AUC:", avg_auc)
    print("Average Accuracy:", avg_accuracy)
    print("---------------------------------------------")
    
    # Train the model on the entire training data
    model.fit(X_train, y_train)

    # Predict on the test data
    y_test_pred = model.predict(X_test_80)
    y_train_pred = model.predict(X_train)

    # Calculate AUC and accuracy for test and training data
    train_auc = roc_auc_score(y_train, y_train_pred)
    test_auc = roc_auc_score(y_test_80, y_test_pred)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test_80, y_test_pred)

    # Print metrics for training and test data
    print("Training AUC:", train_auc)
    print("Test AUC:", test_auc)
    print("Training Accuracy:", train_accuracy)
    print("Test Accuracy:", test_accuracy)
    print("=============================================")


Number of Folds: 5
Average AUC: 0.6276471189305945
Average Accuracy: 0.624946552369485
---------------------------------------------
Training AUC: 0.7256167527251864
Test AUC: 0.7263481148036567
Training Accuracy: 0.7256167527251864
Test Accuracy: 0.6342161320064859
Number of Folds: 10
Average AUC: 0.5988473069290688
Average Accuracy: 0.6008542776049741
---------------------------------------------
Training AUC: 0.7256167527251864
Test AUC: 0.7263481148036567
Training Accuracy: 0.7256167527251864
Test Accuracy: 0.6342161320064859
Number of Folds: 20
Average AUC: 0.6202035178983514
Average Accuracy: 0.6210137772238034
---------------------------------------------
Training AUC: 0.7256167527251864
Test AUC: 0.7263481148036567
Training Accuracy: 0.7256167527251864
Test Accuracy: 0.6342161320064859


#### 70/30 Split

In [13]:
# Splitting the dataset into features (X) and target variable (y)
X_train = resampled_df_70.drop("PolicyIssued", axis=1)
y_train = resampled_df_70["PolicyIssued"]

# Initialise SGD classifier model
model = SGDClassifier(loss='log', random_state=42)

# Fit the model to the training data
model.fit(X_train, y_train)

# Predict on the test data
y_test_pred = model.predict(X_test_70)
y_train_pred = model.predict(X_train)

# Calculate AUC and accuracy for test and training data
train_auc = roc_auc_score(y_train, y_train_pred)
test_auc = roc_auc_score(y_test_70, y_test_pred)
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test_70, y_test_pred)

# Print metrics for training and test data
print("Training AUC:", train_auc)
print("Test AUC:", test_auc)
print("Training Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)
print("=============================================")


Training AUC: 0.5173141545774937
Test AUC: 0.5172252420263554
Training Accuracy: 0.5173141545774936
Test Accuracy: 0.7671421607070943


#### Cross Validation

In [14]:
# Splitting the dataset into features (X) and target variable (y)
X_train = resampled_df_70.drop("PolicyIssued", axis=1)
y_train = resampled_df_70["PolicyIssued"]

# List of k values for k-fold cross-validation
num_folds_list = [5, 10, 20]

# Initialise SGD classifier model
model = SGDClassifier(loss='log', random_state=42)

# Lists to store evaluation metrics for each fold
auc_scores = []
accuracy_scores = []

for num_folds in num_folds_list:
    print(f"Number of Folds: {num_folds}")

    # Initialise k-fold cross-validation
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

    # Lists to store evaluation metrics for each fold
    fold_auc_scores = []
    fold_accuracy_scores = []

    # Perform k-fold cross-validation
    for train_index, val_index in kf.split(X_train):
        X_fold_train, y_fold_train = X_train.iloc[train_index], y_train.iloc[train_index]
        X_fold_val, y_fold_val = X_train.iloc[val_index], y_train.iloc[val_index]

        # Train the model on the fold training data
        model.fit(X_fold_train, y_fold_train)

        # Predict on the fold validation data
        y_fold_val_pred = model.predict(X_fold_val)

        # Calculate AUC and accuracy for the fold
        fold_auc = roc_auc_score(y_fold_val, y_fold_val_pred)
        fold_accuracy = accuracy_score(y_fold_val, y_fold_val_pred)

        fold_auc_scores.append(fold_auc)
        fold_accuracy_scores.append(fold_accuracy)

    # Calculate average AUC and accuracy for the current number of folds
    avg_auc = sum(fold_auc_scores) / num_folds
    avg_accuracy = sum(fold_accuracy_scores) / num_folds

    # Print average metrics for cross-validation
    print("Average AUC:", avg_auc)
    print("Average Accuracy:", avg_accuracy)
    print("---------------------------------------------")
    
    # Train the model on the entire training data
    model.fit(X_train, y_train)

    # Predict on the test data
    y_test_pred = model.predict(X_test_70)
    y_train_pred = model.predict(X_train)

    # Calculate AUC and accuracy for test and training data
    train_auc = roc_auc_score(y_train, y_train_pred)
    test_auc = roc_auc_score(y_test_70, y_test_pred)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test_70, y_test_pred)

    # Print metrics for training and test data
    print("Training AUC:", train_auc)
    print("Test AUC:", test_auc)
    print("Training Accuracy:", train_accuracy)
    print("Test Accuracy:", test_accuracy)
    print("=============================================")


Number of Folds: 5
Average AUC: 0.7089184576138878
Average Accuracy: 0.7082410323955656
---------------------------------------------
Training AUC: 0.5173141545774937
Test AUC: 0.5172252420263554
Training Accuracy: 0.5173141545774936
Test Accuracy: 0.7671421607070943
Number of Folds: 10
Average AUC: 0.6554717402814798
Average Accuracy: 0.6559528667533191
---------------------------------------------
Training AUC: 0.5173141545774937
Test AUC: 0.5172252420263554
Training Accuracy: 0.5173141545774936
Test Accuracy: 0.7671421607070943
Number of Folds: 20
Average AUC: 0.654699152850934
Average Accuracy: 0.6566492921471313
---------------------------------------------
Training AUC: 0.5173141545774937
Test AUC: 0.5172252420263554
Training Accuracy: 0.5173141545774936
Test Accuracy: 0.7671421607070943


### Decision Tree

#### 80/20 Split

In [15]:
# Splitting the dataset into features (X) and target variable (y)
X_train = resampled_df_80.drop("PolicyIssued", axis=1)
y_train = resampled_df_80["PolicyIssued"]


# Initialise decision tree classifier model
model = DecisionTreeClassifier()

# Fit the model to the training data
model.fit(X_train, y_train)

# Predict on the test data
y_test_pred = model.predict(X_test_80)
y_train_pred = model.predict(X_train)

# Calculate AUC and accuracy for test and training data
train_auc = roc_auc_score(y_train, y_train_pred)
test_auc = roc_auc_score(y_test_80, y_test_pred)
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test_80, y_test_pred)

# Print metrics for training and test data
print("Training AUC:", train_auc)
print("Test AUC:", test_auc)
print("Training Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)
print("=============================================")


Training AUC: 0.8793567125645438
Test AUC: 0.7078332440793887
Training Accuracy: 0.8793567125645438
Test Accuracy: 0.6830826948144851


#### Cross-Validation

In [16]:
# Splitting the dataset into features (X) and target variable (y)
X_train = resampled_df_80.drop("PolicyIssued", axis=1)
y_train = resampled_df_80["PolicyIssued"]

# Initialise decision tree classifier model
model = DecisionTreeClassifier()

# Lists to store evaluation metrics for each fold
auc_scores = []
accuracy_scores = []

for num_folds in num_folds_list:
    print(f"Number of Folds: {num_folds}")

    # Initialise k-fold cross-validation
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

    # Lists to store evaluation metrics for each fold
    fold_auc_scores = []
    fold_accuracy_scores = []

    # Perform k-fold cross-validation
    for train_index, val_index in kf.split(X_train):
        X_fold_train, y_fold_train = X_train.iloc[train_index], y_train.iloc[train_index]
        X_fold_val, y_fold_val = X_train.iloc[val_index], y_train.iloc[val_index]

        # Train the model on the fold training data
        model.fit(X_fold_train, y_fold_train)

        # Predict on the fold validation data
        y_fold_val_pred = model.predict(X_fold_val)

        # Calculate AUC and accuracy for the fold
        fold_auc = roc_auc_score(y_fold_val, y_fold_val_pred)
        fold_accuracy = accuracy_score(y_fold_val, y_fold_val_pred)

        fold_auc_scores.append(fold_auc)
        fold_accuracy_scores.append(fold_accuracy)

    # Calculate average AUC and accuracy for the current number of folds
    avg_auc = sum(fold_auc_scores) / num_folds
    avg_accuracy = sum(fold_accuracy_scores) / num_folds

    # Print average metrics for cross-validation
    print("Average AUC:", avg_auc)
    print("Average Accuracy:", avg_accuracy)
    print("---------------------------------------------")
    
    # Train the model on the entire training data
    model.fit(X_train, y_train)

    # Predict on the test data
    y_test_pred = model.predict(X_test_80)
    y_train_pred = model.predict(X_train)

    # Calculate AUC and accuracy for test and training data
    train_auc = roc_auc_score(y_train, y_train_pred)
    test_auc = roc_auc_score(y_test_80, y_test_pred)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test_80, y_test_pred)

    # Print metrics for training and test data
    print("Training AUC:", train_auc)
    print("Test AUC:", test_auc)
    print("Training Accuracy:", train_accuracy)
    print("Test Accuracy:", test_accuracy)
    print("=============================================")


Number of Folds: 5
Average AUC: 0.6964768287083458
Average Accuracy: 0.6965362547589228
---------------------------------------------
Training AUC: 0.8793567125645438
Test AUC: 0.7080883126828275
Training Accuracy: 0.8793567125645438
Test Accuracy: 0.6835595968588052
Number of Folds: 10
Average AUC: 0.6985053196590707
Average Accuracy: 0.6985443466812582
---------------------------------------------
Training AUC: 0.8793567125645438
Test AUC: 0.70782187495247
Training Accuracy: 0.8793567125645438
Test Accuracy: 0.6834642164499412
Number of Folds: 20
Average AUC: 0.6986879735957178
Average Accuracy: 0.6987593129495717
---------------------------------------------
Training AUC: 0.8793567125645438
Test AUC: 0.7063884919898656
Training Accuracy: 0.8793567125645438
Test Accuracy: 0.6825104123613009


#### 70/30 Split

In [17]:
# Splitting the dataset into features (X) and target variable (y)
X_train = resampled_df_70.drop("PolicyIssued", axis=1)
y_train = resampled_df_70["PolicyIssued"]


# Initialise decision tree classifier model
model = DecisionTreeClassifier()

# Fit the model to the training data
model.fit(X_train, y_train)

# Predict on the test data
y_test_pred = model.predict(X_test_70)
y_train_pred = model.predict(X_train)

# Calculate AUC and accuracy for test and training data
train_auc = roc_auc_score(y_train, y_train_pred)
test_auc = roc_auc_score(y_test_70, y_test_pred)
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test_70, y_test_pred)

# Print metrics for training and test data
print("Training AUC:", train_auc)
print("Test AUC:", test_auc)
print("Training Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)
print("=============================================")


Training AUC: 0.8844561921154003
Test AUC: 0.6996694263753294
Training Accuracy: 0.8844561921154004
Test Accuracy: 0.673435214820153


#### Cross Validation

In [18]:
# Splitting the dataset into features (X) and target variable (y)
X_train = resampled_df_70.drop("PolicyIssued", axis=1)
y_train = resampled_df_70["PolicyIssued"]

# Initialise decision tree classifier model
model = DecisionTreeClassifier()

# Lists to store evaluation metrics for each fold
auc_scores = []
accuracy_scores = []

for num_folds in num_folds_list:
    print(f"Number of Folds: {num_folds}")

    # Initialise k-fold cross-validation
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

    # Lists to store evaluation metrics for each fold
    fold_auc_scores = []
    fold_accuracy_scores = []

    # Perform k-fold cross-validation
    for train_index, val_index in kf.split(X_train):
        X_fold_train, y_fold_train = X_train.iloc[train_index], y_train.iloc[train_index]
        X_fold_val, y_fold_val = X_train.iloc[val_index], y_train.iloc[val_index]

        # Train the model on the fold training data
        model.fit(X_fold_train, y_fold_train)

        # Predict on the fold validation data
        y_fold_val_pred = model.predict(X_fold_val)

        # Calculate AUC and accuracy for the fold
        fold_auc = roc_auc_score(y_fold_val, y_fold_val_pred)
        fold_accuracy = accuracy_score(y_fold_val, y_fold_val_pred)

        fold_auc_scores.append(fold_auc)
        fold_accuracy_scores.append(fold_accuracy)

    # Calculate average AUC and accuracy for the current number of folds
    avg_auc = sum(fold_auc_scores) / num_folds
    avg_accuracy = sum(fold_accuracy_scores) / num_folds

    # Print average metrics for cross-validation
    print("Average AUC:", avg_auc)
    print("Average Accuracy:", avg_accuracy)
    print("---------------------------------------------")
    
    # Train the model on the entire training data
    model.fit(X_train, y_train)

    # Predict on the test data
    y_test_pred = model.predict(X_test_70)
    y_train_pred = model.predict(X_train)

    # Calculate AUC and accuracy for test and training data
    train_auc = roc_auc_score(y_train, y_train_pred)
    test_auc = roc_auc_score(y_test_70, y_test_pred)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test_70, y_test_pred)

    # Print metrics for training and test data
    print("Training AUC:", train_auc)
    print("Test AUC:", test_auc)
    print("Training Accuracy:", train_accuracy)
    print("Test Accuracy:", test_accuracy)
    print("=============================================")


Number of Folds: 5
Average AUC: 0.7011735211765058
Average Accuracy: 0.7011721853108976
---------------------------------------------
Training AUC: 0.8844561921154003
Test AUC: 0.6998137790489135
Training Accuracy: 0.8844561921154004
Test Accuracy: 0.6731808643676211
Number of Folds: 10
Average AUC: 0.7006441573612532
Average Accuracy: 0.7006394357847927
---------------------------------------------
Training AUC: 0.8844561921154003
Test AUC: 0.6997308528510268
Training Accuracy: 0.8844561921154004
Test Accuracy: 0.6734776065622416
Number of Folds: 20
Average AUC: 0.7009143267449531
Average Accuracy: 0.7009675975984042
---------------------------------------------
Training AUC: 0.8844561921154003
Test AUC: 0.7004721668656131
Training Accuracy: 0.8844561921154004
Test Accuracy: 0.6748977299222112


### Random Forest

#### 80/20 Split

In [19]:
# Splitting the dataset into features (X) and target variable (y)
X_train = resampled_df_80.drop("PolicyIssued", axis=1)
y_train = resampled_df_80["PolicyIssued"]

# Initialise Random Forest classifier model
model = RandomForestClassifier()

# Fit the model to the training data
model.fit(X_train, y_train)

# Predict on the test data
y_test_pred = model.predict(X_test_80)
y_train_pred = model.predict(X_train)

# Calculate AUC and accuracy for test and training data
train_auc = roc_auc_score(y_train, y_train_pred)
test_auc = roc_auc_score(y_test_80, y_test_pred)
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test_80, y_test_pred)

# Print metrics for training and test data
print("Training AUC:", train_auc)
print("Test AUC:", test_auc)
print("Training Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)
print("=============================================")


Training AUC: 0.8793208548479633
Test AUC: 0.7193656604863378
Training Accuracy: 0.8793208548479633
Test Accuracy: 0.707023177439354


#### Cross Validation

In [20]:
# Splitting the dataset into features (X) and target variable (y)
X_train = resampled_df_80.drop("PolicyIssued", axis=1)
y_train = resampled_df_80["PolicyIssued"]

# Initialise decision tree classifier model
model = DecisionTreeClassifier()

# Lists to store evaluation metrics for each fold
auc_scores = []
accuracy_scores = []

for num_folds in num_folds_list:
    print(f"Number of Folds: {num_folds}")

    # Initialise k-fold cross-validation
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

    # Lists to store evaluation metrics for each fold
    fold_auc_scores = []
    fold_accuracy_scores = []

    # Perform k-fold cross-validation
    for train_index, val_index in kf.split(X_train):
        X_fold_train, y_fold_train = X_train.iloc[train_index], y_train.iloc[train_index]
        X_fold_val, y_fold_val = X_train.iloc[val_index], y_train.iloc[val_index]

        # Train the model on the fold training data
        model.fit(X_fold_train, y_fold_train)

        # Predict on the fold validation data
        y_fold_val_pred = model.predict(X_fold_val)

        # Calculate AUC and accuracy for the fold
        fold_auc = roc_auc_score(y_fold_val, y_fold_val_pred)
        fold_accuracy = accuracy_score(y_fold_val, y_fold_val_pred)

        fold_auc_scores.append(fold_auc)
        fold_accuracy_scores.append(fold_accuracy)

    # Calculate average AUC and accuracy for the current number of folds
    avg_auc = sum(fold_auc_scores) / num_folds
    avg_accuracy = sum(fold_accuracy_scores) / num_folds

    # Print average metrics for cross-validation
    print("Average AUC:", avg_auc)
    print("Average Accuracy:", avg_accuracy)
    print("---------------------------------------------")
    
    # Train the model on the entire training data
    model.fit(X_train, y_train)

    # Predict on the test data
    y_test_pred = model.predict(X_test_80)
    y_train_pred = model.predict(X_train)

    # Calculate AUC and accuracy for test and training data
    train_auc = roc_auc_score(y_train, y_train_pred)
    test_auc = roc_auc_score(y_test_80, y_test_pred)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test_80, y_test_pred)

    # Print metrics for training and test data
    print("Training AUC:", train_auc)
    print("Test AUC:", test_auc)
    print("Training Accuracy:", train_accuracy)
    print("Test Accuracy:", test_accuracy)
    print("=============================================")


Number of Folds: 5
Average AUC: 0.6968974908712913
Average Accuracy: 0.6969665165014829
---------------------------------------------
Training AUC: 0.8793567125645438
Test AUC: 0.7087451207081971
Training Accuracy: 0.8793567125645438
Test Accuracy: 0.6834642164499412
Number of Folds: 10
Average AUC: 0.6984788541782001
Average Accuracy: 0.6985084433177216
---------------------------------------------
Training AUC: 0.8793567125645438
Test AUC: 0.7081514360909298
Training Accuracy: 0.8793567125645438
Test Accuracy: 0.6830191078752424
Number of Folds: 20
Average AUC: 0.6988723939976013
Average Accuracy: 0.6989565676903473
---------------------------------------------
Training AUC: 0.8793567125645438
Test AUC: 0.7074038771810415
Training Accuracy: 0.8793567125645438
Test Accuracy: 0.6825739993005436


#### 70/30 Split

In [21]:
# Splitting the dataset into features (X) and target variable (y)
X_train = resampled_df_70.drop("PolicyIssued", axis=1)
y_train = resampled_df_70["PolicyIssued"]

# Initialise Random Forest classifier model
model = RandomForestClassifier()

# Fit the model to the training data
model.fit(X_train, y_train)

# Predict on the test data
y_test_pred = model.predict(X_test_70)
y_train_pred = model.predict(X_train)

# Calculate AUC and accuracy for test and training data
train_auc = roc_auc_score(y_train, y_train_pred)
test_auc = roc_auc_score(y_test_70, y_test_pred)
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test_70, y_test_pred)

# Print metrics for training and test data
print("Training AUC:", train_auc)
print("Test AUC:", test_auc)
print("Training Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)
print("=============================================")


Training AUC: 0.8844152118678797
Test AUC: 0.7081115495007714
Training Accuracy: 0.8844152118678796
Test Accuracy: 0.6983403632972297


#### Cross Validation

In [22]:
# Splitting the dataset into features (X) and target variable (y)
X_train = resampled_df_70.drop("PolicyIssued", axis=1)
y_train = resampled_df_70["PolicyIssued"]

# Initialise decision tree classifier model
model = DecisionTreeClassifier()

# Lists to store evaluation metrics for each fold
auc_scores = []
accuracy_scores = []

for num_folds in num_folds_list:
    print(f"Number of Folds: {num_folds}")

    # Initialise k-fold cross-validation
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

    # Lists to store evaluation metrics for each fold
    fold_auc_scores = []
    fold_accuracy_scores = []

    # Perform k-fold cross-validation
    for train_index, val_index in kf.split(X_train):
        X_fold_train, y_fold_train = X_train.iloc[train_index], y_train.iloc[train_index]
        X_fold_val, y_fold_val = X_train.iloc[val_index], y_train.iloc[val_index]

        # Train the model on the fold training data
        model.fit(X_fold_train, y_fold_train)

        # Predict on the fold validation data
        y_fold_val_pred = model.predict(X_fold_val)

        # Calculate AUC and accuracy for the fold
        fold_auc = roc_auc_score(y_fold_val, y_fold_val_pred)
        fold_accuracy = accuracy_score(y_fold_val, y_fold_val_pred)

        fold_auc_scores.append(fold_auc)
        fold_accuracy_scores.append(fold_accuracy)

    # Calculate average AUC and accuracy for the current number of folds
    avg_auc = sum(fold_auc_scores) / num_folds
    avg_accuracy = sum(fold_accuracy_scores) / num_folds

    # Print average metrics for cross-validation
    print("Average AUC:", avg_auc)
    print("Average Accuracy:", avg_accuracy)
    print("---------------------------------------------")
    
    # Train the model on the entire training data
    model.fit(X_train, y_train)

    # Predict on the test data
    y_test_pred = model.predict(X_test_70)
    y_train_pred = model.predict(X_train)

    # Calculate AUC and accuracy for test and training data
    train_auc = roc_auc_score(y_train, y_train_pred)
    test_auc = roc_auc_score(y_test_70, y_test_pred)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test_70, y_test_pred)

    # Print metrics for training and test data
    print("Training AUC:", train_auc)
    print("Test AUC:", test_auc)
    print("Training Accuracy:", train_accuracy)
    print("Test Accuracy:", test_accuracy)
    print("=============================================")


Number of Folds: 5
Average AUC: 0.7003330038543739
Average Accuracy: 0.7003320843571188
---------------------------------------------
Training AUC: 0.8844561921154003
Test AUC: 0.6998606675304261
Training Accuracy: 0.8844561921154004
Test Accuracy: 0.6735199983043303
Number of Folds: 10
Average AUC: 0.700855899920979
Average Accuracy: 0.7008443160330623
---------------------------------------------
Training AUC: 0.8844561921154003
Test AUC: 0.6998733619100874
Training Accuracy: 0.8844561921154004
Test Accuracy: 0.6738591322410394
Number of Folds: 20
Average AUC: 0.7010076282977952
Average Accuracy: 0.7010495732063586
---------------------------------------------
Training AUC: 0.8844561921154003
Test AUC: 0.7001941104754964
Training Accuracy: 0.8844561921154004
Test Accuracy: 0.674092286822527


### LightGBM

#### 80/20 Split

In [23]:
# Splitting the dataset into features (X) and target variable (y)
X_train = resampled_df_80.drop("PolicyIssued", axis=1)
y_train = resampled_df_80["PolicyIssued"]

# Initialise LightGBM classifier model
model = lgb.LGBMClassifier(random_state=42)

# Fit the model to the training data
model.fit(X_train, y_train)

# Predict on the test data
y_test_pred = model.predict(X_test_80)
y_train_pred = model.predict(X_train)

# Calculate AUC and accuracy for test and training data
train_auc = roc_auc_score(y_train, y_train_pred)
test_auc = roc_auc_score(y_test_80, y_test_pred)
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test_80, y_test_pred)

# Print metrics for training and test data
print("Training AUC:", train_auc)
print("Test AUC:", test_auc)
print("Training Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)
print("=============================================")


Training AUC: 0.7796722604704532
Test AUC: 0.7541332386212704
Training Accuracy: 0.7796722604704532
Test Accuracy: 0.7203446412106953


#### Cross Validation

In [24]:
# Splitting the dataset into features (X) and target variable (y)
X_train = resampled_df_80.drop("PolicyIssued", axis=1)
y_train = resampled_df_80["PolicyIssued"]

# Initialise LightGBM classifier model
model = lgb.LGBMClassifier(random_state=42)

# Lists to store evaluation metrics for each fold
auc_scores = []
accuracy_scores = []

for num_folds in num_folds_list:
    print(f"Number of Folds: {num_folds}")

    # Initialise k-fold cross-validation
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

    # Lists to store evaluation metrics for each fold
    fold_auc_scores = []
    fold_accuracy_scores = []

    # Perform k-fold cross-validation
    for train_index, val_index in kf.split(X_train):
        X_fold_train, y_fold_train = X_train.iloc[train_index], y_train.iloc[train_index]
        X_fold_val, y_fold_val = X_train.iloc[val_index], y_train.iloc[val_index]

        # Initialise LightGBM classifier model
        model = lgb.LGBMClassifier(random_state=42)

        # Train the model on the fold training data
        model.fit(X_fold_train, y_fold_train)

        # Predict on the fold validation data
        y_fold_val_pred = model.predict(X_fold_val)

        # Calculate AUC and accuracy for the fold
        fold_auc = roc_auc_score(y_fold_val, y_fold_val_pred)
        fold_accuracy = accuracy_score(y_fold_val, y_fold_val_pred)

        fold_auc_scores.append(fold_auc)
        fold_accuracy_scores.append(fold_accuracy)

    # Calculate average AUC and accuracy for the current number of folds
    avg_auc = sum(fold_auc_scores) / num_folds
    avg_accuracy = sum(fold_accuracy_scores) / num_folds

    # Print average metrics for cross-validation
    print("Average AUC:", avg_auc)
    print("Average Accuracy:", avg_accuracy)
    print("---------------------------------------------")
    
    # Train the model on the entire training data
    model.fit(X_train, y_train)

    # Predict on the test data
    y_test_pred = model.predict(X_test_80)
    y_train_pred = model.predict(X_train)

    # Calculate AUC and accuracy for test and training data
    train_auc = roc_auc_score(y_train, y_train_pred)
    test_auc = roc_auc_score(y_test_80, y_test_pred)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test_80, y_test_pred)

    # Print metrics for training and test data
    print("Training AUC:", train_auc)
    print("Test AUC:", test_auc)
    print("Training Accuracy:", train_accuracy)
    print("Test Accuracy:", test_accuracy)
    print("=============================================")


Number of Folds: 5
Average AUC: 0.751098557229944
Average Accuracy: 0.7510936896069418
---------------------------------------------
Training AUC: 0.7796722604704532
Test AUC: 0.7541332386212704
Training Accuracy: 0.7796722604704532
Test Accuracy: 0.7203446412106953
Number of Folds: 10
Average AUC: 0.7504231258724677
Average Accuracy: 0.7504482182461607
---------------------------------------------
Training AUC: 0.7796722604704532
Test AUC: 0.7541332386212704
Training Accuracy: 0.7796722604704532
Test Accuracy: 0.7203446412106953
Number of Folds: 20
Average AUC: 0.7513976071828362
Average Accuracy: 0.7514700545234841
---------------------------------------------
Training AUC: 0.7796722604704532
Test AUC: 0.7541332386212704
Training Accuracy: 0.7796722604704532
Test Accuracy: 0.7203446412106953


#### 70/30 Split

In [25]:
# Splitting the dataset into features (X) and target variable (y)
X_train = resampled_df_70.drop("PolicyIssued", axis=1)
y_train = resampled_df_70["PolicyIssued"]

# Initialise LightGBM classifier model
model = lgb.LGBMClassifier(random_state=42)

# Fit the model to the training data
model.fit(X_train, y_train)

# Predict on the test data
y_test_pred = model.predict(X_test_70)
y_train_pred = model.predict(X_train)

# Calculate AUC and accuracy for test and training data
train_auc = roc_auc_score(y_train, y_train_pred)
test_auc = roc_auc_score(y_test_70, y_test_pred)
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test_70, y_test_pred)

# Print metrics for training and test data
print("Training AUC:", train_auc)
print("Test AUC:", test_auc)
print("Training Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)
print("=============================================")


Training AUC: 0.7789730349971314
Test AUC: 0.7549012296799502
Training Accuracy: 0.7789730349971313
Test Accuracy: 0.7248140062315861


#### Cross Validation

In [26]:
# Splitting the dataset into features (X) and target variable (y)
X_train = resampled_df_70.drop("PolicyIssued", axis=1)
y_train = resampled_df_70["PolicyIssued"]

# Initialise LightGBM classifier model
model = lgb.LGBMClassifier(random_state=42)

# Lists to store evaluation metrics for each fold
auc_scores = []
accuracy_scores = []

for num_folds in num_folds_list:
    print(f"Number of Folds: {num_folds}")

    # Initialise k-fold cross-validation
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

    # Lists to store evaluation metrics for each fold
    fold_auc_scores = []
    fold_accuracy_scores = []

    # Perform k-fold cross-validation
    for train_index, val_index in kf.split(X_train):
        X_fold_train, y_fold_train = X_train.iloc[train_index], y_train.iloc[train_index]
        X_fold_val, y_fold_val = X_train.iloc[val_index], y_train.iloc[val_index]

        # Initialise LightGBM classifier model
        model = lgb.LGBMClassifier(random_state=42)

        # Train the model on the fold training data
        model.fit(X_fold_train, y_fold_train)

        # Predict on the fold validation data
        y_fold_val_pred = model.predict(X_fold_val)

        # Calculate AUC and accuracy for the fold
        fold_auc = roc_auc_score(y_fold_val, y_fold_val_pred)
        fold_accuracy = accuracy_score(y_fold_val, y_fold_val_pred)

        fold_auc_scores.append(fold_auc)
        fold_accuracy_scores.append(fold_accuracy)

    # Calculate average AUC and accuracy for the current number of folds
    avg_auc = sum(fold_auc_scores) / num_folds
    avg_accuracy = sum(fold_accuracy_scores) / num_folds

    # Print average metrics for cross-validation
    print("Average AUC:", avg_auc)
    print("Average Accuracy:", avg_accuracy)
    print("---------------------------------------------")
    
    # Train the model on the entire training data
    model.fit(X_train, y_train)

    # Predict on the test data
    y_test_pred = model.predict(X_test_70)
    y_train_pred = model.predict(X_train)

    # Calculate AUC and accuracy for test and training data
    train_auc = roc_auc_score(y_train, y_train_pred)
    test_auc = roc_auc_score(y_test_70, y_test_pred)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test_70, y_test_pred)

    # Print metrics for training and test data
    print("Training AUC:", train_auc)
    print("Test AUC:", test_auc)
    print("Training Accuracy:", train_accuracy)
    print("Test Accuracy:", test_accuracy)
    print("=============================================")


Number of Folds: 5
Average AUC: 0.7502678757255858
Average Accuracy: 0.7502665152052078
---------------------------------------------
Training AUC: 0.7789730349971314
Test AUC: 0.7549012296799502
Training Accuracy: 0.7789730349971313
Test Accuracy: 0.7248140062315861
Number of Folds: 10
Average AUC: 0.750799771204703
Average Accuracy: 0.7507788060764222
---------------------------------------------
Training AUC: 0.7789730349971314
Test AUC: 0.7549012296799502
Training Accuracy: 0.7789730349971313
Test Accuracy: 0.7248140062315861
Number of Folds: 20
Average AUC: 0.7506424820595379
Average Accuracy: 0.7506967381011547
---------------------------------------------
Training AUC: 0.7789730349971314
Test AUC: 0.7549012296799502
Training Accuracy: 0.7789730349971313
Test Accuracy: 0.7248140062315861


## Neural Network

#### 8020 Split

Loop over epochs

In [27]:
# Splitting the dataset into features (X) and target variable (y)
X_train = resampled_df_80.drop("PolicyIssued", axis=1)
y_train = resampled_df_80["PolicyIssued"]

# List of epochs to iterate over
epochs_list = [10, 20, 30] 

# Create empty lists to store results
results = []

for epochs in epochs_list:
    print(f" Epochs: {epochs}")

    # Initialise a neural network model
    model = Sequential([
        Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
        Dense(32, activation='relu'),
        Dense(1, activation='sigmoid')
    ])

    # Compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Train the model
    history = model.fit(X_train, y_train, epochs=epochs, batch_size=32, verbose=1, validation_split=0.2)

    # Evaluate the model on the separate test data
    test_loss, test_accuracy = model.evaluate(X_test_80, y_test_80, verbose=0)

    # Print metrics for test data
    print("Test Loss:", test_loss)
    print("Test Accuracy:", test_accuracy)
    print("=============================================")


 Epochs: 10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 0.5766180753707886
Test Accuracy: 0.6811751127243042
 Epochs: 20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Test Loss: 0.7765907645225525
Test Accuracy: 0.6449941396713257
 Epochs: 30
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30


Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Test Loss: 0.6783196926116943
Test Accuracy: 0.6749117970466614


#### 70/30 Split

In [28]:
# Splitting the dataset into features (X) and target variable (y)
X_train = resampled_df_70.drop("PolicyIssued", axis=1)
y_train = resampled_df_70["PolicyIssued"]

# List of epochs to iterate over
epochs_list = [10, 20, 30] 

# Create empty lists to store results
results = []

for epochs in epochs_list:
    print(f" Epochs: {epochs}")

    # Initialise a neural network model
    model = Sequential([
        Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
        Dense(32, activation='relu'),
        Dense(1, activation='sigmoid')
    ])

    # Compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Train the model
    history = model.fit(X_train, y_train, epochs=epochs, batch_size=32, verbose=1, validation_split=0.2)

    # Evaluate the model on the separate test data
    test_loss, test_accuracy = model.evaluate(X_test_70, y_test_70, verbose=0)

    # Print metrics for test data
    print("Test Loss:", test_loss)
    print("Test Accuracy:", test_accuracy)
    print("=============================================")


 Epochs: 10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 0.5429829359054565
Test Accuracy: 0.6856016516685486
 Epochs: 20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Test Loss: 0.7943688631057739
Test Accuracy: 0.6491235494613647
 Epochs: 30
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30


Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Test Loss: 0.7099330425262451
Test Accuracy: 0.670997679233551


In [29]:
# List of epochs to iterate over
epochs_list = [10, 20, 30]
    
# Splitting the dataset into features (X) and target variable (y)
X_train = resampled_df_80.drop("PolicyIssued", axis=1)
y_train = resampled_df_80["PolicyIssued"]

# Train the model on the entire training data using a neural network
model = Sequential()
model.add(Dense(units=64, activation='relu', input_dim=len(X.columns)))
model.add(Dense(units=1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Loop over epochs
for epochs in epochs_list:
    print(f"Epochs: {epochs}")

    model.fit(X_train, y_train, epochs=epochs, batch_size=32, verbose=0)

    # Predict on the test data
    y_pred_train = np.round(model.predict(X_train)).astype(int)
    y_pred_test = np.round(model.predict(X_test_80)).astype(int)

    # Calculate Training AUC and Test AUC
    train_auc = roc_auc_score(y_train, model.predict(X_train))
    test_auc = roc_auc_score(y_test_80, y_pred_test)

    # Calculate Training Accuracy and Test Accuracy
    train_accuracy = accuracy_score(y_train, y_pred_train)
    test_accuracy = accuracy_score(y_test_80, y_pred_test)

    # Print evaluation metrics for training and test data
    print("Training AUC:", train_auc)
    print("Test AUC:", test_auc)
    print("Training Accuracy:", train_accuracy)
    print("Test Accuracy:", test_accuracy)
    print("---------------------------------------------")

Epochs: 10
Training AUC: 0.7889761222220659
Test AUC: 0.7479393816315171
Training Accuracy: 0.7464859437751004
Test Accuracy: 0.6773280768130227
---------------------------------------------
Epochs: 20
Training AUC: 0.7901123501155676
Test AUC: 0.73708744306326
Training Accuracy: 0.736499569707401
Test Accuracy: 0.7085810574507996
---------------------------------------------
Epochs: 30
Training AUC: 0.7877462654177895
Test AUC: 0.7393334124748775
Training Accuracy: 0.7389020367183018
Test Accuracy: 0.6594601468858297
---------------------------------------------


In [30]:
# List of epochs to iterate over
epochs_list = [10, 20, 30]
    
# Splitting the dataset into features (X) and target variable (y)
X_train = resampled_df_70.drop("PolicyIssued", axis=1)
y_train = resampled_df_70["PolicyIssued"]

# Train the model on the entire training data using a neural network
model = Sequential()
model.add(Dense(units=64, activation='relu', input_dim=len(X.columns)))
model.add(Dense(units=1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Loop over epochs
for epochs in epochs_list:
    print(f"Epochs: {epochs}")

    model.fit(X_train, y_train, epochs=epochs, batch_size=32, verbose=0)

    # Predict on the test data
    y_pred_train = np.round(model.predict(X_train)).astype(int)
    y_pred_test = np.round(model.predict(X_test_70)).astype(int)

    # Calculate Training AUC and Test AUC
    train_auc = roc_auc_score(y_train, model.predict(X_train))
    test_auc = roc_auc_score(y_test_70, y_pred_test)

    # Calculate Training Accuracy and Test Accuracy
    train_accuracy = accuracy_score(y_train, y_pred_train)
    test_accuracy = accuracy_score(y_test_70, y_pred_test)

    # Print evaluation metrics for training and test data
    print("Training AUC:", train_auc)
    print("Test AUC:", test_auc)
    print("Training Accuracy:", train_accuracy)
    print("Test Accuracy:", test_accuracy)
    print("---------------------------------------------")

Epochs: 10
Training AUC: 0.7296809203229185
Test AUC: 0.5594156638249952
Training Accuracy: 0.5610195885583149
Test Accuracy: 0.7523898344602471
---------------------------------------------
Epochs: 20
Training AUC: 0.7888851361448059
Test AUC: 0.680301119174529
Training Accuracy: 0.6808458323088271
Test Accuracy: 0.760974162233197
---------------------------------------------
Epochs: 30
Training AUC: 0.7968319260895693
Test AUC: 0.7472703724599737
Training Accuracy: 0.7461478567330546
Test Accuracy: 0.6795608215519616
---------------------------------------------
