In [32]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier

import lightgbm as lgb

from imblearn.over_sampling import BorderlineSMOTE

from keras.models import Sequential
from keras.layers import Dense

import warnings
from sklearn.exceptions import ConvergenceWarning

# Ignore ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [22]:
df = pd.read_excel('Dataset.xlsx')

## Clean Data

In [23]:
# Get the list of columns to convert to categorical
categorical_columns = df.select_dtypes(include='int64').columns.tolist()

# Convert the selected columns to categorical
df[categorical_columns] = df[categorical_columns].astype('category')

numeric_data = df.select_dtypes(include=[np.number])

# Calculate MAD for each column
mad = numeric_data.mad()

# Choose a threshold multiplier
k = 3

# Calculate the threshold value
threshold = k * mad

# Identify outliers
outliers = (np.abs(numeric_data - numeric_data.median()) > threshold)

# Apply logarithm to the specified columns
outlier_columns = ['CommissionSacrificePercentage', 'BonusCommissionPercentage']
for column in outlier_columns:
    df[column] = np.log1p(df[column])
    
df = df.drop(columns=['PropDate'])

### BorderlineSMOTE

In [24]:
# Splitting the dataset into features (X) and target variable (y)
X = df.drop("PolicyIssued", axis=1)
y = df["PolicyIssued"]

# Splitting the data into training and testing sets
X_train_80, X_test_80, y_train_80, y_test_80 = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

X_train_70, X_test_70, y_train_70, y_test_70 = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Scaling features
scaler = StandardScaler()
X_train_scaled_80 = scaler.fit_transform(X_train_80)
X_test_scaled_80 = scaler.transform(X_test_80)

X_train_scaled_70 = scaler.fit_transform(X_train_70)
X_test_scaled_70 = scaler.transform(X_test_70)

# Applying BorderlineSMOTE to the training data
borderline_smote = BorderlineSMOTE(random_state=42)
X_resampled_80, y_resampled_80 = smote.fit_resample(X_train_scaled_80, y_train_80)

X_resampled_70, y_resampled_70 = smote.fit_resample(X_train_scaled_70, y_train_70)

# Convert the resampled arrays back to a dataframe
resampled_df_80 = pd.DataFrame(X_resampled_80, columns=X_train_80.columns)
resampled_df_80["PolicyIssued"] = y_resampled_80

resampled_df_70 = pd.DataFrame(X_resampled_70, columns=X_train_70.columns)
resampled_df_70["PolicyIssued"] = y_resampled_70

# Check the class distribution in the resampled dataset
print("Class distribution in resampled dataset:")
print(resampled_df_80["PolicyIssued"].value_counts())
print(resampled_df_70["PolicyIssued"].value_counts())

Class distribution in resampled dataset:
0    97922
1    97922
Name: PolicyIssued, dtype: int64
0    85682
1    85682
Name: PolicyIssued, dtype: int64


# All Features

## Logistic Regression

#### 80/20 Split

In [25]:
# Splitting the dataset into features (X) and target variable (y)
X_train = resampled_df_80.drop("PolicyIssued", axis=1)
y_train = resampled_df_80["PolicyIssued"]

# Initialise and train logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test_scaled_80)

# Calculate AUC
auc = roc_auc_score(y_test_80, y_pred)

# Calculate accuracy
accuracy = accuracy_score(y_test_80, y_pred)

# Evaluate the model
confusion_mat = confusion_matrix(y_test_80, y_pred)
class_report = classification_report(y_test_80, y_pred)

# Evaluate the model
confusion_mat = confusion_matrix(y_test_80, y_pred)
class_report = classification_report(y_test_80, y_pred)

print("Confusion Matrix:\n", confusion_mat)
print("\nClassification Report:\n", class_report)
print("\nAUC:", auc)
print("Accuracy:", accuracy)
print("---------------------------------------------")

Confusion Matrix:
 [[ 6088   884]
 [ 9179 15302]]

Classification Report:
               precision    recall  f1-score   support

           0       0.40      0.87      0.55      6972
           1       0.95      0.63      0.75     24481

    accuracy                           0.68     31453
   macro avg       0.67      0.75      0.65     31453
weighted avg       0.82      0.68      0.71     31453


AUC: 0.7491316400886301
Accuracy: 0.6800623152004578
---------------------------------------------




#### Cross Validation

In [29]:
# Splitting the dataset into features (X) and target variable (y)
X_train = resampled_df_80.drop("PolicyIssued", axis=1)
y_train = resampled_df_80["PolicyIssued"]

# List of k values for k-fold cross-validation
num_folds_list = [5, 10, 20]

# Initialise logistic regression model
model = LogisticRegression()

# Lists to store evaluation metrics for each fold
auc_scores = []
accuracy_scores = []

for num_folds in num_folds_list:
    print(f"Number of Folds: {num_folds}")
    
    # Initialise k-fold cross-validation
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

    # Lists to store evaluation metrics for each fold
    fold_auc_scores = []
    fold_accuracy_scores = []

    # Perform k-fold cross-validation
    for train_index, val_index in kf.split(X_train):
        X_fold_train, y_fold_train = X_train.iloc[train_index], y_train.iloc[train_index]
        X_fold_val, y_fold_val = X_train.iloc[val_index], y_train.iloc[val_index]

        # Train the model on the fold training data
        model.fit(X_fold_train, y_fold_train)

        # Predict on the fold validation data
        y_fold_val_pred = model.predict(X_fold_val)

        # Calculate AUC and accuracy for the fold
        fold_auc = roc_auc_score(y_fold_val, y_fold_val_pred)
        fold_accuracy = accuracy_score(y_fold_val, y_fold_val_pred)

        fold_auc_scores.append(fold_auc)
        fold_accuracy_scores.append(fold_accuracy)

    # Calculate average AUC and accuracy for the current number of folds
    avg_auc = sum(fold_auc_scores) / num_folds
    avg_accuracy = sum(fold_accuracy_scores) / num_folds

    # Print average metrics for cross-validation
    print("Average AUC:", avg_auc)
    print("Average Accuracy:", avg_accuracy)
    print("---------------------------------------------")
    
    # Train the model on the entire training data
    model.fit(X_train, y_train)

    # Predict on the test data
    y_pred = model.predict(X_test_scaled_80)

    # Calculate AUC
    auc = roc_auc_score(y_test_80, y_pred)

    # Calculate accuracy
    accuracy = accuracy_score(y_test_80, y_pred)

    # Print evaluation metrics for the test data
    print("Test AUC:", auc)
    print("Test Accuracy:", accuracy)
    print("---------------------------------------------")


Number of Folds: 5
Average AUC: 0.7451126129884722
Average Accuracy: 0.7450981408992405
---------------------------------------------




Test AUC: 0.7491316400886301
Test Accuracy: 0.6800623152004578
---------------------------------------------
Number of Folds: 10
Average AUC: 0.7452153304382247
Average Accuracy: 0.7452053720709627
---------------------------------------------




Test AUC: 0.7491316400886301
Test Accuracy: 0.6800623152004578
---------------------------------------------
Number of Folds: 20
Average AUC: 0.7451369120351143
Average Accuracy: 0.7451288328948448
---------------------------------------------
Test AUC: 0.7491316400886301
Test Accuracy: 0.6800623152004578
---------------------------------------------




#### 70/30 Split

In [27]:
# Splitting the dataset into features (X) and target variable (y)
X_train = resampled_df_70.drop("PolicyIssued", axis=1)
y_train = resampled_df_70["PolicyIssued"]

# Initialise and train logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test_scaled_70)

# Calculate AUC
auc = roc_auc_score(y_test_70, y_pred)

# Calculate accuracy
accuracy = accuracy_score(y_test_70, y_pred)

# Evaluate the model
confusion_mat = confusion_matrix(y_test_70, y_pred)
class_report = classification_report(y_test_70, y_pred)

# Evaluate the model
confusion_mat = confusion_matrix(y_test_70, y_pred)
class_report = classification_report(y_test_70, y_pred)

print("Confusion Matrix:\n", confusion_mat)
print("\nClassification Report:\n", class_report)
print("\nAUC:", auc)
print("Accuracy:", accuracy)
print("---------------------------------------------")

Confusion Matrix:
 [[ 9114  1344]
 [13859 22862]]

Classification Report:
               precision    recall  f1-score   support

           0       0.40      0.87      0.55     10458
           1       0.94      0.62      0.75     36721

    accuracy                           0.68     47179
   macro avg       0.67      0.75      0.65     47179
weighted avg       0.82      0.68      0.70     47179


AUC: 0.747036237321498
Accuracy: 0.6777591725131944
---------------------------------------------




#### Cross Validation

In [30]:
# Splitting the dataset into features (X) and target variable (y)
X_train = resampled_df_80.drop("PolicyIssued", axis=1)
y_train = resampled_df_80["PolicyIssued"]

# List of k values for k-fold cross-validation
num_folds_list = [5, 10, 20]

# Initialise logistic regression model
model = LogisticRegression()

# Lists to store evaluation metrics for each fold
auc_scores = []
accuracy_scores = []

for num_folds in num_folds_list:
    print(f"Number of Folds: {num_folds}")
    
    # Initialise k-fold cross-validation
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

    # Lists to store evaluation metrics for each fold
    fold_auc_scores = []
    fold_accuracy_scores = []

    # Perform k-fold cross-validation
    for train_index, val_index in kf.split(X_train):
        X_fold_train, y_fold_train = X_train.iloc[train_index], y_train.iloc[train_index]
        X_fold_val, y_fold_val = X_train.iloc[val_index], y_train.iloc[val_index]

        # Train the model on the fold training data
        model.fit(X_fold_train, y_fold_train)

        # Predict on the fold validation data
        y_fold_val_pred = model.predict(X_fold_val)

        # Calculate AUC and accuracy for the fold
        fold_auc = roc_auc_score(y_fold_val, y_fold_val_pred)
        fold_accuracy = accuracy_score(y_fold_val, y_fold_val_pred)

        fold_auc_scores.append(fold_auc)
        fold_accuracy_scores.append(fold_accuracy)

    # Calculate average AUC and accuracy for the current number of folds
    avg_auc = sum(fold_auc_scores) / num_folds
    avg_accuracy = sum(fold_accuracy_scores) / num_folds

    # Print average metrics for cross-validation
    print("Average AUC:", avg_auc)
    print("Average Accuracy:", avg_accuracy)
    print("---------------------------------------------")
    
    # Train the model on the entire training data
    model.fit(X_train, y_train)

    # Predict on the test data
    y_pred = model.predict(X_test_scaled_70)

    # Calculate AUC
    auc = roc_auc_score(y_test_70, y_pred)

    # Calculate accuracy
    accuracy = accuracy_score(y_test_70, y_pred)

    # Print evaluation metrics for the test data
    print("Test AUC:", auc)
    print("Test Accuracy:", accuracy)
    print("---------------------------------------------")


Number of Folds: 5
Average AUC: 0.7451126129884722
Average Accuracy: 0.7450981408992405
---------------------------------------------




Test AUC: 0.7479896776751963
Test Accuracy: 0.6791369041310753
---------------------------------------------
Number of Folds: 10
Average AUC: 0.7452153304382247
Average Accuracy: 0.7452053720709627
---------------------------------------------




Test AUC: 0.7479896776751963
Test Accuracy: 0.6791369041310753
---------------------------------------------
Number of Folds: 20
Average AUC: 0.7451369120351143
Average Accuracy: 0.7451288328948448
---------------------------------------------
Test AUC: 0.7479896776751963
Test Accuracy: 0.6791369041310753
---------------------------------------------




### Stochastic Gradient Descent

#### 80/20 Split

In [33]:
# Splitting the dataset into features (X) and target variable (y)
X_train = resampled_df_80.drop("PolicyIssued", axis=1)
y_train = resampled_df_80["PolicyIssued"]

# Initialise SGD classifier model
model = SGDClassifier(loss='log', random_state=42)

# Fit the model to the training data
model.fit(X_train, y_train)

# Predict on the test data
y_test_pred = model.predict(X_test_scaled_80)
y_train_pred = model.predict(X_train)

# Calculate AUC and accuracy for test and training data
train_auc = roc_auc_score(y_train, y_train_pred)
test_auc = roc_auc_score(y_test_80, y_test_pred)
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test_80, y_test_pred)

# Print metrics for training and test data
print("Training AUC:", train_auc)
print("Test AUC:", test_auc)
print("Training Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)
print("=============================================")


Training AUC: 0.7441381916219032
Test AUC: 0.7481123499641426
Training Accuracy: 0.7441381916219032
Test Accuracy: 0.6848631291132802




#### Cross Validation

In [34]:
# Splitting the dataset into features (X) and target variable (y)
X_train = resampled_df_80.drop("PolicyIssued", axis=1)
y_train = resampled_df_80["PolicyIssued"]

# List of k values for k-fold cross-validation
num_folds_list = [5, 10, 20]

# Initialise SGD classifier model
model = SGDClassifier(loss='log', random_state=42)

# Lists to store evaluation metrics for each fold
auc_scores = []
accuracy_scores = []

for num_folds in num_folds_list:
    print(f"Number of Folds: {num_folds}")

    # Initialise k-fold cross-validation
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

    # Lists to store evaluation metrics for each fold
    fold_auc_scores = []
    fold_accuracy_scores = []

    # Perform k-fold cross-validation
    for train_index, val_index in kf.split(X_train):
        X_fold_train, y_fold_train = X_train.iloc[train_index], y_train.iloc[train_index]
        X_fold_val, y_fold_val = X_train.iloc[val_index], y_train.iloc[val_index]

        # Train the model on the fold training data
        model.fit(X_fold_train, y_fold_train)

        # Predict on the fold validation data
        y_fold_val_pred = model.predict(X_fold_val)

        # Calculate AUC and accuracy for the fold
        fold_auc = roc_auc_score(y_fold_val, y_fold_val_pred)
        fold_accuracy = accuracy_score(y_fold_val, y_fold_val_pred)

        fold_auc_scores.append(fold_auc)
        fold_accuracy_scores.append(fold_accuracy)

    # Calculate average AUC and accuracy for the current number of folds
    avg_auc = sum(fold_auc_scores) / num_folds
    avg_accuracy = sum(fold_accuracy_scores) / num_folds

    # Print average metrics for cross-validation
    print("Average AUC:", avg_auc)
    print("Average Accuracy:", avg_accuracy)
    print("---------------------------------------------")
    
    # Train the model on the entire training data
    model.fit(X_train, y_train)

    # Predict on the test data
    y_test_pred = model.predict(X_test_scaled_80)
    y_train_pred = model.predict(X_train)

    # Calculate AUC and accuracy for test and training data
    train_auc = roc_auc_score(y_train, y_train_pred)
    test_auc = roc_auc_score(y_test_80, y_test_pred)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test_80, y_test_pred)

    # Print metrics for training and test data
    print("Training AUC:", train_auc)
    print("Test AUC:", test_auc)
    print("Training Accuracy:", train_accuracy)
    print("Test Accuracy:", test_accuracy)
    print("=============================================")


Number of Folds: 5
Average AUC: 0.7438552938226421
Average Accuracy: 0.7438165139504831
---------------------------------------------




Training AUC: 0.7441381916219032
Test AUC: 0.7481123499641426
Training Accuracy: 0.7441381916219032
Test Accuracy: 0.6848631291132802
Number of Folds: 10
Average AUC: 0.7430257612809805
Average Accuracy: 0.7430097096972139
---------------------------------------------




Training AUC: 0.7441381916219032
Test AUC: 0.7481123499641426
Training Accuracy: 0.7441381916219032
Test Accuracy: 0.6848631291132802
Number of Folds: 20
Average AUC: 0.7436270408590044
Average Accuracy: 0.7436378667502263
---------------------------------------------
Training AUC: 0.7441381916219032
Test AUC: 0.7481123499641426
Training Accuracy: 0.7441381916219032
Test Accuracy: 0.6848631291132802




#### 70/30 Split

In [35]:
# Splitting the dataset into features (X) and target variable (y)
X_train = resampled_df_70.drop("PolicyIssued", axis=1)
y_train = resampled_df_70["PolicyIssued"]

# Initialise SGD classifier model
model = SGDClassifier(loss='log', random_state=42)

# Fit the model to the training data
model.fit(X_train, y_train)

# Predict on the test data
y_test_pred = model.predict(X_test_scaled_70)
y_train_pred = model.predict(X_train)

# Calculate AUC and accuracy for test and training data
train_auc = roc_auc_score(y_train, y_train_pred)
test_auc = roc_auc_score(y_test_70, y_test_pred)
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test_70, y_test_pred)

# Print metrics for training and test data
print("Training AUC:", train_auc)
print("Test AUC:", test_auc)
print("Training Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)
print("=============================================")


Training AUC: 0.7430440465908825
Test AUC: 0.744983151212081
Training Accuracy: 0.7430440465908825
Test Accuracy: 0.6829733567900973




#### Cross Validation

In [37]:
# Splitting the dataset into features (X) and target variable (y)
X_train = resampled_df_70.drop("PolicyIssued", axis=1)
y_train = resampled_df_70["PolicyIssued"]

# List of k values for k-fold cross-validation
num_folds_list = [5, 10, 20]

# Initialise SGD classifier model
model = SGDClassifier(loss='log', random_state=42)

# Lists to store evaluation metrics for each fold
auc_scores = []
accuracy_scores = []

for num_folds in num_folds_list:
    print(f"Number of Folds: {num_folds}")

    # Initialise k-fold cross-validation
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

    # Lists to store evaluation metrics for each fold
    fold_auc_scores = []
    fold_accuracy_scores = []

    # Perform k-fold cross-validation
    for train_index, val_index in kf.split(X_train):
        X_fold_train, y_fold_train = X_train.iloc[train_index], y_train.iloc[train_index]
        X_fold_val, y_fold_val = X_train.iloc[val_index], y_train.iloc[val_index]

        # Train the model on the fold training data
        model.fit(X_fold_train, y_fold_train)

        # Predict on the fold validation data
        y_fold_val_pred = model.predict(X_fold_val)

        # Calculate AUC and accuracy for the fold
        fold_auc = roc_auc_score(y_fold_val, y_fold_val_pred)
        fold_accuracy = accuracy_score(y_fold_val, y_fold_val_pred)

        fold_auc_scores.append(fold_auc)
        fold_accuracy_scores.append(fold_accuracy)

    # Calculate average AUC and accuracy for the current number of folds
    avg_auc = sum(fold_auc_scores) / num_folds
    avg_accuracy = sum(fold_accuracy_scores) / num_folds

    # Print average metrics for cross-validation
    print("Average AUC:", avg_auc)
    print("Average Accuracy:", avg_accuracy)
    print("---------------------------------------------")
    
    # Train the model on the entire training data
    model.fit(X_train, y_train)

    # Predict on the test data
    y_test_pred = model.predict(X_test_scaled_70)
    y_train_pred = model.predict(X_train)

    # Calculate AUC and accuracy for test and training data
    train_auc = roc_auc_score(y_train, y_train_pred)
    test_auc = roc_auc_score(y_test_70, y_test_pred)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test_70, y_test_pred)

    # Print metrics for training and test data
    print("Training AUC:", train_auc)
    print("Test AUC:", test_auc)
    print("Training Accuracy:", train_accuracy)
    print("Test Accuracy:", test_accuracy)
    print("=============================================")


Number of Folds: 5
Average AUC: 0.7446670782625764
Average Accuracy: 0.7446780083861709
---------------------------------------------




Training AUC: 0.7430440465908825
Test AUC: 0.744983151212081
Training Accuracy: 0.7430440465908825
Test Accuracy: 0.6829733567900973
Number of Folds: 10
Average AUC: 0.7442356886883358
Average Accuracy: 0.7442228423142612
---------------------------------------------




Training AUC: 0.7430440465908825
Test AUC: 0.744983151212081
Training Accuracy: 0.7430440465908825
Test Accuracy: 0.6829733567900973
Number of Folds: 20
Average AUC: 0.7447565050471809
Average Accuracy: 0.7447713889305675
---------------------------------------------
Training AUC: 0.7430440465908825
Test AUC: 0.744983151212081
Training Accuracy: 0.7430440465908825
Test Accuracy: 0.6829733567900973




### Decision Tree

#### 80/20 Split

In [38]:
# Splitting the dataset into features (X) and target variable (y)
X_train = resampled_df_80.drop("PolicyIssued", axis=1)
y_train = resampled_df_80["PolicyIssued"]


# Initialise decision tree classifier model
model = DecisionTreeClassifier()

# Fit the model to the training data
model.fit(X_train, y_train)

# Predict on the test data
y_test_pred = model.predict(X_test_scaled_80)
y_train_pred = model.predict(X_train)

# Calculate AUC and accuracy for test and training data
train_auc = roc_auc_score(y_train, y_train_pred)
test_auc = roc_auc_score(y_test_80, y_test_pred)
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test_80, y_test_pred)

# Print metrics for training and test data
print("Training AUC:", train_auc)
print("Test AUC:", test_auc)
print("Training Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)
print("=============================================")


Training AUC: 0.8993076121811238
Test AUC: 0.699421918125272
Training Accuracy: 0.8993076121811238
Test Accuracy: 0.7392935491050139




#### Cross-Validation

In [39]:
# Splitting the dataset into features (X) and target variable (y)
X_train = resampled_df_80.drop("PolicyIssued", axis=1)
y_train = resampled_df_80["PolicyIssued"]

# Initialise decision tree classifier model
model = DecisionTreeClassifier()

# Lists to store evaluation metrics for each fold
auc_scores = []
accuracy_scores = []

for num_folds in num_folds_list:
    print(f"Number of Folds: {num_folds}")

    # Initialise k-fold cross-validation
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

    # Lists to store evaluation metrics for each fold
    fold_auc_scores = []
    fold_accuracy_scores = []

    # Perform k-fold cross-validation
    for train_index, val_index in kf.split(X_train):
        X_fold_train, y_fold_train = X_train.iloc[train_index], y_train.iloc[train_index]
        X_fold_val, y_fold_val = X_train.iloc[val_index], y_train.iloc[val_index]

        # Train the model on the fold training data
        model.fit(X_fold_train, y_fold_train)

        # Predict on the fold validation data
        y_fold_val_pred = model.predict(X_fold_val)

        # Calculate AUC and accuracy for the fold
        fold_auc = roc_auc_score(y_fold_val, y_fold_val_pred)
        fold_accuracy = accuracy_score(y_fold_val, y_fold_val_pred)

        fold_auc_scores.append(fold_auc)
        fold_accuracy_scores.append(fold_accuracy)

    # Calculate average AUC and accuracy for the current number of folds
    avg_auc = sum(fold_auc_scores) / num_folds
    avg_accuracy = sum(fold_accuracy_scores) / num_folds

    # Print average metrics for cross-validation
    print("Average AUC:", avg_auc)
    print("Average Accuracy:", avg_accuracy)
    print("---------------------------------------------")
    
    # Train the model on the entire training data
    model.fit(X_train, y_train)

    # Predict on the test data
    y_test_pred = model.predict(X_test_scaled_80)
    y_train_pred = model.predict(X_train)

    # Calculate AUC and accuracy for test and training data
    train_auc = roc_auc_score(y_train, y_train_pred)
    test_auc = roc_auc_score(y_test_80, y_test_pred)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test_80, y_test_pred)

    # Print metrics for training and test data
    print("Training AUC:", train_auc)
    print("Test AUC:", test_auc)
    print("Training Accuracy:", train_accuracy)
    print("Test Accuracy:", test_accuracy)
    print("=============================================")


Number of Folds: 5
Average AUC: 0.7750413735274513
Average Accuracy: 0.7750250278425215
---------------------------------------------




Training AUC: 0.8993076121811238
Test AUC: 0.6991559432452247
Training Accuracy: 0.8993076121811238
Test Accuracy: 0.7390392013480431
Number of Folds: 10
Average AUC: 0.7783186258897432
Average Accuracy: 0.7783031364351969
---------------------------------------------




Training AUC: 0.8993076121811238
Test AUC: 0.6997287410098945
Training Accuracy: 0.8993076121811238
Test Accuracy: 0.7396114838012272
Number of Folds: 20
Average AUC: 0.779328682916667
Average Accuracy: 0.7793243308462295
---------------------------------------------
Training AUC: 0.8993076121811238
Test AUC: 0.6996584141276633
Training Accuracy: 0.8993076121811238
Test Accuracy: 0.7391027882872858




#### 70/30 Split

In [41]:
# Splitting the dataset into features (X) and target variable (y)
X_train = resampled_df_70.drop("PolicyIssued", axis=1)
y_train = resampled_df_70["PolicyIssued"]


# Initialise decision tree classifier model
model = DecisionTreeClassifier()

# Fit the model to the training data
model.fit(X_train, y_train)

# Predict on the test data
y_test_pred = model.predict(X_test_scaled_70)
y_train_pred = model.predict(X_train)

# Calculate AUC and accuracy for test and training data
train_auc = roc_auc_score(y_train, y_train_pred)
test_auc = roc_auc_score(y_test_70, y_test_pred)
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test_70, y_test_pred)

# Print metrics for training and test data
print("Training AUC:", train_auc)
print("Test AUC:", test_auc)
print("Training Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)
print("=============================================")


Training AUC: 0.9038771270511893
Test AUC: 0.6940507663423837
Training Accuracy: 0.9038771270511893
Test Accuracy: 0.7337798596833337




#### Cross Validation

In [44]:
# Splitting the dataset into features (X) and target variable (y)
X_train = resampled_df_70.drop("PolicyIssued", axis=1)
y_train = resampled_df_70["PolicyIssued"]

# Initialise decision tree classifier model
model = DecisionTreeClassifier()

# Lists to store evaluation metrics for each fold
auc_scores = []
accuracy_scores = []

for num_folds in num_folds_list:
    print(f"Number of Folds: {num_folds}")

    # Initialise k-fold cross-validation
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

    # Lists to store evaluation metrics for each fold
    fold_auc_scores = []
    fold_accuracy_scores = []

    # Perform k-fold cross-validation
    for train_index, val_index in kf.split(X_train):
        X_fold_train, y_fold_train = X_train.iloc[train_index], y_train.iloc[train_index]
        X_fold_val, y_fold_val = X_train.iloc[val_index], y_train.iloc[val_index]

        # Train the model on the fold training data
        model.fit(X_fold_train, y_fold_train)

        # Predict on the fold validation data
        y_fold_val_pred = model.predict(X_fold_val)

        # Calculate AUC and accuracy for the fold
        fold_auc = roc_auc_score(y_fold_val, y_fold_val_pred)
        fold_accuracy = accuracy_score(y_fold_val, y_fold_val_pred)

        fold_auc_scores.append(fold_auc)
        fold_accuracy_scores.append(fold_accuracy)

    # Calculate average AUC and accuracy for the current number of folds
    avg_auc = sum(fold_auc_scores) / num_folds
    avg_accuracy = sum(fold_accuracy_scores) / num_folds

    # Print average metrics for cross-validation
    print("Average AUC:", avg_auc)
    print("Average Accuracy:", avg_accuracy)
    print("---------------------------------------------")
    
    # Train the model on the entire training data
    model.fit(X_train, y_train)

    # Predict on the test data
    y_test_pred = model.predict(X_test_scaled_70)
    y_train_pred = model.predict(X_train)

    # Calculate AUC and accuracy for test and training data
    train_auc = roc_auc_score(y_train, y_train_pred)
    test_auc = roc_auc_score(y_test_70, y_test_pred)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test_70, y_test_pred)

    # Print metrics for training and test data
    print("Training AUC:", train_auc)
    print("Test AUC:", test_auc)
    print("Training Accuracy:", train_accuracy)
    print("Test Accuracy:", test_accuracy)
    print("=============================================")


Number of Folds: 5
Average AUC: 0.7770668258652903
Average Accuracy: 0.7770710199095345
---------------------------------------------




Training AUC: 0.9038771270511893
Test AUC: 0.6946023755473094
Training Accuracy: 0.9038771270511893
Test Accuracy: 0.7345853027830178
Number of Folds: 10
Average AUC: 0.7800169030557255
Average Accuracy: 0.780017975708694
---------------------------------------------




Training AUC: 0.9038771270511893
Test AUC: 0.6947119195808679
Training Accuracy: 0.9038771270511893
Test Accuracy: 0.7345429110409293
Number of Folds: 20
Average AUC: 0.7807326886508663
Average Accuracy: 0.7807357529894909
---------------------------------------------
Training AUC: 0.9038771270511893
Test AUC: 0.6947930021642316
Training Accuracy: 0.9038771270511893
Test Accuracy: 0.7348820449776383




### Random Forest

#### 80/20 Split

In [46]:
# Splitting the dataset into features (X) and target variable (y)
X_train = resampled_df_80.drop("PolicyIssued", axis=1)
y_train = resampled_df_80["PolicyIssued"]

# Initialise Random Forest classifier model
model = RandomForestClassifier()

# Fit the model to the training data
model.fit(X_train, y_train)

# Predict on the test data
y_test_pred = model.predict(X_test_scaled_80)
y_train_pred = model.predict(X_train)

# Calculate AUC and accuracy for test and training data
train_auc = roc_auc_score(y_train, y_train_pred)
test_auc = roc_auc_score(y_test_80, y_test_pred)
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test_80, y_test_pred)

# Print metrics for training and test data
print("Training AUC:", train_auc)
print("Test AUC:", test_auc)
print("Training Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)
print("=============================================")




Training AUC: 0.8992514450276751
Test AUC: 0.7010449906203091
Training Accuracy: 0.8992514450276751
Test Accuracy: 0.7490859377483865


#### Cross Validation

In [47]:
# Splitting the dataset into features (X) and target variable (y)
X_train = resampled_df_80.drop("PolicyIssued", axis=1)
y_train = resampled_df_80["PolicyIssued"]

# Initialise decision tree classifier model
model = DecisionTreeClassifier()

# Lists to store evaluation metrics for each fold
auc_scores = []
accuracy_scores = []

for num_folds in num_folds_list:
    print(f"Number of Folds: {num_folds}")

    # Initialise k-fold cross-validation
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

    # Lists to store evaluation metrics for each fold
    fold_auc_scores = []
    fold_accuracy_scores = []

    # Perform k-fold cross-validation
    for train_index, val_index in kf.split(X_train):
        X_fold_train, y_fold_train = X_train.iloc[train_index], y_train.iloc[train_index]
        X_fold_val, y_fold_val = X_train.iloc[val_index], y_train.iloc[val_index]

        # Train the model on the fold training data
        model.fit(X_fold_train, y_fold_train)

        # Predict on the fold validation data
        y_fold_val_pred = model.predict(X_fold_val)

        # Calculate AUC and accuracy for the fold
        fold_auc = roc_auc_score(y_fold_val, y_fold_val_pred)
        fold_accuracy = accuracy_score(y_fold_val, y_fold_val_pred)

        fold_auc_scores.append(fold_auc)
        fold_accuracy_scores.append(fold_accuracy)

    # Calculate average AUC and accuracy for the current number of folds
    avg_auc = sum(fold_auc_scores) / num_folds
    avg_accuracy = sum(fold_accuracy_scores) / num_folds

    # Print average metrics for cross-validation
    print("Average AUC:", avg_auc)
    print("Average Accuracy:", avg_accuracy)
    print("---------------------------------------------")
    
    # Train the model on the entire training data
    model.fit(X_train, y_train)

    # Predict on the test data
    y_test_pred = model.predict(X_test_scaled_80)
    y_train_pred = model.predict(X_train)

    # Calculate AUC and accuracy for test and training data
    train_auc = roc_auc_score(y_train, y_train_pred)
    test_auc = roc_auc_score(y_test_80, y_test_pred)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test_80, y_test_pred)

    # Print metrics for training and test data
    print("Training AUC:", train_auc)
    print("Test AUC:", test_auc)
    print("Training Accuracy:", train_accuracy)
    print("Test Accuracy:", test_accuracy)
    print("=============================================")


Number of Folds: 5
Average AUC: 0.775262150360527
Average Accuracy: 0.7752445870141736
---------------------------------------------




Training AUC: 0.8993076121811238
Test AUC: 0.6996688575539618
Training Accuracy: 0.8993076121811238
Test Accuracy: 0.7390392013480431
Number of Folds: 10
Average AUC: 0.7777650235268664
Average Accuracy: 0.777751671843531
---------------------------------------------




Training AUC: 0.8993076121811238
Test AUC: 0.6990324735308797
Training Accuracy: 0.8993076121811238
Test Accuracy: 0.7391663752265285
Number of Folds: 20
Average AUC: 0.7792576982301085
Average Accuracy: 0.7792528491322667
---------------------------------------------
Training AUC: 0.8993076121811238
Test AUC: 0.6993620346693397
Training Accuracy: 0.8993076121811238
Test Accuracy: 0.7387212666518297




#### 70/30 Split

In [49]:
# Splitting the dataset into features (X) and target variable (y)
X_train = resampled_df_70.drop("PolicyIssued", axis=1)
y_train = resampled_df_70["PolicyIssued"]

# Initialise Random Forest classifier model
model = RandomForestClassifier()

# Fit the model to the training data
model.fit(X_train, y_train)

# Predict on the test data
y_test_pred = model.predict(X_test_scaled_70)
y_train_pred = model.predict(X_train)

# Calculate AUC and accuracy for test and training data
train_auc = roc_auc_score(y_train, y_train_pred)
test_auc = roc_auc_score(y_test_70, y_test_pred)
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test_70, y_test_pred)

# Print metrics for training and test data
print("Training AUC:", train_auc)
print("Test AUC:", test_auc)
print("Training Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)
print("=============================================")




Training AUC: 0.9037954296118205
Test AUC: 0.6960452695171477
Training Accuracy: 0.9037954296118205
Test Accuracy: 0.7464125988257487


#### Cross Validation

In [51]:
# Splitting the dataset into features (X) and target variable (y)
X_train = resampled_df_70.drop("PolicyIssued", axis=1)
y_train = resampled_df_70["PolicyIssued"]

# Initialise decision tree classifier model
model = DecisionTreeClassifier()

# Lists to store evaluation metrics for each fold
auc_scores = []
accuracy_scores = []

for num_folds in num_folds_list:
    print(f"Number of Folds: {num_folds}")

    # Initialise k-fold cross-validation
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

    # Lists to store evaluation metrics for each fold
    fold_auc_scores = []
    fold_accuracy_scores = []

    # Perform k-fold cross-validation
    for train_index, val_index in kf.split(X_train):
        X_fold_train, y_fold_train = X_train.iloc[train_index], y_train.iloc[train_index]
        X_fold_val, y_fold_val = X_train.iloc[val_index], y_train.iloc[val_index]

        # Train the model on the fold training data
        model.fit(X_fold_train, y_fold_train)

        # Predict on the fold validation data
        y_fold_val_pred = model.predict(X_fold_val)

        # Calculate AUC and accuracy for the fold
        fold_auc = roc_auc_score(y_fold_val, y_fold_val_pred)
        fold_accuracy = accuracy_score(y_fold_val, y_fold_val_pred)

        fold_auc_scores.append(fold_auc)
        fold_accuracy_scores.append(fold_accuracy)

    # Calculate average AUC and accuracy for the current number of folds
    avg_auc = sum(fold_auc_scores) / num_folds
    avg_accuracy = sum(fold_accuracy_scores) / num_folds

    # Print average metrics for cross-validation
    print("Average AUC:", avg_auc)
    print("Average Accuracy:", avg_accuracy)
    print("---------------------------------------------")
    
    # Train the model on the entire training data
    model.fit(X_train, y_train)

    # Predict on the test data
    y_test_pred = model.predict(X_test_scaled_70)
    y_train_pred = model.predict(X_train)

    # Calculate AUC and accuracy for test and training data
    train_auc = roc_auc_score(y_train, y_train_pred)
    test_auc = roc_auc_score(y_test_70, y_test_pred)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test_70, y_test_pred)

    # Print metrics for training and test data
    print("Training AUC:", train_auc)
    print("Test AUC:", test_auc)
    print("Training Accuracy:", train_accuracy)
    print("Test Accuracy:", test_accuracy)
    print("=============================================")


Number of Folds: 5
Average AUC: 0.7770728294052561
Average Accuracy: 0.7770768499582211
---------------------------------------------




Training AUC: 0.9038771270511893
Test AUC: 0.6945270256156021
Training Accuracy: 0.9038771270511893
Test Accuracy: 0.7346276945251065
Number of Folds: 10
Average AUC: 0.7798484744825107
Average Accuracy: 0.7798487478183584
---------------------------------------------




Training AUC: 0.9038771270511893
Test AUC: 0.6943975182052898
Training Accuracy: 0.9038771270511893
Test Accuracy: 0.7344793234277963
Number of Folds: 20
Average AUC: 0.7808829080087295
Average Accuracy: 0.7808874783040382
---------------------------------------------
Training AUC: 0.9038771270511893
Test AUC: 0.6940973475548091
Training Accuracy: 0.9038771270511893
Test Accuracy: 0.7342249729752645




### LightGBM

#### 80/20 Split

In [52]:
# Splitting the dataset into features (X) and target variable (y)
X_train = resampled_df_80.drop("PolicyIssued", axis=1)
y_train = resampled_df_80["PolicyIssued"]

# Initialise LightGBM classifier model
model = lgb.LGBMClassifier(random_state=42)

# Fit the model to the training data
model.fit(X_train, y_train)

# Predict on the test data
y_test_pred = model.predict(X_test_scaled_80)
y_train_pred = model.predict(X_train)

# Calculate AUC and accuracy for test and training data
train_auc = roc_auc_score(y_train, y_train_pred)
test_auc = roc_auc_score(y_test_80, y_test_pred)
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test_80, y_test_pred)

# Print metrics for training and test data
print("Training AUC:", train_auc)
print("Test AUC:", test_auc)
print("Training Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)
print("=============================================")


Training AUC: 0.7676109556585854
Test AUC: 0.7505888627716325
Training Accuracy: 0.7676109556585854
Test Accuracy: 0.7010142116809207


#### Cross Validation

In [54]:
# Splitting the dataset into features (X) and target variable (y)
X_train = resampled_df_80.drop("PolicyIssued", axis=1)
y_train = resampled_df_80["PolicyIssued"]

# Initialise LightGBM classifier model
model = lgb.LGBMClassifier(random_state=42)

# Lists to store evaluation metrics for each fold
auc_scores = []
accuracy_scores = []

for num_folds in num_folds_list:
    print(f"Number of Folds: {num_folds}")

    # Initialise k-fold cross-validation
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

    # Lists to store evaluation metrics for each fold
    fold_auc_scores = []
    fold_accuracy_scores = []

    # Perform k-fold cross-validation
    for train_index, val_index in kf.split(X_train):
        X_fold_train, y_fold_train = X_train.iloc[train_index], y_train.iloc[train_index]
        X_fold_val, y_fold_val = X_train.iloc[val_index], y_train.iloc[val_index]

        # Initialise LightGBM classifier model
        model = lgb.LGBMClassifier(random_state=42)

        # Train the model on the fold training data
        model.fit(X_fold_train, y_fold_train)

        # Predict on the fold validation data
        y_fold_val_pred = model.predict(X_fold_val)

        # Calculate AUC and accuracy for the fold
        fold_auc = roc_auc_score(y_fold_val, y_fold_val_pred)
        fold_accuracy = accuracy_score(y_fold_val, y_fold_val_pred)

        fold_auc_scores.append(fold_auc)
        fold_accuracy_scores.append(fold_accuracy)

    # Calculate average AUC and accuracy for the current number of folds
    avg_auc = sum(fold_auc_scores) / num_folds
    avg_accuracy = sum(fold_accuracy_scores) / num_folds

    # Print average metrics for cross-validation
    print("Average AUC:", avg_auc)
    print("Average Accuracy:", avg_accuracy)
    print("---------------------------------------------")
    
    # Train the model on the entire training data
    model.fit(X_train, y_train)

    # Predict on the test data
    y_test_pred = model.predict(X_test_scaled_80)
    y_train_pred = model.predict(X_train)

    # Calculate AUC and accuracy for test and training data
    train_auc = roc_auc_score(y_train, y_train_pred)
    test_auc = roc_auc_score(y_test_80, y_test_pred)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test_80, y_test_pred)

    # Print metrics for training and test data
    print("Training AUC:", train_auc)
    print("Test AUC:", test_auc)
    print("Training Accuracy:", train_accuracy)
    print("Test Accuracy:", test_accuracy)
    print("=============================================")


Number of Folds: 5
Average AUC: 0.7648755892156187
Average Accuracy: 0.7648587599266092
---------------------------------------------
Training AUC: 0.7676109556585854
Test AUC: 0.7505888627716325
Training Accuracy: 0.7676109556585854
Test Accuracy: 0.7010142116809207
Number of Folds: 10
Average AUC: 0.7655487824404748
Average Accuracy: 0.7655429778817322
---------------------------------------------
Training AUC: 0.7676109556585854
Test AUC: 0.7505888627716325
Training Accuracy: 0.7676109556585854
Test Accuracy: 0.7010142116809207
Number of Folds: 20
Average AUC: 0.7649567222737803
Average Accuracy: 0.7649404973598922
---------------------------------------------
Training AUC: 0.7676109556585854
Test AUC: 0.7505888627716325
Training Accuracy: 0.7676109556585854
Test Accuracy: 0.7010142116809207


#### 70/30 Split

In [56]:
# Splitting the dataset into features (X) and target variable (y)
X_train = resampled_df_70.drop("PolicyIssued", axis=1)
y_train = resampled_df_70["PolicyIssued"]

# Initialise LightGBM classifier model
model = lgb.LGBMClassifier(random_state=42)

# Fit the model to the training data
model.fit(X_train, y_train)

# Predict on the test data
y_test_pred = model.predict(X_test_scaled_70)
y_train_pred = model.predict(X_train)

# Calculate AUC and accuracy for test and training data
train_auc = roc_auc_score(y_train, y_train_pred)
test_auc = roc_auc_score(y_test_70, y_test_pred)
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test_70, y_test_pred)

# Print metrics for training and test data
print("Training AUC:", train_auc)
print("Test AUC:", test_auc)
print("Training Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)
print("=============================================")


Training AUC: 0.769549030134684
Test AUC: 0.7512973656013996
Training Accuracy: 0.769549030134684
Test Accuracy: 0.7011594141461244


#### Cross Validation

In [57]:
# Splitting the dataset into features (X) and target variable (y)
X_train = resampled_df_70.drop("PolicyIssued", axis=1)
y_train = resampled_df_70["PolicyIssued"]

# Initialise LightGBM classifier model
model = lgb.LGBMClassifier(random_state=42)

# Lists to store evaluation metrics for each fold
auc_scores = []
accuracy_scores = []

for num_folds in num_folds_list:
    print(f"Number of Folds: {num_folds}")

    # Initialise k-fold cross-validation
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

    # Lists to store evaluation metrics for each fold
    fold_auc_scores = []
    fold_accuracy_scores = []

    # Perform k-fold cross-validation
    for train_index, val_index in kf.split(X_train):
        X_fold_train, y_fold_train = X_train.iloc[train_index], y_train.iloc[train_index]
        X_fold_val, y_fold_val = X_train.iloc[val_index], y_train.iloc[val_index]

        # Initialise LightGBM classifier model
        model = lgb.LGBMClassifier(random_state=42)

        # Train the model on the fold training data
        model.fit(X_fold_train, y_fold_train)

        # Predict on the fold validation data
        y_fold_val_pred = model.predict(X_fold_val)

        # Calculate AUC and accuracy for the fold
        fold_auc = roc_auc_score(y_fold_val, y_fold_val_pred)
        fold_accuracy = accuracy_score(y_fold_val, y_fold_val_pred)

        fold_auc_scores.append(fold_auc)
        fold_accuracy_scores.append(fold_accuracy)

    # Calculate average AUC and accuracy for the current number of folds
    avg_auc = sum(fold_auc_scores) / num_folds
    avg_accuracy = sum(fold_accuracy_scores) / num_folds

    # Print average metrics for cross-validation
    print("Average AUC:", avg_auc)
    print("Average Accuracy:", avg_accuracy)
    print("---------------------------------------------")
    
    # Train the model on the entire training data
    model.fit(X_train, y_train)

    # Predict on the test data
    y_test_pred = model.predict(X_test_scaled_70)
    y_train_pred = model.predict(X_train)

    # Calculate AUC and accuracy for test and training data
    train_auc = roc_auc_score(y_train, y_train_pred)
    test_auc = roc_auc_score(y_test_70, y_test_pred)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test_70, y_test_pred)

    # Print metrics for training and test data
    print("Training AUC:", train_auc)
    print("Test AUC:", test_auc)
    print("Training Accuracy:", train_accuracy)
    print("Test Accuracy:", test_accuracy)
    print("=============================================")


Number of Folds: 5
Average AUC: 0.7673676751085572
Average Accuracy: 0.7673607174466086
---------------------------------------------
Training AUC: 0.769549030134684
Test AUC: 0.7512973656013996
Training Accuracy: 0.769549030134684
Test Accuracy: 0.7011594141461244
Number of Folds: 10
Average AUC: 0.767218307971391
Average Accuracy: 0.7672148366650544
---------------------------------------------
Training AUC: 0.769549030134684
Test AUC: 0.7512973656013996
Training Accuracy: 0.769549030134684
Test Accuracy: 0.7011594141461244
Number of Folds: 20
Average AUC: 0.7674007826989743
Average Accuracy: 0.7674132602821345
---------------------------------------------
Training AUC: 0.769549030134684
Test AUC: 0.7512973656013996
Training Accuracy: 0.769549030134684
Test Accuracy: 0.7011594141461244


## Neural Network

#### 8020 Split

Loop over epochs

In [59]:
# Splitting the dataset into features (X) and target variable (y)
X_train = resampled_df_80.drop("PolicyIssued", axis=1)
y_train = resampled_df_80["PolicyIssued"]

# List of epochs to iterate over
epochs_list = [10, 20, 30] 

# Create empty lists to store results
results = []

for epochs in epochs_list:
    print(f" Epochs: {epochs}")

    # Initialise a neural network model
    model = Sequential([
        Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
        Dense(32, activation='relu'),
        Dense(1, activation='sigmoid')
    ])

    # Compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Train the model
    history = model.fit(X_train, y_train, epochs=epochs, batch_size=32, verbose=1, validation_split=0.2)

    # Evaluate the model on the separate test data
    test_loss, test_accuracy = model.evaluate(X_test_scaled_80, y_test_80, verbose=0)

    # Print metrics for test data
    print("Test Loss:", test_loss)
    print("Test Accuracy:", test_accuracy)
    print("=============================================")


 Epochs: 10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 0.4537906050682068
Test Accuracy: 0.7482910752296448
 Epochs: 20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Test Loss: 0.46737080812454224
Test Accuracy: 0.7324579358100891
 Epochs: 30
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30


Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Test Loss: 0.45868125557899475
Test Accuracy: 0.7487680315971375


#### 70/30 Split

In [61]:
# Splitting the dataset into features (X) and target variable (y)
X_train = resampled_df_70.drop("PolicyIssued", axis=1)
y_train = resampled_df_70["PolicyIssued"]

# List of epochs to iterate over
epochs_list = [10, 20, 30] 

# Create empty lists to store results
results = []

for epochs in epochs_list:
    print(f" Epochs: {epochs}")

    # Initialise a neural network model
    model = Sequential([
        Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
        Dense(32, activation='relu'),
        Dense(1, activation='sigmoid')
    ])

    # Compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Train the model
    history = model.fit(X_train, y_train, epochs=epochs, batch_size=32, verbose=1, validation_split=0.2)

    # Evaluate the model on the separate test data
    test_loss, test_accuracy = model.evaluate(X_test_scaled_70, y_test_70, verbose=0)

    # Print metrics for test data
    print("Test Loss:", test_loss)
    print("Test Accuracy:", test_accuracy)
    print("=============================================")


 Epochs: 10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 0.45321905612945557
Test Accuracy: 0.7495284080505371
 Epochs: 20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Test Loss: 0.4600769579410553
Test Accuracy: 0.7498887181282043
 Epochs: 30
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30


Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Test Loss: 0.4540114998817444
Test Accuracy: 0.7569893598556519
