# Machine Learning Coursework

Natalie Fernando: 20222466/2312542

In [1]:
#pip install ucimlrepo  (install these packages first if the libraries are not already installed)
#pip install imblearn

 #Import required libraries
from ucimlrepo import fetch_ucirepo
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE

# Preprocessing pipeline for the Bank Marketing dataset.
# Produces: X (encoded feature frame), y_encoded, scaler, X_train, X_test, y_train, y_test.
# The train/test split is done BEFORE oversampling and scaling so that the test
# set contains only real, untouched observations (no SMOTE samples, no test
# statistics inside the scaler).

# Step 1: Fetch the dataset using the ucimlrepo library
bank_marketing = fetch_ucirepo(id=222)  # Bank Marketing Dataset

# Extract features (X) and target (y).
# Work on a copy so the column edits below never write into the fetched dataset object.
X = bank_marketing.data.features.copy()
y = bank_marketing.data.targets

# Display dataset metadata and structure
print("Dataset Metadata:\n", bank_marketing.metadata)
print("\nDataset Variables:\n", bank_marketing.variables)
print("\nFeatures Sample:\n", X.head())
print("\nTarget Sample:\n", y.head())

# Step 2: Handle Missing Values
print("\nMissing Values Check:\n", X.isnull().sum())
# Forward-fill any gaps. `ffill()` replaces the deprecated `fillna(method='ffill')`
# and returns a new frame instead of mutating in place.
# NOTE(review): forward-filling categorical columns copies the previous row's
# category into the gap — confirm this is the intended imputation.
X = X.ffill()

# Step 3: Drop Irrelevant Features
# 'duration' is a leakage feature and must be removed if present
X = X.drop(columns=['duration'], errors='ignore')

# Step 4: Create New Features
# Bin 'age' into categories
X['age_group'] = pd.cut(X['age'], bins=[0, 25, 45, 65, 100], labels=['Young', 'Adult', 'Middle-aged', 'Senior'])
# One-hot encode the new 'age_group' feature
X = pd.get_dummies(X, columns=['age_group'], drop_first=True)

# Step 5: Encode Categorical Variables
# One-hot encode all nominal categorical variables
X = pd.get_dummies(X, drop_first=True)

# Encode the target variable (y) using LabelEncoder.
# The targets come back as a one-column table; flatten to 1-D, which is what
# LabelEncoder expects.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y.values.ravel())

# Step 6: Split the Dataset (80% training, 20% testing), stratified on the
# original class proportions. Splitting first keeps the test set free of
# synthetic samples and of any information from the training transforms.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# Step 7: Handle Class Imbalance
# Apply SMOTE to the TRAINING portion only
smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X_train_raw, y_train_raw)
print("\nBalanced Class Distribution:\n", pd.Series(y_balanced).value_counts())

# Step 8: Rescale Data
# Standardize features for Neural Network compatibility. The scaler learns its
# mean/std from the training data and is then re-used unchanged on the test set.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_balanced)
X_train, y_train = X_scaled, y_balanced
X_test = scaler.transform(X_test_raw)

# Final shapes
print("\nFinal Training and Testing Data Shapes:")
print(f"X_train: {X_train.shape}, X_test: {X_test.shape}")
print(f"y_train: {y_train.shape}, y_test: {y_test.shape}")

FileNotFoundError: [Errno 2] No such file or directory: 'path_to_dataset/bank-additional-full.csv'

RANDOM FOREST USING 50 TREES

In [None]:
import numpy as np
from collections import Counter

class RandomForest:
    """Bagging ensemble of ``DecisionTree`` classifiers combined by majority vote.

    Each tree is trained on a bootstrap sample (drawn with replacement) of the
    training rows; predictions are the most common label across trees.

    NOTE(review): ``DecisionTree`` is not defined in this section of the
    notebook; it must already exist in the kernel and accept ``max_depth`` and
    provide ``fit(X, y)`` / ``predict(X)``.

    Parameters
    ----------
    n_estimators : int
        Number of trees to train.
    max_depth : int
        Passed to every ``DecisionTree``.
    sample_size : int or None
        Rows drawn per bootstrap sample; ``None`` (or 0) means "as many as the
        training set".
    random_state : int or None
        Seed for the bootstrap sampling. ``None`` keeps the draws random;
        an integer makes ``fit`` reproducible.
    """

    def __init__(self, n_estimators=10, max_depth=10, sample_size=None, random_state=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.sample_size = sample_size
        self.random_state = random_state
        self.trees = []

    def fit(self, X, y):
        """Train ``n_estimators`` trees, each on its own bootstrap sample.

        Raises
        ------
        ValueError
            If the training set is empty or X and y disagree on the row count.
        """
        # Positional fancy indexing below needs arrays; DataFrames/lists would
        # otherwise be indexed by label or fail outright.
        X = np.asarray(X)
        y = np.asarray(y).ravel()
        if len(y) == 0:
            raise ValueError("Cannot fit RandomForest on an empty training set.")
        if len(X) != len(y):
            raise ValueError(
                f"X and y have different numbers of rows: {len(X)} != {len(y)}"
            )

        rng = np.random.default_rng(self.random_state)
        n_draw = self.sample_size or len(y)

        self.trees = []
        for _ in range(self.n_estimators):
            idxs = rng.choice(len(y), n_draw, replace=True)
            tree = DecisionTree(max_depth=self.max_depth)
            tree.fit(X[idxs], y[idxs])
            self.trees.append(tree)

    def predict(self, X):
        """Return the per-sample majority vote over all trees.

        Also prints every tree's raw predictions and a single "Yes"/"No"
        summary, which is the most common label among the per-sample votes.

        Raises
        ------
        ValueError
            If called before ``fit`` (no trees available).
        """
        if not self.trees:
            raise ValueError("RandomForest has no trained trees; call fit() first.")

        # Collect predictions from all trees -> one row per tree
        tree_preds = np.array([tree.predict(X) for tree in self.trees])

        # Display each tree's prediction
        print("\nPredictions from each tree:")
        for i, tree_pred in enumerate(tree_preds, start=1):
            print(f"Tree {i}: {tree_pred}")

        # Majority voting for each sample (column-wise over the trees)
        majority_predictions = np.apply_along_axis(
            lambda votes: Counter(votes).most_common(1)[0][0], axis=0, arr=tree_preds
        )

        # Most common label across ALL samples, reported as "Yes" (1) or "No"
        overall_majority = Counter(majority_predictions).most_common(1)[0][0]
        final_decision = "Yes" if overall_majority == 1 else "No"

        # Print the final outcome message
        print(f"\nFinal Prediction Based on Majority Voting: {final_decision}")
        return majority_predictions


# Train the custom forest (50 trees, depth capped at 5) on the training split
# and predict the test split.
rf = RandomForest(n_estimators=50, max_depth=5)  # Using 50 trees
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# The evaluation cells further down iterate over `rf_models`
# (number of trees -> trained forest); register this forest so they can run.
rf_models = {rf.n_estimators: rf}


Neural Network with one hidden layer containing 10 neurons

In [None]:
import numpy as np

class NeuralNetwork:
    """Feed-forward network with ONE hidden layer and sigmoid activations.

    ``hidden_size`` is the number of neurons in that single hidden layer.
    Training is full-batch: every epoch runs one forward and one backward
    pass over all rows. Updates are summed over the batch (not averaged), so
    the effective step size grows with the number of training rows.

    Parameters
    ----------
    input_size : int
        Number of input features.
    hidden_size : int
        Number of neurons in the hidden layer.
    output_size : int
        Number of output units.
    learning_rate : float
        Multiplier applied to every weight/bias update.
    """

    def __init__(self, input_size, hidden_size, output_size, learning_rate=0.01):
        self.learning_rate = learning_rate
        self.weights_input_hidden = np.random.randn(input_size, hidden_size)
        self.weights_hidden_output = np.random.randn(hidden_size, output_size)
        self.bias_hidden = np.zeros((1, hidden_size))
        self.bias_output = np.zeros((1, output_size))

    def sigmoid(self, z):
        """Logistic function, with the input clipped so ``exp`` cannot overflow."""
        return 1 / (1 + np.exp(-np.clip(z, -500, 500)))

    def sigmoid_derivative(self, z):
        """Sigmoid derivative expressed in terms of the sigmoid OUTPUT ``z``."""
        return z * (1 - z)

    def forward(self, X):
        """Run a forward pass; caches both layer activations and returns the output."""
        self.hidden_layer = self.sigmoid(np.dot(X, self.weights_input_hidden) + self.bias_hidden)
        self.output_layer = self.sigmoid(np.dot(self.hidden_layer, self.weights_hidden_output) + self.bias_output)
        return self.output_layer

    def backward(self, X, y, output):
        """Back-propagate the error of ``output`` against ``y`` and update all parameters.

        ``y`` may be 1-D or already shaped like ``output``. It is reshaped to
        ``output.shape`` because subtracting a 1-D label vector from an
        ``(n, 1)`` output would broadcast to an ``(n, n)`` matrix and break
        the matrix products below.
        """
        y = np.asarray(y, dtype=float).reshape(output.shape)

        error = y - output
        output_delta = error * self.sigmoid_derivative(output)
        hidden_error = np.dot(output_delta, self.weights_hidden_output.T)
        hidden_delta = hidden_error * self.sigmoid_derivative(self.hidden_layer)

        # Update weights and biases (error is target - output, hence "+=")
        self.weights_hidden_output += np.dot(self.hidden_layer.T, output_delta) * self.learning_rate
        self.weights_input_hidden += np.dot(X.T, hidden_delta) * self.learning_rate
        self.bias_output += np.sum(output_delta, axis=0, keepdims=True) * self.learning_rate
        self.bias_hidden += np.sum(hidden_delta, axis=0, keepdims=True) * self.learning_rate

    def train(self, X, y, epochs=1000):
        """Run ``epochs`` full-batch forward/backward passes over ``(X, y)``."""
        for epoch in range(epochs):
            output = self.forward(X)
            self.backward(X, y, output)

    def predict(self, X):
        """Return 0/1 predictions (output thresholded at 0.5), same shape as the output layer.

        Also prints every sample's decision with its raw output, plus the most
        common predicted label across all samples as "Yes"/"No".
        """
        output = self.forward(X)
        predictions = (output > 0.5).astype(int)

        # Display outputs for each sample
        print("\nOutputs for each sample from Neural Network:")
        for i, pred in enumerate(output.flatten(), start=1):
            print(f"Sample {i}: {'Yes' if pred > 0.5 else 'No'} (Raw Output: {pred:.4f})")

        # Majority decision for all samples
        overall_majority = Counter(predictions.flatten()).most_common(1)[0][0]
        final_decision = "Yes" if overall_majority == 1 else "No"

        # Print the final outcome message
        print(f"\nFinal Prediction Based on Majority Decision: {final_decision}")
        return predictions

# Example Usage
# X_train / y_train / X_test come from the preprocessing cell (NumPy arrays).
# hidden_size=10 builds ONE hidden layer with 10 neurons.
nn = NeuralNetwork(input_size=X_train.shape[1], hidden_size=10, output_size=1)
# The network's output has shape (n, 1). Pass the labels as a column too:
# a 1-D label vector would broadcast against the output into an (n, n) matrix
# inside backward() and break training.
nn.train(X_train, np.asarray(y_train).reshape(-1, 1), epochs=1000)
y_pred_nn = nn.predict(X_test)


# Model evaluation

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt

# Function to plot the confusion matrix
def plot_confusion_matrix(y_true, y_pred, title="Confusion Matrix"):
    """Show the confusion matrix of ``y_pred`` against ``y_true`` as an annotated heatmap.

    Both axes are labelled "No"/"Yes"; the y-axis is the true class and the
    x-axis the predicted class. The figure is displayed and nothing is returned.
    """
    class_names = ["No", "Yes"]
    counts = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(6, 5))
    axes = sns.heatmap(
        counts,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=class_names,
        yticklabels=class_names,
    )
    axes.set_title(title)
    axes.set_xlabel("Predicted")
    axes.set_ylabel("True")
    plt.show()

# Dictionaries to store evaluation metrics, keyed by number of trees
classification_reports = {}
accuracies = {}

# Evaluate each Random Forest model (trees trained in the custom implementation).
# `rf_models` must map a tree count to an already-trained forest; it is not
# created in this cell.
for n_trees, rf in rf_models.items():
    print(f"\nEvaluating Random Forest with {n_trees} trees...")

    # Predict on the held-out split produced by train_test_split
    y_pred_rf = rf.predict(X_test)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred_rf)
    accuracies[n_trees] = accuracy
    print(f"Accuracy for {n_trees} trees: {accuracy:.4f}")

    # Generate and store classification report
    report = classification_report(y_test, y_pred_rf, target_names=["No", "Yes"])
    classification_reports[n_trees] = report
    print(f"\nClassification Report for {n_trees} trees:")
    print(report)

    # Plot confusion matrix
    plot_confusion_matrix(y_test, y_pred_rf, title=f"Confusion Matrix for {n_trees} trees")

# Display overall accuracies
print("\nRandom Forest Accuracies by Number of Trees:")
for n_trees, acc in accuracies.items():
    print(f"{n_trees} Trees: {acc:.4f}")


In [None]:
from sklearn.metrics import classification_report, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

# Function to plot the confusion matrix
def plot_confusion_matrix(y_true, y_pred, title="Confusion Matrix (Test Data)"):
    """Render an annotated "No"/"Yes" confusion-matrix heatmap for the given labels.

    The y-axis carries the true class, the x-axis the predicted class.
    Displays the figure; returns nothing.
    """
    tick_names = ["No", "Yes"]
    matrix = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(6, 5))
    heat_ax = sns.heatmap(
        matrix,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=tick_names,
        yticklabels=tick_names,
    )
    heat_ax.set_title(title)
    heat_ax.set_xlabel("Predicted")
    heat_ax.set_ylabel("True")
    plt.show()

# Dictionaries to store test results, keyed by number of trees
test_rf_results = {}
test_classification_reports = {}

# Evaluate each Random Forest model on the test data.
# `rf_models` must map a tree count to an already-trained forest; it is not
# created in this cell.
for n_trees, rf in rf_models.items():
    print(f"\nEvaluating Random Forest with {n_trees} trees on Test Data...")

    # Make predictions on the held-out test split
    y_pred_test = rf.predict(X_test)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred_test)
    test_rf_results[n_trees] = accuracy

    # Generate and store classification report
    report = classification_report(y_test, y_pred_test, target_names=["No", "Yes"])
    test_classification_reports[n_trees] = report

    # Display results
    print(f"Test Results for {n_trees} trees:")
    print(f"Accuracy: {accuracy:.2f}")
    print(report)

    # Plot confusion matrix
    plot_confusion_matrix(y_test, y_pred_test, title=f"Confusion Matrix for {n_trees} trees (Test Data)")

# Display overall test accuracies
print("\nOverall Test Accuracies for Random Forest:")
for n_trees, acc in test_rf_results.items():
    print(f"{n_trees} Trees: {acc:.2f}")


Testing the model with new data for the Random Forest

In [None]:
import numpy as np
import pandas as pd

# Generate new test data (10 new instances).
# A fixed seed makes the synthetic instances reproducible, so different models
# can be compared on the same rows.
N_NEW_INSTANCES = 10
new_data_rng = np.random.default_rng(42)

# Numeric features: (low, high) with `high` exclusive, as in np.random.randint
numeric_ranges = {
    'age': (18, 70),
    'balance': (-1000, 5000),
    'day': (1, 31),
    'campaign': (1, 10),
    'pdays': (-1, 300),
    'previous': (0, 10),
}

# One-hot style indicator columns, each drawn independently as 0/1.
# NOTE(review): independent draws can switch on several categories of the same
# variable at once (e.g. two jobs); acceptable for a smoke test only.
binary_columns = [
    'job_blue-collar', 'job_entrepreneur', 'job_housemaid', 'job_management',
    'job_retired', 'job_self-employed', 'job_services', 'job_student',
    'job_technician', 'job_unemployed', 'job_unknown',
    'marital_married', 'marital_single',
    'education_secondary', 'education_tertiary', 'education_unknown',
    'default_yes', 'housing_yes', 'loan_yes',
    'contact_telephone', 'contact_unknown',
    'month_aug', 'month_dec', 'month_feb', 'month_jan', 'month_jul', 'month_jun',
    'month_mar', 'month_may', 'month_nov', 'month_oct', 'month_sep',
    'poutcome_other', 'poutcome_success', 'poutcome_unknown',
]

new_data = {
    name: new_data_rng.integers(low, high, N_NEW_INSTANCES)
    for name, (low, high) in numeric_ranges.items()
}
new_data.update(
    {name: new_data_rng.integers(0, 2, N_NEW_INSTANCES) for name in binary_columns}
)

# Convert the new data into a DataFrame
new_data_df = pd.DataFrame(new_data)

# Derive the age-group indicators exactly as the preprocessing step does
# (same bins and labels; 'Young' is the dropped reference category).
new_age_group = pd.cut(
    new_data_df['age'], bins=[0, 25, 45, 65, 100],
    labels=['Young', 'Adult', 'Middle-aged', 'Senior'],
)
for age_label in ['Adult', 'Middle-aged', 'Senior']:
    new_data_df[f'age_group_{age_label}'] = (new_age_group == age_label).astype(int)

# Align with the training feature layout: same columns, same order.
# Columns the training frame has but the new data lacks are filled with 0;
# columns unknown to the training frame are dropped.
new_data_df = new_data_df.reindex(columns=X.columns, fill_value=0)

# Rescale the new data using the previously fitted scaler
new_data_scaled = scaler.transform(new_data_df)

# Store predictions for all Random Forest configurations
new_predictions = {}

for n_trees, rf in rf_models.items():  # rf_models contains trained RF models
    # Predict the outcomes for the new data
    y_pred_new = rf.predict(new_data_scaled)
    new_predictions[n_trees] = y_pred_new
    print(f"\nPredictions for {n_trees} trees:")
    print(y_pred_new)

# Display predictions for all configurations
print("\nSummary of Predictions for New Data:")
for n_trees, predictions in new_predictions.items():
    print(f"{n_trees} Trees: {predictions}")


In [None]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Function to plot the confusion matrix
def plot_confusion_matrix(y_true, y_pred, title="Confusion Matrix (Validation Set)"):
    """Plot the confusion matrix for ``y_true`` vs ``y_pred`` as a blue annotated heatmap.

    Tick labels are "No"/"Yes" on both axes (x: predicted, y: true).
    Shows the figure and returns nothing.
    """
    labels = ["No", "Yes"]
    cm_counts = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(6, 5))
    cm_ax = sns.heatmap(
        cm_counts,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=labels,
        yticklabels=labels,
    )
    cm_ax.set_title(title)
    cm_ax.set_xlabel("Predicted")
    cm_ax.set_ylabel("True")
    plt.show()

# Dictionaries to store validation results, keyed by number of trees
validation_rf_results = {}
validation_classification_reports = {}

# Evaluate each Random Forest model on the test data (acting as the validation set).
# `rf_models` must map a tree count to an already-trained forest; it is not
# created in this cell.
for n_trees, rf in rf_models.items():
    print(f"\nEvaluating Random Forest with {n_trees} trees on Validation Set...")

    # Make predictions on the test data (validation set)
    y_pred_validation = rf.predict(X_test)

    # Calculate accuracy
    accuracy_validation = accuracy_score(y_test, y_pred_validation)
    validation_rf_results[n_trees] = accuracy_validation

    # Generate and store classification report
    report = classification_report(y_test, y_pred_validation, target_names=["No", "Yes"])
    validation_classification_reports[n_trees] = report

    # Display results
    print(f"Validation Results for {n_trees} trees:")
    print(f"Accuracy: {accuracy_validation:.2f}")
    print(report)

    # Plot confusion matrix
    plot_confusion_matrix(y_test, y_pred_validation, title=f"Confusion Matrix for {n_trees} trees (Validation)")

# Display overall validation accuracies
print("\nOverall Validation Accuracies for Random Forest:")
for n_trees, acc in validation_rf_results.items():
    print(f"{n_trees} Trees: {acc:.2f}")


In [None]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Function to plot the confusion matrix
def plot_confusion_matrix(y_true, y_pred, title="Confusion Matrix"):
    """Display a "No"/"Yes" confusion-matrix heatmap with integer cell annotations.

    True classes are on the y-axis, predicted classes on the x-axis.
    Nothing is returned; the figure is shown directly.
    """
    yes_no = ["No", "Yes"]
    cm_values = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(6, 5))
    plot_ax = sns.heatmap(
        cm_values,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=yes_no,
        yticklabels=yes_no,
    )
    plot_ax.set_title(title)
    plot_ax.set_xlabel("Predicted")
    plot_ax.set_ylabel("True")
    plt.show()

# List of (hidden size, accuracy, classification report) tuples
nn_results = []

# Train and evaluate the custom Neural Network with varying hidden sizes.
# Each value is the number of neurons in the network's single hidden layer
# (the printed messages call this "hidden layers").
for hidden_layers in [2, 4, 8, 10, 16, 20, 25]:
    print(f"\nTraining Neural Network with {hidden_layers} hidden layers...")

    # Create and train the Neural Network model.
    # Labels are passed as an (n, 1) column to match the network's output shape;
    # a 1-D vector would broadcast into an (n, n) error matrix during training.
    nn = NeuralNetwork(input_size=X_train.shape[1], hidden_size=hidden_layers, output_size=1)
    nn.train(X_train, y_train.reshape(-1, 1), epochs=1000)

    # Predict using the trained Neural Network
    y_pred_nn = nn.predict(X_test).flatten()

    # Calculate accuracy
    accuracy_nn = accuracy_score(y_test, y_pred_nn)

    # Generate classification report
    report_nn = classification_report(y_test, y_pred_nn, target_names=["No", "Yes"])

    # Store results
    nn_results.append((hidden_layers, accuracy_nn, report_nn))

    # Display results
    print(f"Results for {hidden_layers} hidden layers:")
    print(f"Accuracy: {accuracy_nn:.2f}")
    print(report_nn)

    # Confusion Matrix for each model
    plot_confusion_matrix(y_test, y_pred_nn, title=f"Confusion Matrix for {hidden_layers} hidden layers (NN)")

# Display the results summary for Neural Network
nn_results_summary = {result[0]: result[1] for result in nn_results}
print("\nSummary of Results for Neural Network:")
for hidden_layers, accuracy in nn_results_summary.items():
    print(f"{hidden_layers} Hidden Layers: Accuracy = {accuracy:.2f}")


In [None]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Function to plot the confusion matrix
def plot_confusion_matrix(y_true, y_pred, title="Confusion Matrix (Test Data)"):
    """Show an annotated heatmap of the confusion matrix of ``y_pred`` vs ``y_true``.

    Axes use the class names "No"/"Yes" (x: predicted, y: true).
    The function only displays the figure.
    """
    names = ["No", "Yes"]
    cm_table = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(6, 5))
    grid_ax = sns.heatmap(
        cm_table,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=names,
        yticklabels=names,
    )
    grid_ax.set_title(title)
    grid_ax.set_xlabel("Predicted")
    grid_ax.set_ylabel("True")
    plt.show()

# Dictionaries to store test results, keyed by hidden size
test_nn_results = {}
test_nn_classification_reports = {}

# Evaluate the custom Neural Network with different hidden sizes on the test data.
# Each value is the number of neurons in the network's single hidden layer
# (the printed messages call this "hidden layers").
for hidden_layers in [2, 4, 8, 10, 16, 20, 25]:
    print(f"\nEvaluating Neural Network with {hidden_layers} hidden layers on test data...")

    # Create and train the Neural Network model.
    # Labels are passed as an (n, 1) column to match the network's output shape;
    # a 1-D vector would broadcast into an (n, n) error matrix during training.
    nn = NeuralNetwork(input_size=X_train.shape[1], hidden_size=hidden_layers, output_size=1)
    nn.train(X_train, y_train.reshape(-1, 1), epochs=1000)

    # Predict using the trained Neural Network
    y_pred_nn_test = nn.predict(X_test).flatten()

    # Calculate accuracy
    accuracy_test_nn = accuracy_score(y_test, y_pred_nn_test)
    test_nn_results[hidden_layers] = accuracy_test_nn

    # Generate classification report
    report_nn_test = classification_report(y_test, y_pred_nn_test, target_names=["No", "Yes"])
    test_nn_classification_reports[hidden_layers] = report_nn_test

    # Display results
    print(f"Test Results for {hidden_layers} hidden layers:")
    print(f"Accuracy: {accuracy_test_nn:.2f}")
    print(report_nn_test)

    # Confusion Matrix for test data
    plot_confusion_matrix(y_test, y_pred_nn_test, title=f"Confusion Matrix for {hidden_layers} hidden layers (NN - Test)")

# Display overall test results
print("\nOverall Test Accuracies for Neural Network:")
for hidden_layers, acc in test_nn_results.items():
    print(f"{hidden_layers} Hidden Layers: {acc:.2f}")


Testing the model with new data for the NN

In [None]:
import numpy as np
import pandas as pd

# Generate new data (10 new instances).
# A fixed seed makes the synthetic instances reproducible, so different models
# can be compared on the same rows.
N_NEW_INSTANCES = 10
new_data_rng = np.random.default_rng(42)

# Numeric features: (low, high) with `high` exclusive, as in np.random.randint
numeric_ranges = {
    'age': (18, 70),
    'balance': (-1000, 5000),
    'day': (1, 31),
    'campaign': (1, 10),
    'pdays': (-1, 300),
    'previous': (0, 10),
}

# One-hot style indicator columns, each drawn independently as 0/1.
# NOTE(review): independent draws can switch on several categories of the same
# variable at once (e.g. two jobs); acceptable for a smoke test only.
binary_columns = [
    'job_blue-collar', 'job_entrepreneur', 'job_housemaid', 'job_management',
    'job_retired', 'job_self-employed', 'job_services', 'job_student',
    'job_technician', 'job_unemployed', 'job_unknown',
    'marital_married', 'marital_single',
    'education_secondary', 'education_tertiary', 'education_unknown',
    'default_yes', 'housing_yes', 'loan_yes',
    'contact_telephone', 'contact_unknown',
    'month_aug', 'month_dec', 'month_feb', 'month_jan', 'month_jul', 'month_jun',
    'month_mar', 'month_may', 'month_nov', 'month_oct', 'month_sep',
    'poutcome_other', 'poutcome_success', 'poutcome_unknown',
]

new_data = {
    name: new_data_rng.integers(low, high, N_NEW_INSTANCES)
    for name, (low, high) in numeric_ranges.items()
}
new_data.update(
    {name: new_data_rng.integers(0, 2, N_NEW_INSTANCES) for name in binary_columns}
)

# Convert the new data into a DataFrame
new_data_df = pd.DataFrame(new_data)

# Derive the age-group indicators exactly as the preprocessing step does
# (same bins and labels; 'Young' is the dropped reference category), instead
# of zero-filling them, which would mark every instance as 'Young'.
new_age_group = pd.cut(
    new_data_df['age'], bins=[0, 25, 45, 65, 100],
    labels=['Young', 'Adult', 'Middle-aged', 'Senior'],
)
for age_label in ['Adult', 'Middle-aged', 'Senior']:
    new_data_df[f'age_group_{age_label}'] = (new_age_group == age_label).astype(int)

# Ensure the new data has the same feature structure as the training data:
# missing training columns are added as 0, unknown columns are dropped, and the
# column order is made identical to the training frame.
new_data_df = new_data_df.reindex(columns=X.columns, fill_value=0)

# Rescale the new data using the previously fitted scaler
new_data_scaled = scaler.transform(new_data_df)

# Store predictions for all hidden-size configurations
new_predictions_nn = {}

# Each value is the number of neurons in the network's single hidden layer
# (the printed messages call this "hidden layers").
for hidden_layers in [2, 4, 8, 10, 16, 20, 25]:
    print(f"\nGenerating predictions for Neural Network with {hidden_layers} hidden layers...")

    # Create and train the Neural Network model.
    # Labels are passed as an (n, 1) column to match the network's output shape;
    # a 1-D vector would broadcast into an (n, n) error matrix during training.
    nn = NeuralNetwork(input_size=X_train.shape[1], hidden_size=hidden_layers, output_size=1)
    nn.train(X_train, y_train.reshape(-1, 1), epochs=1000)

    # Predict the outcomes for the new data
    y_pred_new_nn = nn.predict(new_data_scaled).flatten()
    new_predictions_nn[hidden_layers] = y_pred_new_nn

    # Display the predictions for the current configuration
    print(f"Predictions for {hidden_layers} hidden layers:")
    print(y_pred_new_nn)

# Display all predictions
print("\nSummary of Predictions for New Data:")
for hidden_layers, predictions in new_predictions_nn.items():
    print(f"{hidden_layers} Hidden Layers: {predictions}")


Evaluation During Validation

In [None]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Function to plot the confusion matrix
def plot_confusion_matrix(y_true, y_pred, title="Confusion Matrix (Validation Set)"):
    """Draw the ``y_true`` vs ``y_pred`` confusion matrix as an annotated "Blues" heatmap.

    Both axes are ticked "No"/"Yes"; x is the predicted class and y the true
    class. The figure is shown and no value is returned.
    """
    outcome_names = ["No", "Yes"]
    cm_grid = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(6, 5))
    hm_ax = sns.heatmap(
        cm_grid,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=outcome_names,
        yticklabels=outcome_names,
    )
    hm_ax.set_title(title)
    hm_ax.set_xlabel("Predicted")
    hm_ax.set_ylabel("True")
    plt.show()

# Dictionaries to store validation results, keyed by hidden size
validation_nn_results = {}
validation_nn_classification_reports = {}

# Evaluate the custom Neural Network with different hidden sizes.
# Each value is the number of neurons in the network's single hidden layer
# (the printed messages call this "hidden layers").
for hidden_layers in [2, 4, 8, 10, 16, 20, 25]:
    print(f"\nEvaluating Neural Network with {hidden_layers} hidden layers on validation (test) data...")

    # Create and train the Neural Network model.
    # Labels are passed as an (n, 1) column to match the network's output shape;
    # a 1-D vector would broadcast into an (n, n) error matrix during training.
    nn = NeuralNetwork(input_size=X_train.shape[1], hidden_size=hidden_layers, output_size=1)
    nn.train(X_train, y_train.reshape(-1, 1), epochs=1000)

    # Predict using the trained Neural Network
    y_pred_nn_val = nn.predict(X_test).flatten()

    # Calculate accuracy
    accuracy_nn_val = accuracy_score(y_test, y_pred_nn_val)
    validation_nn_results[hidden_layers] = accuracy_nn_val

    # Generate classification report
    report_nn_val = classification_report(y_test, y_pred_nn_val, target_names=["No", "Yes"])
    validation_nn_classification_reports[hidden_layers] = report_nn_val

    # Display results
    print(f"Validation Results for {hidden_layers} hidden layers:")
    print(f"Accuracy: {accuracy_nn_val:.2f}")
    print(report_nn_val)

    # Confusion Matrix for validation data
    plot_confusion_matrix(y_test, y_pred_nn_val, title=f"Confusion Matrix for {hidden_layers} hidden layers (NN - Validation)")

# Display overall validation results
print("\nOverall Validation Accuracies for Neural Network:")
for hidden_layers, acc in validation_nn_results.items():
    print(f"{hidden_layers} Hidden Layers: {acc:.2f}")


# A/B Testing for the RF and NN

Need to compare how each model performs on the new values. Since we don't have true labels for the new data, we can approach this in the following ways:

Consistency: Compare whether both models (Random Forest and MLP) make similar predictions. If they predict similarly, the models might be agreeing on the classification of the new data.
Diversity of Predictions: Compare how often the models disagree, which can give us insight into their relative strengths and weaknesses.

Group A- RF
Group B- NN

In [None]:
# A/B comparison: how often do the two models give the same label for the
# same new instance? The two lists below are hard-coded example predictions.

# Group A - Random Forest (50 trees)
rf_predictions_new = [0, 1, 0, 1, 1, 1, 0, 0, 0, 0]  # Replace with actual predictions from RF model

# Group B - Neural Network (10 hidden layers)
nn_predictions_new = [0, 0, 0, 0, 1, 0, 0, 0, 0, 0]  # Replace with actual predictions from NN model

# Pair the predictions up position by position and count the matches;
# everything that is not a match is a disagreement.
total_comparisons = len(rf_predictions_new)
agreement_count = sum(
    1
    for rf_label, nn_label in zip(rf_predictions_new, nn_predictions_new)
    if rf_label == nn_label
)
disagreement_count = total_comparisons - agreement_count

# Express both counts as a percentage of all compared instances
agreement_percentage = (agreement_count / total_comparisons) * 100
disagreement_percentage = (disagreement_count / total_comparisons) * 100

# Display the results
print("\nComparison of Predictions:")
print(f"Agreement: {agreement_percentage:.2f}%")
print(f"Disagreement: {disagreement_percentage:.2f}%")
