# Setup and Imports

In [27]:
import numpy as np
import pandas as pd

# # Although all the of the respective custom implementations are provided in this notebook; however the seperate modules are provided from where the custom models can be imported
# from custom_cross_validation_for_probabilistic_classification_algorithms import *
# from custom_probabilistic_classification_algorithms import *

# Model Implementation(s)

### > (Gaussian) Bayes Classifier

In [28]:
class GaussianBayes:
    ''' Implements the Gaussian Bayes For Classification without assuming feature independence. '''

    def __init__(self):
        self.means = {}
        self.covariances = {}
        self.priors = {}
        self.classes = []

    def train(self, X, Y):
        ''' Train the multiclass (or Binary) Bayes Rule using the given 
            X [m x n] data matrix and Y labels matrix'''

        # Getting the unique classes
        self.classes = np.unique(Y)
        m, n = X.shape

        for c in self.classes:
            # Selecting the data points belonging to class c
            X_c = X[Y == c]

            # Calculating the mean for each feature in class c
            self.means[c] = np.mean(X_c, axis=0)
            
            # Calculating the covariance matrix for class c
            self.covariances[c] = np.cov(X_c, rowvar=False)
            
            # Calculating the prior probability for class c
            self.priors[c] = X_c.shape[0] / m

    def test(self, X):
        ''' Run the trained classifiers on the given set of examples 
            For each example, you should return probability and its assigned class
            Input: X of m x d
            Output:
            pclasses: predicted class of each example
            probabilities: probability of each example falling in that predicted class...
        '''

        m, d = X.shape
        pclasses = []
        probabilities = np.zeros(m)

        for i in range(m):
            # Initializing max probability and class
            max_prob = -1
            best_class = None
            
            for c in self.classes:
                mean = self.means[c]
                cov = self.covariances[c]
                prior = self.priors[c]
                
                # Calculating the multivariate Gaussian probability P(X|C)
                cov_inv = np.linalg.inv(cov)  # Inverse of covariance matrix
                cov_det = np.linalg.det(cov)  # Determinant of covariance matrix
                diff = X[i] - mean
                
                likelihood = (1 / np.sqrt((2 * np.pi) ** d * cov_det)) * \
                             np.exp(-0.5 * np.dot(np.dot(diff.T, cov_inv), diff))
                
                # Calculating the posterior probability P(C|X)
                posterior = likelihood * prior
                
                # Updating max probability and class
                if posterior > max_prob:
                    max_prob = posterior
                    best_class = c
            
            # Assigning the best class and its probability
            pclasses.append(best_class)
            probabilities[i] = max_prob
        
        return pclasses, probabilities

    def predict(self, X):
        ''' Predicts the class for each example in X '''
        
        return self.test(X)[0]

### > (Gaussian) Naive Bayes Classifier

In [29]:
class GaussianNaiveBayes:
    ''' Implements the Gaussian Naive Bayes for Classification '''
    
    def __init__(self):
        self.means = {}
        self.variances = {}
        self.priors = {}
        self.classes = []
    
    def train(self, X, Y):
        ''' Train the Gaussian Naive Bayes model using X (features) and Y (labels) '''
        
        # Getting unique classes from the dataset
        self.classes = np.unique(Y)
        m, n = X.shape
        
        for c in self.classes:
            # Selecting data points for class c
            X_c = X[Y == c]

            # Calculating mean and variance for each feature in class c
            self.means[c] = np.mean(X_c, axis=0)
            self.variances[c] = np.var(X_c, axis=0)

            # Calculating prior probability for class c
            self.priors[c] = X_c.shape[0] / m
        
    def test(self, X):
        ''' Test the Gaussian Naive Bayes model on input data X
            Returns the predicted class and probability for each example '''
        
        m, d = X.shape
        pclasses = []
        probabilities = np.zeros(m)

        for i in range(m):
            max_prob = -1
            best_class = None
            
            for c in self.classes:
                # Fetching mean, variance, and prior for class c
                mean = self.means[c]
                var = self.variances[c]
                prior = self.priors[c]
                
                # Calculating the Gaussian likelihood for each feature
                likelihood = np.prod(
                    (1 / np.sqrt(2 * np.pi * var)) * 
                    np.exp(-((X[i] - mean) ** 2) / (2 * var))
                )
                
                # Calculating the posterior probability P(C|X)
                posterior = likelihood * prior
                
                # Checking if this is the best class so far
                if posterior > max_prob:
                    max_prob = posterior
                    best_class = c
            
            # Storing the best class and its probability
            pclasses.append(best_class)
            probabilities[i] = max_prob
        
        return pclasses, probabilities
    
    def predict(self, X):
        ''' Predicts the class for each example in X '''
        
        return self.test(X)[0]

### > KNN Classifier

In [30]:
class KNearestNeighbors:
    ''' Implements the K-Nearest Neighbors (KNN) algorithm for classification '''

    def __init__(self, k=3):
        self.k = k
        self.X_train = None
        self.y_train = None

    def train(self, X, Y):
        ''' Store the training data for the KNN classifier '''

        self.X_train = X
        self.y_train = Y

    def euclidean_distance(self, x1, x2):
        ''' Calculate the Euclidean distance between two points '''
        
        return np.sqrt(np.sum((x1 - x2) ** 2))

    def classify(self, X_test_instance):
        ''' Classify a single test instance using the KNN algorithm '''

        # Computing all Euclidean distances between X_test_instance and X_train
        distances = np.array([self.euclidean_distance(X_test_instance, x_train_instance) 
                              for x_train_instance in self.X_train])

        # Finding the indices of the k smallest distances
        k_nearest_indices = np.argsort(distances)[:self.k]

        # Getting the labels of the k nearest neighbors
        k_nearest_labels = self.y_train[k_nearest_indices]

        # Performing majority voting
        unique_labels, label_counts = np.unique(k_nearest_labels, return_counts=True)

        # Returning the label with the highest count (vote)
        majority_vote_label = unique_labels[np.argmax(label_counts)]

        return majority_vote_label

    def test(self, X):
        ''' Predict the class for each instance in X '''

        predictions = [self.classify(X_test_instance) for X_test_instance in X]

        return np.array(predictions)

    def predict(self, X):
        ''' Alias for the test method '''

        return self.test(X)


### _Utility Functions_

In [31]:
def train_test_split(X, y, split_ratio):
    ''' Split the data into training and testing sets based on the split ratio '''

    total_number_of_samples = len(X)
    train_len = int(total_number_of_samples * split_ratio)
    X_train = X[:train_len]
    X_test = X[train_len:]
    y_train = y[:train_len]
    y_test = y[train_len:]
    
    return X_train, X_test, y_train, y_test

def load_dataset(file_name, extension):
    '''Load the dataset using pandas based on the file extension'''

    if extension == 'csv':
        return pd.read_csv(file_name)
    elif extension == 'xlsx':
        return pd.read_excel(file_name)
    elif extension == 'json':
        return pd.read_json(file_name)
    elif extension == 'html':
        return pd.read_html(file_name)[0]
    else:
        print("Unsupported file extension. Please provide a valid format.")
    
        return None

### _Driver Code_

In [32]:
if __name__ == "__main__":
    print("\n> Step 1: Upload the dataset")
    extension = input("Please, input the file extension (csv, xlsx, json, html): ").strip().lower()
    file_name = input("\nEnter the dataset file name (no need to include the extension): ")
    dataset = load_dataset(f"{file_name}.{extension}", extension)
    
    if dataset is None:
        print("\nFailed to load the dataset. Exiting the program.")

        exit()
    
    print("\nDataset loaded successfully.\n")
    print("> Step 2: Split the data into training and testing sets")
    split_ratio = float(input("\nEnter the train-test split ratio (0.0 to 1.0): ").strip())
    
    if not (0.0 < split_ratio < 1.0):
        print("\nInvalid split ratio. It must be between 0.0 and 1.0.")

        exit()
    
    # Assuming last column is the target column
    X = dataset.iloc[:, 1:-1].values
    y = dataset.iloc[:, -1].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, split_ratio)
    
    print("\nData has been split successfully.\n")
    print("> Step 3: Choose a probabilistic model for classification")
    print(">> 1. Gaussian Bayes Classifier")
    print(">> 2. Gaussian Naive Bayes Classifier")
    print(">> 3. K-Nearest Neighbors Classifier")
    choice = int(input("\nEnter the number corresponding to your choice: ").strip())
    
    if choice == 1:
        print("\nTraining and testing Gaussian Bayes Classifier...")
        model = GaussianBayes()
        
        model.train(X_train, y_train)
        y_pred, _ = model.predict(X_test)

        print (f"\nAccuracy: {np.sum(y_pred==y_test)/float(y_test.shape[0])}")
    elif choice == 2:
        print("\nTraining and testing Gaussian Naive Bayes Classifier...")
        model = GaussianNaiveBayes()

        model.train(X_train, y_train)
        y_pred, _ = model.predict(X_test)

        print (f"\nAccuracy: {np.sum(y_pred==y_test)/float(y_test.shape[0])}")
    elif choice == 3:
        k = int(input("\nEnter the value of k for KNN: ").strip())
        print("\nTraining and testing K-Nearest Neighbors Classifier...")
        model = KNearestNeighbors(k)
        
        model.train(X_train, y_train)
        y_pred = model.predict(X_test)

        accuracy = np.mean(y_pred == y_test)

        print(f"\nAccuracy: {accuracy}")
    else:
        print("Invalid choice. Exiting the program.")
        
        exit()


> Step 1: Upload the dataset

Dataset loaded successfully.

> Step 2: Split the data into training and testing sets

Data has been split successfully.

> Step 3: Choose a probabilistic model for classification
>> 1. Gaussian Bayes Classifier
>> 2. Gaussian Naive Bayes Classifier
>> 3. K-Nearest Neighbors Classifier

Training and testing Gaussian Bayes Classifier...

Accuracy: 0.2222222222222222


### _For, Testing and Debugging_

In [33]:
iris_dataset = pd.read_csv("Iris.csv")

X = iris_dataset.iloc[:, 1:-1].values
y = iris_dataset.iloc[:, -1].values

X_train, X_test, y_train, y_test = train_test_split(X, y, 0.7)

# model = GaussianBayes()
# model = GaussianNaiveBayes()
model = KNearestNeighbors()

model.train(X_train, y_train)

# y_pred, _ = model.test(X_test)
y_pred = model.test(X_test)

# print (f"Accuracy: {np.sum(y_pred==y_test)/float(y_test.shape[0])}")
print(f"Accuracy: {(np.mean(y_pred == y_test)):.2f}")

Accuracy: 0.62


# Cross Validation Implementation

### > (Simple) K-fold Cross Validation

In [34]:
class KFoldCV:

    def __init__(self, k=5):
        self.k = k

    def split(self, X, y):
        """
        Splits the data into k folds.
        """

        num_samples = X.shape[0]
        indices = np.arange(num_samples)
        np.random.shuffle(indices)
        fold_size = num_samples // self.k
        folds = []
        
        for i in range(self.k):
            start_index = i * fold_size
            end_index = start_index + fold_size if i != self.k - 1 else num_samples
            fold_indices = indices[start_index:end_index]
            folds.append(fold_indices)

        return folds

    def train_test_split(self, X, y, fold):
        """
        Splits the data into training and testing sets based on the fold.
        """

        test_indices = fold
        train_indices = np.array([i for i in range(X.shape[0]) if i not in test_indices])

        X_train, y_train = X[train_indices], y[train_indices]
        X_test, y_test = X[test_indices], y[test_indices]

        return X_train, y_train, X_test, y_test

    def cross_validate(self, X, y, model):
        """
        Performs k-fold cross-validation on the given model.
        """
        folds = self.split(X, y)
        scores = []

        for fold in folds:
            X_train, y_train, X_test, y_test = self.train_test_split(X, y, fold)
            model.train(X_train, y_train)
            predictions = model.predict(X_test)
            scores.append(np.mean(predictions == y_test))

        return np.mean(scores), np.std(scores)

### > Stratified K-fold Cross Validation

In [35]:
class StratifiedKFoldCV:

    def __init__(self, k=5):
        self.k = k

    def split(self, X, y):
        """
        Splits the data into k stratified folds, ensuring that each fold has
        a similar distribution of class labels as the original dataset.
        """
    
        num_samples = X.shape[0]
        unique_classes, class_counts = np.unique(y, return_counts=True)
        class_indices = {cls: np.where(y == cls)[0] for cls in unique_classes}
        
        # Shuffling class indices to randomize the data within each class
        for cls in unique_classes:
            np.random.shuffle(class_indices[cls])
        
        folds = [[] for _ in range(self.k)]
        
        # Distributing samples across folds while maintaining class proportions
        for cls in unique_classes:
            cls_indices = class_indices[cls]
            fold_size = len(cls_indices) // self.k
            
            for i in range(self.k):
                start_index = i * fold_size
                end_index = start_index + fold_size if i != self.k - 1 else len(cls_indices)
                fold_indices = cls_indices[start_index:end_index]
                folds[i].extend(fold_indices)
        
        folds = [np.array(fold) for fold in folds]
        
        return folds

    def train_test_split(self, X, y, fold):
        """
        Splits the data into training and testing sets based on the fold.
        """
        
        test_indices = fold
        train_indices = np.array([i for i in range(X.shape[0]) if i not in test_indices])

        X_train, y_train = X[train_indices], y[train_indices]
        X_test, y_test = X[test_indices], y[test_indices]

        return X_train, y_train, X_test, y_test

    def cross_validate(self, X, y, model):
        """
        Performs stratified k-fold cross-validation on the given model.
        """
        
        folds = self.split(X, y)
        scores = []

        for fold in folds:
            X_train, y_train, X_test, y_test = self.train_test_split(X, y, fold)
            model.train(X_train, y_train)
            predictions = model.predict(X_test)
            scores.append(np.mean(predictions == y_test))

        return np.mean(scores), np.std(scores)

### _Driver Code_

In [36]:
if __name__ == "__main__":
    print("> Step 5: Choose the cross-validation method")
    cv_method = input("Enter 'k' for K-Fold or 's' for Stratified K-Fold: ").strip().lower()

    k = int(input("\nEnter the number of folds (k): "))

    if cv_method == 'k':
        cv = KFoldCV(k)
    elif cv_method == 's':
        cv = StratifiedKFoldCV(k)
    else:
        print("\nInvalid cross-validation method selected.")

        exit()

    print("\n> Step 6: Performing cross-validation...")
    mean_score, std_score = cv.cross_validate(X, y, model)

    print(f"\nCross-validation results:\n> Mean accuracy: {mean_score:.4f}\n> Standard deviation: {std_score:.4f}")

> Step 5: Choose the cross-validation method

> Step 6: Performing cross-validation...

Cross-validation results:
> Mean accuracy: 0.9600
> Standard deviation: 0.0163


# Comparitive Analysis of the Custom and _Sklearn_ Based Implementations

In [37]:
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

### _Utility Functions_ 

In [50]:
def run_custom_cv(model, cv, X, y):
    """
    Run cross-validation with a custom model and custom CV.
    """
    mean_score, std_score = cv.cross_validate(X, y, model)
    
    return mean_score, std_score

def run_sklearn_cv(model, cv, X, y):
    """
    Run cross-validation using sklearn's cross-validation methods.
    """

    scores = []
    
    for train_index, test_index in cv.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        scores.append(np.mean(predictions == y_test))

    return np.mean(scores), np.std(scores)

def compare_classifiers(custom_model, sklearn_model, custom_cv, sklearn_cv, X, y):
    """
    Compare custom and sklearn classifiers using both custom and sklearn CV.
    """
    
    print("\n> Step 5: Comparative Analysis Results:\n")
    
    print(">> Classification Reports:")
    print("Custom Model Classification Report:")
    custom_model.train(X, y)
    y_pred_custom = np.array(custom_model.predict(X))  # Assuming custom model is trained on full dataset
    print(metrics.classification_report(y, y_pred_custom))

    print("\nSklearn Model Classification Report:")
    sklearn_model.fit(X, y)  # Train sklearn model on full dataset
    y_pred_sklearn = sklearn_model.predict(X)
    print(metrics.classification_report(y, y_pred_sklearn))

    # # For, testing
    # print(type(y_pred_custom))
    # print(type(y_pred_sklearn))
    # print(y_pred_custom.shape)
    # print(y_pred_sklearn.shape)

    print("\n>> Cross Validation Results:")
    custom_mean, custom_std = run_custom_cv(custom_model, custom_cv, X, y)
    print(f"Custom Model - Custom CV: Mean Score = {custom_mean:.4f}, Std Score = {custom_std:.4f}")

    sklearn_mean, sklearn_std = run_sklearn_cv(sklearn_model, sklearn_cv, X, y)
    print(f"Sklearn Model - Sklearn CV: Mean Score = {sklearn_mean:.4f}, Std Score = {sklearn_std:.4f}")


In [51]:
if __name__ == "__main__":
    print("\n> Step 1: Upload the dataset")
    extension = input("Please, input the file extension (csv, xlsx, json, html): ").strip().lower()
    file_name = input("\nEnter the dataset file name (no need to include the extension): ")
    dataset = load_dataset(f"{file_name}.{extension}", extension)
    
    if dataset is None:
        print("\nFailed to load the dataset. Exiting the program.")

        exit()
    
    print("\nDataset loaded successfully.\n")
    print("> Step 2: Split the data into training and testing sets")
    split_ratio = float(input("\nEnter the train-test split ratio (0.0 to 1.0): ").strip())
    
    if not (0.0 < split_ratio < 1.0):
        print("\nInvalid split ratio. It must be between 0.0 and 1.0.")

        exit()
    
    # Assuming last column is the target column
    X = dataset.iloc[:, 1:-1].values
    y = dataset.iloc[:, -1].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, split_ratio)
    
    print("\nData has been split successfully.\n")

    print("> Step 3: Choose the cross-validation method")
    cv_method = input("Enter 'k' for K-Fold or 's' for Stratified K-Fold: ").strip().lower()
    k = int(input("\nEnter the number of folds (k): "))

    if cv_method == 'k':
        custom_cv = KFoldCV(k)
        sklearn_cv = KFold(n_splits=k, shuffle=True, random_state=42)
    elif cv_method == 's':
        custom_cv = StratifiedKFoldCV(k)
        sklearn_cv = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)
    else:
        print("\nInvalid cross-validation method selected.")

        exit()

    print("> Step 4: Choose a probabilistic model for classification")
    print(">> 1. Gaussian Bayes Classifier")
    print(">> 2. Gaussian Naive Bayes Classifier")
    print(">> 3. K-Nearest Neighbors Classifier")
    choice = int(input("\nEnter the number corresponding to your choice: ").strip())

    if choice == 1:
        custom_model = GaussianNaiveBayes()
        sklearn_model = GaussianNB()
    elif choice == 2:
        custom_model = n_neighbors = int(input("Enter the number of neighbors for KNN: "))
        KNearestNeighbors(k)
        sklearn_model = KNeighborsClassifier(n_neighbors=n_neighbors)
    else:
        print("\nInvalid classifier selected.")

        exit()

    compare_classifiers(custom_model, sklearn_model, custom_cv, sklearn_cv, X, y)



> Step 1: Upload the dataset

Dataset loaded successfully.

> Step 2: Split the data into training and testing sets

Data has been split successfully.

> Step 3: Choose the cross-validation method
> Step 4: Choose a probabilistic model for classification
>> 1. Gaussian Bayes Classifier
>> 2. Gaussian Naive Bayes Classifier
>> 3. K-Nearest Neighbors Classifier

> Step 5: Comparative Analysis Results:

>> Classification Reports:
Custom Model Classification Report:
                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        50
Iris-versicolor       0.94      0.94      0.94        50
 Iris-virginica       0.94      0.94      0.94        50

       accuracy                           0.96       150
      macro avg       0.96      0.96      0.96       150
   weighted avg       0.96      0.96      0.96       150


Sklearn Model Classification Report:
                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.