In [3]:
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA  # Import PCA from sklearn

# Load the MNIST dataset
mnist = fetch_openml("mnist_784", parser='auto')

# Change data type from pandas DataFrame to numpy arrays
X, y = mnist.data.values, mnist.target.values

# Binarize the data (if needed)
X_binary = (X > 127).astype(int)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_binary, y, test_size=0.2, random_state=13)





In [13]:
# Apply PCA to reduce the dimensionality of the data
n_components = 50# Adjust the number of components as needed
pca = PCA(n_components=n_components)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)


In [14]:

# Create and train your Bernoulli Naive Bayes model (you can use your existing code for this)
#6. make a Bernoulli-Naive-Base Model Class
class BernoulliNBWithLog:
    def fit(self, X, y):
        self.classes = np.unique(y)
        ## prior probability
        # Calculate and store the log of class priors - the likelihood of each class occurring
        self.class_priors = np.array([np.log(np.mean(y == c)) for c in self.classes])
        
        ## conditional probability
        # calculate the likelihood of a set of features (the feature vector) given a class
        self.feature_probs = []
        for c in self.classes:
            feature_prob = (X[y == c].sum(axis=0)) / (np.sum(y == c))
            # to avoid extreme value, set the scope using clip function
            self.feature_probs.append(np.log(np.clip(feature_prob, 1e-10, 1.0 - 1e-10)))
        self.feature_probs = np.array(self.feature_probs)

    def predict(self, X):
        ## posterior probability
        # calculate matrix dot prodcut to relfect relation between class and features
        log_likelihoods = np.dot(X, self.feature_probs.T)
        log_posteriors = log_likelihoods + self.class_priors
        predicted_class = self.classes[np.argmax(log_posteriors, axis=1)]
            # 2D NumPy array : axis=0 > vertical axis (columns) /  axis=1 > horizontal axis (rows).
        return predicted_class

# Initialize and train the model
model = BernoulliNBWithLog()
model.fit(X_train_pca, y_train)

# Evaluate the model using the test data
y_pred = model.predict(X_test_pca)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.3f}%".format(accuracy * 100))

Accuracy: 74.264%
