In [2]:
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

class GaussianNaiveBayes:
    """Gaussian Naive Bayes classifier for dense numeric feature matrices.

    Each feature is modelled as an independent normal distribution per class.
    All scoring is done in log space so that points far from a class mean do
    not underflow to a zero likelihood, and a small amount of variance
    smoothing keeps constant (zero-variance) features from producing
    divisions by zero.

    Attributes set by ``fit``:
        classes: sorted array of the distinct labels seen in ``y``.
        parameters: dict mapping each class label to a dict with keys
            ``'mean'``, ``'var'`` (already smoothed) and ``'prior'``.
    """

    def __init__(self, var_smoothing=1e-9):
        """Create an unfitted classifier.

        Args:
            var_smoothing: fraction of the largest overall feature variance
                that is added to every per-class variance for numerical
                stability. Pass 0 to disable smoothing.
        """
        self.var_smoothing = var_smoothing

    def fit(self, X, y):
        """Estimate per-class feature means, variances and class priors.

        Args:
            X: array-like of shape (n_samples, n_features) with numeric values.
            y: array-like of shape (n_samples,) with the class label of each row.

        Returns:
            The fitted classifier itself, so calls can be chained.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)

        # Variance floor: a fraction of the largest variance over the whole
        # training set, so the amount of smoothing scales with the data.
        overall_var = np.var(X, axis=0) if X.shape[0] else np.zeros(0)
        largest_var = float(np.max(overall_var)) if np.size(overall_var) else 0.0
        epsilon = self.var_smoothing * largest_var
        if not epsilon > 0:
            # Every feature is constant (or the variance is undefined); fall
            # back to the raw smoothing value so variances stay positive.
            epsilon = self.var_smoothing

        self.classes = np.unique(y)
        self.parameters = {}
        for c in self.classes:
            X_c = X[y == c]
            self.parameters[c] = {
                'mean': X_c.mean(axis=0),
                'var': X_c.var(axis=0) + epsilon,
                'prior': X_c.shape[0] / X.shape[0]
            }
        return self

    def _calculate_log_likelihood(self, class_id, x):
        """Return the per-feature Gaussian log-density of ``x`` under ``class_id``.

        Computed directly as ``-0.5 * (log(2*pi*var) + (x - mean)^2 / var)``
        rather than as ``log(exp(...))``, so it stays finite for points that
        lie many standard deviations away from the class mean.
        """
        mean = self.parameters[class_id]['mean']
        var = self.parameters[class_id]['var']
        return -0.5 * (np.log(2.0 * np.pi * var) + (x - mean) ** 2 / var)

    def _calculate_likelihood(self, class_id, x):
        """Return the per-feature Gaussian density of ``x`` under ``class_id``.

        Kept for callers that want the plain density; prediction itself uses
        the log-density to avoid underflow.
        """
        return np.exp(self._calculate_log_likelihood(class_id, x))

    def _calculate_posterior(self, x):
        """Return the class label with the highest log-posterior for one sample.

        The score of each class is ``log(prior) + sum(log-likelihoods)``; the
        shared evidence term is omitted because it does not affect the argmax.
        Despite the name, the return value is the winning label, not the
        posterior itself.
        """
        log_posteriors = [
            np.log(self.parameters[c]['prior'])
            + np.sum(self._calculate_log_likelihood(c, x))
            for c in self.classes
        ]
        return self.classes[np.argmax(log_posteriors)]

    def predict(self, X):
        """Predict a class label for every row of ``X``.

        Args:
            X: array-like of shape (n_samples, n_features).

        Returns:
            A list with one predicted label per row, in row order.
        """
        return [self._calculate_posterior(x) for x in np.asarray(X, dtype=float)]

# Configuration: everything a reader might want to tune lives here.
DATA_PATH = 'data/pkl_vector_10k_reviews.pkl'
TEST_SIZE = 0.2      # fraction of rows held out for evaluation
RANDOM_STATE = 42    # fixed seed so the split is reproducible

# Load the dataset.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that come from a trusted source.
df = pd.read_pickle(DATA_PATH)

# Expand each row's `vector` entry into one column per element.
# Assumes every vector has the same length -- TODO confirm against the data.
features = df.vector.to_list()
vector_df = pd.DataFrame(features)

# Prepare features and labels
X = vector_df.values  # Features
y = df.good_rating.values  # Labels

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# Create and train the classifier
gnb = GaussianNaiveBayes()
gnb.fit(X_train, y_train)

# Make predictions and evaluate.
# predict() returns a plain list, so convert it explicitly before the
# element-wise comparison instead of relying on implicit list/array coercion.
y_pred = gnb.predict(X_test)
accuracy = np.mean(np.asarray(y_pred) == y_test)
print(f"Accuracy: {accuracy}")
# Overall accuracy hides per-class behaviour, so also print the per-class report.
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.982

Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.98      0.84        96
           1       1.00      0.98      0.99      1904

    accuracy                           0.98      2000
   macro avg       0.87      0.98      0.91      2000
weighted avg       0.99      0.98      0.98      2000

