In [20]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score

In [12]:
from ipynb.fs.full.NaiveBayes import GaussianNaiveBayes, MultinomialNaiveBayes, BernoulliNaiveBayes

In [13]:
# Synthetic sanity check: a 500-sample, 5-feature classification problem,
# split 80/20 with a fixed seed so the run is repeatable.
X, y = make_classification(n_features=5, n_samples=500, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Fit the Gaussian Naïve Bayes model and predict the held-out rows.
gnb = GaussianNaiveBayes()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

# Report the first ten predictions together with the overall accuracy.
accuracy = accuracy_score(y_test, y_pred)
print("Gaussian Naïve Bayes Predictions:", y_pred[:10])
print("Gaussian Naïve Bayes Accuracy:", accuracy)


Gaussian Naïve Bayes Predictions: [0 1 0 1 0 1 1 0 0 1]
Gaussian Naïve Bayes Accuracy: 0.92


# Real World Data

In [43]:
# Load the raw spam dataset; the file is decoded as latin-1.
df = pd.read_csv(
    filepath_or_buffer="../../data/spam.csv",
    encoding='latin-1',
)
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [44]:
# Keep only the label and text columns under descriptive names.
# Renaming first and selecting afterwards makes the cell safe to re-run:
# rename() ignores missing keys by default, so a second execution (when the
# columns are already 'label'/'message') is a no-op instead of a KeyError.
# The explicit copy() detaches the result from the raw frame so that later
# column assignments on `df` do not raise SettingWithCopyWarning.
df = (
    df.rename(columns={'v1': 'label', 'v2': 'message'})
    [['label', 'message']]
    .copy()
)
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [45]:
# Encode the string labels as integers: ham -> 0, spam -> 1.
# The 0/1 identity entries make the cell safe to re-run; without them a second
# execution would map the already-encoded integers to NaN.
label_codes = {'ham': 0, 'spam': 1, 0: 0, 1: 1}
encoded_labels = df['label'].map(label_codes)

# Any value outside the mapping becomes NaN; fail loudly rather than passing
# missing labels on to the train/test split and the classifiers.
if encoded_labels.isna().any():
    unexpected = df.loc[encoded_labels.isna(), 'label'].unique().tolist()
    raise ValueError(f"Unexpected label values: {unexpected}")

# assign() builds a new frame instead of writing into `df`, which may be a slice
# of the raw frame and would otherwise trigger SettingWithCopyWarning.
df = df.assign(label=encoded_labels)
df.head()

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [46]:
# Hold out 20% of the messages for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['message'],
    df['label'],
    random_state=42,
    test_size=0.2,
)

In [47]:
# Turn the raw messages into word-count features (binary=False keeps frequencies).
# The vectorizer is fitted on the training messages and then reused, unchanged,
# to transform the test messages.
vectorizer = CountVectorizer(binary=False)
X_train_counts = vectorizer.fit_transform(raw_documents=X_train)
X_test_counts = vectorizer.transform(raw_documents=X_test)

In [48]:
# Fit Multinomial Naïve Bayes on the count features and label the held-out messages.
# The sparse count matrices are converted to dense arrays before being passed in.
mnb = MultinomialNaiveBayes()
mnb.fit(X_train_counts.toarray(), y_train.to_numpy())
y_pred_mnb = mnb.predict(X_test_counts.toarray())

In [49]:
# Score the multinomial model against the held-out labels.
accuracy_mnb = accuracy_score(y_true=y_test, y_pred=y_pred_mnb)
print(f"✅ Multinomial Naïve Bayes Accuracy: {accuracy_mnb:.4f}")

✅ Multinomial Naïve Bayes Accuracy: 0.9839


In [50]:
# Binary presence/absence features for Bernoulli NB (binary=True).
# A separate vectorizer is fitted on the training messages and reused for the test set.
vectorizer_binary = CountVectorizer(binary=True)
X_train_bin = vectorizer_binary.fit_transform(raw_documents=X_train)
X_test_bin = vectorizer_binary.transform(raw_documents=X_test)


In [51]:
# Fit Bernoulli Naïve Bayes on the binary features and label the held-out messages.
# The sparse matrices are converted to dense arrays before being passed in.
bnb = BernoulliNaiveBayes()
bnb.fit(X_train_bin.toarray(), y_train.to_numpy())
y_pred_bnb = bnb.predict(X_test_bin.toarray())


In [52]:
# Score the Bernoulli model against the held-out labels.
accuracy_bnb = accuracy_score(y_true=y_test, y_pred=y_pred_bnb)
print(f"✅ Bernoulli Naïve Bayes Accuracy: {accuracy_bnb:.4f}")

✅ Bernoulli Naïve Bayes Accuracy: 0.9749
