In [None]:
import warnings
warnings.filterwarnings('ignore')

**Precision, Recall, F1-score Formula:**

The F1-score is the harmonic mean of precision and recall:

$$
F_1 = 2 \times \frac{\text{Precision} \times \text{Recall}}{\text{Precision} + \text{Recall}}
$$

Where:
$$
\text{Precision} = \frac{TP}{TP + FP}
$$

$$
\text{Recall} = \frac{TP}{TP + FN}
$$


- $TP$ = True Positives
- $FP$ = False Positives
- $FN$ = False Negatives

# Categorical Naive Bayes

| Rec. ID | Age    | Income | Student | Credit_Rating | Buy_Computer |
|---------|--------|--------|---------|---------------|--------------|
| 1       | Young  | High   | No      | Fair          | No           |
| 2       | Young  | High   | No      | Excellent     | No           |
| 3       | Medium | High   | No      | Fair          | Yes          |
| 4       | Old    | Medium | No      | Fair          | Yes          |
| 5       | Old    | Low    | Yes     | Fair          | Yes          |
| 6       | Old    | Low    | Yes     | Excellent     | No           |
| 7       | Medium | Low    | Yes     | Excellent     | Yes          |
| 8       | Young  | Medium | No      | Fair          | No           |
| 9       | Young  | Low    | Yes     | Fair          | Yes          |
| 10      | Old    | Medium | Yes     | Fair          | Yes          |
| 11      | Young  | Medium | Yes     | Excellent     | Yes          |
| 12      | Medium | Medium | No      | Excellent     | Yes          |
| 13      | Medium | High   | Yes     | Fair          | Yes          |
| 14      | Old    | Medium | No      | Excellent     | No           |

Given the features
- Age
- Income
- Student
- Credit_Rating

Predict
- Buy_Computer

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import CategoricalNB
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import accuracy_score


# The 14-record "Buy_Computer" table from the markdown above, stored column-wise:
# position i in every list is record i + 1 of the table.
data = {
    'Age': ['Young', 'Young', 'Medium', 'Old', 'Old', 'Old', 'Medium', 'Young', 'Young', 'Old', 'Young', 'Medium', 'Medium', 'Old'],
    'Income': ['High', 'High', 'High', 'Medium', 'Low', 'Low', 'Low', 'Medium', 'Low', 'Medium', 'Medium', 'Medium', 'High', 'Medium'],
    # Kept in the table's record order; note that records 12 and 13 are 'No' then 'Yes'.
    'Student': ['No', 'No', 'No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No'],
    'Credit_Rating': ['Fair', 'Excellent', 'Fair', 'Fair', 'Fair', 'Excellent', 'Excellent', 'Fair', 'Fair', 'Fair', 'Excellent', 'Excellent', 'Fair', 'Excellent'],
    'Buy_Computer': ['No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No']
}
df = pd.DataFrame(data)

# Encode every feature as integer codes. The category lists are spelled out so that the
# code assigned to each value is fixed by its position here (e.g. Young=0, Medium=1, Old=2)
# instead of being derived from the sorted unique values.
encoder = OrdinalEncoder(categories=[['Young', 'Medium', 'Old'], ['Low', 'Medium', 'High'], ['No', 'Yes'], ['Fair', 'Excellent']])
X = encoder.fit_transform(df[['Age', 'Income', 'Student', 'Credit_Rating']])
# Binary target: 0 = 'No', 1 = 'Yes'.
y = df['Buy_Computer'].map({'No': 0, 'Yes': 1})

# Print each encoded record next to its target as a quick check of the encoding.
for features, target in zip(X, y):
    print(f"Features: {features}, Target: {target}")

In [None]:
# Hold out 20% of the records for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Give the model the full number of categories per feature, taken from the fitted encoder.
# By default CategoricalNB infers that number from the training split alone, so a category
# that only occurs in the held-out rows (or in a new sample) could not be scored at
# prediction time.
n_categories = [len(categories) for categories in encoder.categories_]
model = CategoricalNB(min_categories=n_categories)
model.fit(X_train, y_train)

y_train_pred = model.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
print(f'Training Accuracy: {train_accuracy * 100:.2f}%')

y_test_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f'Testing Accuracy: {test_accuracy * 100:.2f}%')

# Predict for a new sample. It is wrapped in a DataFrame carrying the same column names
# the encoder was fitted on, so the values are matched to features by name.
new_sample = pd.DataFrame(
    [['Young', 'Medium', 'Yes', 'Fair']],
    columns=['Age', 'Income', 'Student', 'Credit_Rating'],
)
new_data = encoder.transform(new_sample)
prediction = model.predict(new_data)
print(f"Prediction: {'Yes' if prediction[0] == 1 else 'No'}")

# Gaussian Naive Bayes

In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report


# Load the iris dataset that ships with scikit-learn.
iris = load_iris()
# X columns: sepal length, sepal width, petal length, petal width
# y values:  0 = setosa, 1 = versicolour, 2 = virginica
X, y = iris.data, iris.target

# Show only the first sample, one measurement per line, followed by its class.
features, target = X[0], y[0]
measurement_names = ("sepal length", "sepal width", "petal length", "petal width")
print("Features:")
for measurement, value in zip(measurement_names, features):
    print(f"-{measurement}: {value}")
print(f"Target: {target} - {iris.target_names[target]}")

In [None]:
# Keep 30% of the samples aside for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit a Gaussian Naive Bayes classifier on the training portion.
model = GaussianNB().fit(X_train, y_train)

# Score the model on the data it was fitted on and on the held-out data.
y_train_pred, y_test_pred = model.predict(X_train), model.predict(X_test)
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f'Training Accuracy: {train_accuracy * 100:.2f}%')
print(f'Testing Accuracy: {test_accuracy * 100:.2f}%')
report = classification_report(y_test, y_test_pred, target_names=iris.target_names)
print("Classification Report:\n", report)

# Predict for a new sample (values follow the same column order as X).
new_data = [[4.9, 3.5, 1.6, 0.2]]
prediction = model.predict(new_data)
predicted_class = prediction[0]
print(f"Prediction: {iris.target_names[predicted_class]}")

# Multinomial Naive Bayes

**Laplace Smoothing Formula:**

Laplace smoothing is used in Naive Bayes classification to handle zero probabilities by adding a small constant $\alpha$ (typically 1) to every word count.

$$
P(w_i \mid C) = \frac{\text{count}(w_i, C) + \alpha}{\sum_{j=1}^{|V|} \text{count}(w_j, C) + \alpha \times |V|}
$$

Where:
- $P(w_i \mid C)$ = Probability of word $w_i$ given class $C$
- $\text{count}(w_i, C)$ = Number of times word $w_i$ appears in class $C$
- $\alpha$ = Smoothing parameter (commonly set to 1 for Laplace smoothing)
- $|V|$ = Vocabulary size (total unique words)
- $\sum_{j=1}^{|V|} \text{count}(w_j, C)$ = Total count of all words in class $C$

Laplace smoothing ensures that every word in the vocabulary has a nonzero probability under every class, even if it never appears in that class's training examples. Without it, a single unseen word would make the product of word probabilities zero and rule the class out entirely, regardless of what the other words suggest.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline


# Toy corpus for the text-classification demo: ten spam messages followed by ten ham
# (legitimate) messages.
spam_messages = [
    "Congratulations! You've won a $1000 Walmart gift card. Go to http://bit.ly/123456 to claim now.",
    "URGENT! Your mobile number has won £2000 cash! Call 09061701461 now.",
    "Free entry in 2 a weekly competition to win FA Cup final tkts. Text FA to 87121 to receive entry question(std txt rate)",
    "You have been selected to receive a free iPhone. Click here to claim: http://freeiphone.com",
    "Win a guaranteed $1000 cash or a $2000 prize. To claim, call 09050000327.",
    "Get Viagra now at a discount! No prescription needed. Visit http://meds4you.com",
    "Exclusive offer! Buy 1 get 1 free on all items. Shop now at http://shopnow.com",
    "You have a new voicemail. Call 1234567890 to listen.",
    "Your account has been compromised. Reset your password at http://secure-login.com",
    "Earn $5000 per week from home. Ask me how!",
]
ham_messages = [
    "Hey, are we still meeting for dinner tonight?",
    "Don't forget to bring your umbrella. It's going to rain.",
    "Can you pick up some milk on your way home?",
    "Happy Birthday! Hope you have a great day!",
    "I'll call you when I get off work.",
    "Just finished my workout. Feeling great!",
    "Let's catch up soon. It's been a while!",
    "I'm running late. Be there in 10 minutes.",
    "Thanks for your help earlier. I really appreciate it.",
    "Good luck on your exam tomorrow!",
]

# Spam first, then ham; the labels are built in the same order so that
# labels[i] describes texts[i].
texts = spam_messages + ham_messages
labels = ["spam"] * len(spam_messages) + ["ham"] * len(ham_messages)

In [None]:
# Keep 30% of the messages aside for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.3, random_state=42
)

# Peek at the first training message and its label, one per line.
print(X_train[0], y_train[0], sep="\n")

In [None]:
# Raw word counts feeding a multinomial Naive Bayes classifier.
# alpha=1.0 is the smoothing parameter; fit_prior=True is passed explicitly as well.
count_vectorizer = CountVectorizer()
count_classifier = MultinomialNB(alpha=1.0, fit_prior=True)
model = make_pipeline(count_vectorizer, count_classifier).fit(X_train, y_train)

# Score the pipeline on the data it was fitted on and on the held-out messages.
y_train_pred, y_test_pred = model.predict(X_train), model.predict(X_test)
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f'Training Accuracy: {train_accuracy * 100:.2f}%')
print(f'Testing Accuracy: {test_accuracy * 100:.2f}%')
report = classification_report(y_test, y_test_pred)
print("\nClassification Report:\n", report)

In [None]:
# Same classifier settings as the count-based pipeline, but the features are TF-IDF
# weights instead of raw word counts.
tfidf_steps = (TfidfVectorizer(), MultinomialNB(alpha=1.0, fit_prior=True))
model = make_pipeline(*tfidf_steps)
model.fit(X_train, y_train)

# Training-set accuracy.
y_train_pred = model.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)

# Held-out accuracy.
y_test_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)

for split_name, split_accuracy in (("Training", train_accuracy), ("Testing", test_accuracy)):
    print(f'{split_name} Accuracy: {split_accuracy * 100:.2f}%')
print("\nClassification Report:\n", classification_report(y_test, y_test_pred))