# Without using sklearn

In [1]:
from collections import defaultdict
import math

In [9]:
class NaiveBayesClassifier:
    """Naive Bayes classifier for categorical features, built on plain dicts.

    Training estimates P(class) and P(feature_i = value | class) as raw
    relative frequencies. No Laplace smoothing is applied; instead, a
    (feature position, value) pair that was never observed together with a
    class contributes the fixed floor probability ``unseen_prob`` at
    prediction time, which keeps ``math.log`` defined.

    Attributes:
        unseen_prob: Floor probability used for unseen feature values.
        class_probs: Maps label -> prior probability, in first-seen label order.
        feature_probs: Maps (feature_index, label) -> {value: probability}.
        labels: Set of labels seen during the last call to ``train``.
    """

    def __init__(self, unseen_prob=1e-6):
        """Create an untrained classifier.

        Args:
            unseen_prob: Probability assigned to a feature value never seen
                with a given class. Must lie in (0, 1].

        Raises:
            ValueError: If ``unseen_prob`` is outside (0, 1].
        """
        # A floor of 0 would make math.log fail inside predict().
        if not 0 < unseen_prob <= 1:
            raise ValueError("unseen_prob must be in the interval (0, 1]")
        self.unseen_prob = unseen_prob
        self.class_probs = {}
        self.feature_probs = {}
        self.labels = set()

    def train(self, X, y):
        """Estimate priors and per-feature conditionals from labelled rows.

        Any previously learned state is replaced.

        Args:
            X: Iterable of rows; each row is an iterable of hashable values.
            y: Iterable of hashable labels, one per row of ``X``.

        Raises:
            ValueError: If ``X`` and ``y`` do not have the same length.
        """
        rows = list(X)
        targets = list(y)
        # zip() would silently drop the surplus rows/labels while the priors
        # were still divided by len(y), so reject mismatched input up front.
        if len(rows) != len(targets):
            raise ValueError(
                f"X and y must have the same length, got {len(rows)} and {len(targets)}"
            )

        total = len(targets)
        label_counts = defaultdict(int)
        feature_counts = defaultdict(lambda: defaultdict(int))
        self.labels = set(targets)

        for features, label in zip(rows, targets):
            label_counts[label] += 1
            for i, value in enumerate(features):
                feature_counts[(i, label)][value] += 1

        # Prior probabilities P(class); dict order is first-seen label order.
        self.class_probs = {label: count / total for label, count in label_counts.items()}

        # Conditional probabilities P(feature=value | class)
        self.feature_probs = {}
        for (i, label), val_counts in feature_counts.items():
            total_label_count = label_counts[label]
            self.feature_probs[(i, label)] = {
                val: count / total_label_count for val, count in val_counts.items()
            }

    def predict(self, features):
        """Return the label with the highest log-posterior score.

        Ties are resolved in favour of the label that appeared first in the
        training labels, so the result does not depend on set/hash ordering.

        Args:
            features: Iterable of feature values, in the same positional
                order as the training rows.

        Returns:
            The predicted label.

        Raises:
            ValueError: If the classifier has not been trained on any samples.
        """
        if not self.class_probs:
            raise ValueError(
                "predict() requires a classifier trained on at least one sample"
            )

        # Materialise once: the values are re-read for every label, which
        # would exhaust a generator after the first label.
        values = list(features)

        results = {}
        # Iterate the insertion-ordered priors rather than the label set so
        # that max() breaks ties deterministically.
        for label, prior in self.class_probs.items():
            log_prob = math.log(prior)
            for i, value in enumerate(values):
                prob = self.feature_probs.get((i, label), {}).get(value, self.unseen_prob)
                log_prob += math.log(prob)
            results[label] = log_prob
        return max(results, key=results.get)


In [20]:
# Training data: 14 observations, one row per observation, four categorical
# features per row (three strings and one boolean).
# NOTE(review): the columns look like outlook, temperature, humidity and windy;
# those names are inferred from the values and are not stated in the notebook.
# Rows are paired with the labels in `y` by position when passed to train(X, y).
X = [
    ['Rainy', 'Hot', 'High', False],
    ['Rainy', 'Hot', 'High', True],
    ['Overcast', 'Hot', 'High', False],
    ['Sunny', 'Mild', 'High', False],
    ['Sunny', 'Cool', 'Normal', False],
    ['Sunny', 'Cool', 'Normal', True],
    ['Overcast', 'Cool', 'Normal', True],
    ['Rainy', 'Mild', 'High', False],
    ['Rainy', 'Cool', 'Normal', False],
    ['Sunny', 'Mild', 'Normal', False],
    ['Rainy', 'Mild', 'Normal', True],
    ['Overcast', 'Mild', 'High', True],
    ['Overcast', 'Hot', 'Normal', False],
    ['Sunny', 'Mild', 'High', True]
]

In [21]:
# Class labels, one per row of X and in the same order
# (14 entries: 9 'Yes' and 5 'No').
y = [
    'No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes',
    'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No'
]

In [22]:
# Fit the hand-written classifier on the full training table
nb = NaiveBayesClassifier()
nb.train(X, y)

# Classify a single instance. This exact row also appears in the training
# data (labelled 'Yes'), so this is a sanity check rather than an evaluation
# on unseen data.
test_instance = ['Overcast', 'Hot', 'Normal', False]
prediction = nb.predict(test_instance)
print(f"Prediction for {test_instance}: {prediction}")

Prediction for ['Overcast', 'Hot', 'Normal', False]: Yes


# Using sklearn

In [25]:
from sklearn.naive_bayes import CategoricalNB
from sklearn.preprocessing import LabelEncoder
import numpy as np



# CategoricalNB expects integer-coded categories, so every feature column is
# encoded with its own LabelEncoder.
# dtype=str spells out the coercion NumPy applies to this mixed str/bool table
# anyway: the boolean column is stored as the strings 'True' / 'False'.
X_array = np.array(X, dtype=str)
encoders = []

# Encode each column of X, keeping the fitted encoder for use at prediction time
X_encoded = []
for i in range(X_array.shape[1]):
    le = LabelEncoder()
    col = X_array[:, i]
    X_encoded.append(le.fit_transform(col))
    encoders.append(le)

# Stack the encoded columns back into shape (n_samples, n_features)
X_encoded = np.array(X_encoded).T

# Encode labels y
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Train Naive Bayes model
model = CategoricalNB()
model.fit(X_encoded, y_encoded)

# Instance to classify. This row also appears in the training data, so this is
# a sanity check rather than an evaluation on unseen data.
test_instance = ['Overcast', 'Hot', 'Normal', False]
# Encode each value with the encoder fitted on its column. str() mirrors the
# string coercion applied to the training table, so False is looked up as
# 'False' instead of relying on the encoder to cast it.
test_encoded = [
    encoder.transform([str(value)])[0]
    for encoder, value in zip(encoders, test_instance)
]

# Predict and map the integer class back to its original label
predicted = model.predict([test_encoded])
predicted_label = label_encoder.inverse_transform(predicted)[0]
print(f"Prediction for {test_instance}: {predicted_label}")


Prediction for ['Overcast', 'Hot', 'Normal', False]: Yes
