In [1]:
import pandas as pd

df = pd.read_csv("/content/naive_bayes_dataset_1000.csv")

print(df.head())
# dataset contains factors that impact the decision of playing tennis outside or not
## note that the data is generated to be random

  Outlook Temperature Humidity  Windy PlayTennis
0    Rain        Cool   Normal   True         No
1   Sunny        Cool   Normal  False        Yes
2    Rain        Cool     High   True         No
3    Rain        Cool   Normal  False         No
4   Sunny         Hot     High   True        Yes


In [2]:
# first, we need to encode the categorical variables in our dataset to numbers
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

categorical_features = ['Outlook', 'Temperature', 'Humidity', 'Windy']
for col in categorical_features:
  df[col] = encoder.fit_transform(df[col])

df['PlayTennis'] = encoder.fit_transform(df['PlayTennis'])

print(df.head())

   Outlook  Temperature  Humidity  Windy  PlayTennis
0        1            0         1      1           0
1        2            0         1      0           1
2        1            0         0      1           0
3        1            0         1      0           0
4        2            1         0      1           1


In [3]:
# split data into features (X) and target (y)
X = df[['Outlook', 'Temperature', 'Humidity', 'Windy']]
y = df['PlayTennis']

# split into train and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Naive Bayes — Plain Language Overview

Naive Bayes is a simple but powerful classification method that uses probability to predict which class a new example belongs to. It looks at how often features occur in each class and uses that information to make a decision.

---

## Core Idea

- The model learns:
  - How common each class is  
  - How common each feature value is **within** each class  
- For a new input, it calculates which class is **most likely** based on those learned probabilities.

It is called *naive* because it assumes that all features are independent of each other — an assumption that is rarely true but still works surprisingly well in practice.

---

## How It Works (Step-by-Step in Plain Language)

1. Count how often each class appears in the dataset.  
2. Count how often each feature value appears within each class.  
3. Convert those counts into probabilities.  
4. For a new example, multiply the feature probabilities for each class.  
5. Select the class with the highest resulting probability score.

---

## Types of Naive Bayes

- **Gaussian Naive Bayes**  
  For continuous numeric features (assumes a bell-curve distribution).

- **Multinomial Naive Bayes**  
  For count-based data, especially text (word counts, TF-IDF).

- **Bernoulli Naive Bayes**  
  For binary yes/no or 0/1 features.

---

## Strengths

- Extremely fast to train  
- Fast predictions  
- Works well with many features  
- Excellent for text classification  
- Performs well even with limited data  

---

## Limitations

- Assumes features don’t depend on each other  
- Not ideal for datasets with strong feature correlations  
- Less effective for complex nonlinear relationships  

---



In [4]:
# now, determine which Naive Bayes classifier we should use for this dataset
# options:
  # GaussianNB: assumes features are continuous and normally distributed (like height, weight, temperature)
  # MultinomialNB: assumes features are counts or frequencies (like word counts in text classification) or integer-encoded categories
  # BernoulliNB: assumes features are binary
# we should use MultinomialNB for this data since we have integer-encoded categories

from sklearn.naive_bayes import MultinomialNB

nb_model = MultinomialNB()

nb_model.fit(X_train, y_train)

In [5]:
# make predictions using NB model
y_pred = nb_model.predict(X_test)

In [7]:
# compute metrics to see how model did
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# accuracy: tells what % of correct predictions
acc = accuracy_score(y_pred, y_test) * 100
print("Accuracy: ", acc, "%")

# confusion matrix: correctly labelled datapoints are top left and bottom right (true negative, true positive)
cm = confusion_matrix(y_pred, y_test)
print("\nConfusion matrix:\n", cm)

## classification report: shows precision, recall, f1-score, and support metrics
# precision: when my model predicts label 1 for ex, how often is it right? TP / (TP + FP)
# recall: of all the actual label 1s, how many did my model correctly label as 1? TP / (TP + FN)
# f1-score: how balanced is my model of correctly labelling TP and avoiding false alarms? (2 * ((precision * recall) / (precision + recall)))
# support: number of true predictions for each class
cr = classification_report(y_pred, y_test)
print("\nClassification report:\n", cr)

Accuracy:  46.0 %

Confusion matrix:
 [[16 18]
 [90 76]]

Classification report:
               precision    recall  f1-score   support

           0       0.15      0.47      0.23        34
           1       0.81      0.46      0.58       166

    accuracy                           0.46       200
   macro avg       0.48      0.46      0.41       200
weighted avg       0.70      0.46      0.52       200

