**Gaussian Naive Bayes**

In [None]:
# Importing the necessary libraries: numpy, matplotlib, and seaborn for data visualization.
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme to every matplotlib figure created afterwards.
# `set_theme()` is the preferred spelling; `sns.set()` is only kept as an alias for it.
sns.set_theme()

In [None]:
# `%matplotlib inline` renders each figure directly below the cell that creates it.
# Recent Jupyter/IPython versions already behave this way by default, so the magic
# is harmless but usually optional.
%matplotlib inline

In [None]:
# Importing make_blobs from sklearn.datasets to generate a synthetic, labelled dataset for the classification demo.
from sklearn.datasets import make_blobs

In [None]:
# Generating a synthetic dataset with 100 samples, each having 2 features,
# drawn from 2 Gaussian blobs (one per class) with a standard deviation of 1.5.
# `random_state` is fixed so that Restart & Run All reproduces the same points;
# without it the blob centers move on every run and may fall outside the
# hard-coded rectangle that is sampled for `x_new` later in the notebook.
x, y = make_blobs(n_samples=100, n_features=2, centers=2, cluster_std=1.5, random_state=2)

In [None]:
# Scatter plot of the synthetic samples: first feature on the x-axis, second on the y-axis.
# Points are colored by class label (c=y) on the red/blue 'RdBu' color map, with marker size 50.
plt.scatter(
    x[:, 0],
    x[:, 1],
    c=y,
    s=50,
    cmap='RdBu',
)

In [None]:
# Importing the Gaussian Naive Bayes model from sklearn to perform classification.
from sklearn.naive_bayes import GaussianNB

# Creating a Gaussian Naive Bayes classifier with scikit-learn's default settings
# (no arguments are passed). It is untrained until `fit` is called on it.
model = GaussianNB()

In [None]:
# Training the classifier on the synthetic data: `x` holds the two feature columns
# and `y` holds the class label of each sample.
model.fit(x, y)

In [None]:
# Sampling 1000 random points spread over a fixed rectangle of the feature plane.
# The fitted model labels them in a later cell so the decision boundary becomes visible.
rng = np.random.RandomState(0)  # Fixed seed: the same points are drawn on every run.
lower_corner = np.array([-6, -14])  # Smallest (first feature, second feature) values of the rectangle.
extent = np.array([14, 18])  # Width and height of the rectangle.
x_new = lower_corner + extent * rng.rand(1000, 2)  # Scale uniform [0, 1) draws into the rectangle.

In [None]:
# Predicting a class label for each of the new random points with the fitted model.
# `y_new` is used in the next plot to color those points.
y_new = model.predict(x_new)

In [None]:
# Overlaying the model's predictions for the random points on top of the training data.
point_style = dict(s=50, cmap='RdBu')  # Marker size and color map shared by both layers.
plt.scatter(x[:, 0], x[:, 1], c=y, **point_style)  # Training points, fully opaque.
lim = plt.axis()  # Remember the axis limits that frame the training data.
plt.scatter(x_new[:, 0], x_new[:, 1], c=y_new, alpha=0.2, **point_style)  # Predicted points, faint.
plt.axis(lim)  # Restore those limits so the extra points do not rescale the view.

**Multinomial Naive Bayes**

In [None]:
# Importing the loader for the 20 newsgroups dataset: roughly 18,000 posts spread across 20 different newsgroups (topics).
from sklearn.datasets import fetch_20newsgroups

In [None]:
# Loading the 20 newsgroups training set (downloaded on first use) and listing
# every available category name, so the selection made below can be checked against it.
data = fetch_20newsgroups()
data.target_names

In [None]:
# The four newsgroups this example is restricted to.
categories = [
    'talk.religion.misc',
    'soc.religion.christian',
    'sci.space',
    'comp.graphics',
]

In [None]:
# Loading the train and test splits, both restricted to the selected categories.
# The splits are fetched in order: 'train' first, then 'test'.
train, test = (
    fetch_20newsgroups(subset=split, categories=categories)
    for split in ('train', 'test')
)

In [None]:
# Printing one raw document from the training set (index 5) to see what the
# text looks like before any vectorization is applied.
print(train.data[5])

In [None]:
# Importing TfidfVectorizer for text data preprocessing. It converts the raw text into numerical features.
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Importing Multinomial Naive Bayes classifier, commonly used for text classification tasks.
from sklearn.naive_bayes import MultinomialNB

# Importing make_pipeline, which helps create a sequence of data processing steps.
from sklearn.pipeline import make_pipeline

In [None]:
# Building a two-step pipeline:
#   - TfidfVectorizer turns each raw document into a vector of TF-IDF weights;
#   - MultinomialNB classifies those vectors into one of the selected categories.
# Note: this rebinds `model`, replacing the Gaussian Naive Bayes classifier from the first section.
model = make_pipeline(
    TfidfVectorizer(),
    MultinomialNB(),
)

In [None]:
# Training the whole pipeline on the training split: `train.data` holds the raw
# documents and `train.target` holds the integer label of each document.
model.fit(train.data, train.target)

In [None]:
# Predicting a label for every document in the held-out test split.
# `labels` is compared against `test.target` in the confusion matrix below.
labels = model.predict(test.data)

In [None]:
# Evaluating the model's performance by generating a confusion matrix to compare actual vs predicted labels.
from sklearn.metrics import confusion_matrix

In [None]:
# Building the confusion matrix from the true labels (test.target) and the predicted labels (labels).
# With this argument order, scikit-learn puts the true classes on the rows and the
# predicted classes on the columns.
mat = confusion_matrix(test.target, labels)

In [None]:
# Drawing the confusion matrix as an annotated heatmap.
# `mat.T` puts the actual classes along the x-axis and the predicted classes
# along the y-axis, matching the axis labels set below.
# annot=True with fmt='d' writes the integer count in each cell; cbar=False hides the color bar.
class_names = train.target_names
sns.heatmap(
    mat.T,
    square=True,
    annot=True,
    fmt='d',
    cbar=False,
    xticklabels=class_names,
    yticklabels=class_names,
)

# Naming the axes.
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')

In [None]:
# Helper that maps a piece of free text to the name of its predicted newsgroup.
def predict_category(s, train=train, model=model):
    """Return the category name the model predicts for the string `s`.

    The defaults for `train` and `model` are bound when this cell runs, so the
    function keeps using those objects unless others are passed explicitly.

    Args:
        s: Text to classify.
        train: Object whose `target_names` sequence maps label indices to names.
        model: Fitted estimator whose `predict` accepts a list of strings.

    Returns:
        The entry of `train.target_names` at the predicted label index.
    """
    # `predict` works on batches, so wrap the single string in a list and
    # take the first prediction back out.
    predicted_index = model.predict([s])[0]
    return train.target_names[predicted_index]

In [None]:
# Trying the helper on a short, hand-written example.
predict_category('discussing islam vs atheism')  # Returns the predicted category name, shown as the cell output.

In [None]:
predict_category('determining the screen resolution')  # A second example with different wording, run through the same helper.