In [1]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
# Load the 20 Newsgroups dataset
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
X = newsgroups.data
y = newsgroups.target

# Convert the raw text data into TF-IDF features
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

In [3]:
# Create a Naive Bayes classifier (Multinomial Naive Bayes for text classification)
naive_bayes = MultinomialNB()

# Train the classifier on the training data
naive_bayes.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = naive_bayes.predict(X_test)

In [4]:
# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.6551724137931034


In [5]:
# Print the classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Plot the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)


Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.12      0.21       151
           1       0.72      0.61      0.66       202
           2       0.67      0.59      0.63       195
           3       0.51      0.78      0.62       183
           4       0.89      0.63      0.74       205
           5       0.88      0.81      0.85       215
           6       0.86      0.60      0.71       193
           7       0.85      0.72      0.78       196
           8       0.51      0.75      0.61       168
           9       0.96      0.77      0.85       211
          10       0.87      0.88      0.88       198
          11       0.62      0.83      0.71       201
          12       0.86      0.55      0.67       202
          13       0.89      0.69      0.78       194
          14       0.82      0.74      0.77       189
          15       0.25      0.96      0.40       202
          16       0.76      0.74      0.75       188
    