In [1]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics import accuracy_score

# Load the 20 newsgroups dataset
data = fetch_20newsgroups()
categories = data.target_names
train = fetch_20newsgroups(subset='train', categories=categories)
test = fetch_20newsgroups(subset='test', categories=categories)

# Create a set of all unique words in the training data
vocab_set = set()
for dstr in train.data:
    words = dstr.lower().split()
    vocab_set.update(words)

# Convert the set of unique words to a list
vocab_list = list(vocab_set)

# Calculate the prior probabilities for each category
def prior_prob(categories):
    category_counts = np.zeros(len(categories))

    for i in range(len(train.data)):
        GT_category = train.target_names[train.target[i]]

        for j, s in enumerate(categories):
            if GT_category == s:
                category_counts[j] += 1

    total_samples = len(train.data)
    category_probabilities = category_counts / total_samples

    return category_probabilities.tolist()  # Convert to Python list

category_probabilities = prior_prob(categories)
print("Prior Probabilities:")
print(category_probabilities)

# Calculate the likelihood for each word given the category
def likelihood(word, category, train_data, train_targets):
    word_count_in_category = 0
    total_words_in_category = 0

    for i in range(len(train_data)):
        if train_targets[i] == categories.index(category):
            total_words_in_category += len(train_data[i].split())
            word_count_in_category += train_data[i].lower().split().count(word)

    # Laplace smoothing to handle unseen words
    alpha = 1
    likelihood_prob = (word_count_in_category + alpha) / (total_words_in_category + alpha * len(vocab_list))

    return likelihood_prob

# Calculate the posterior probability for a given document and category
def posterior_prob(doc, category, train_data, train_targets, prior_probs):
    likelihoods = [likelihood(word, category, train_data, train_targets) for word in doc.lower().split()]
    log_likelihood_sum = np.sum(np.log(likelihoods))
    log_posterior = np.log(prior_probs[categories.index(category)]) + log_likelihood_sum
    return np.exp(log_posterior)

# Find the category with the highest posterior probability for a given document
def predict_category(doc, categories, train_data, train_targets, prior_probs):
    posterior_probs = [posterior_prob(doc, category, train_data, train_targets, prior_probs) for category in categories]
    predicted_category_index = np.argmax(posterior_probs)
    return categories[predicted_category_index]

Prior Probabilities:
[0.04242531377054976, 0.05161746508750221, 0.0522361675799894, 0.05214778150963408, 0.05108714866537034, 0.05241293972070002, 0.05170585115785752, 0.05250132579105533, 0.05285487007247658, 0.052766484002121264, 0.0530316422131872, 0.05258971186141064, 0.0522361675799894, 0.05250132579105533, 0.05241293972070002, 0.052943256142831886, 0.048258794414000356, 0.04984974368039597, 0.041099522715220084, 0.033321548523952624]


In [2]:
# Example: Predict the category for a random document from the test set
random_test_doc_index = np.random.randint(0, len(test.data))
random_test_doc = test.data[random_test_doc_index]
true_category = test.target_names[test.target[random_test_doc_index]]

predicted_category = predict_category(random_test_doc, categories, train.data, train.target, category_probabilities)

print("Random Test Document:")
print(random_test_doc)
print(f"\nTrue Category: {true_category}")
print(f"Predicted Category: {predicted_category}")

# Measure accuracy on the test set
test_predictions = [predict_category(doc, categories, train.data, train.target, category_probabilities) for doc in test.data]
test_accuracy = accuracy_score(test.target, test_predictions)
print(f"\nTest Accuracy: {test_accuracy}")


Random Test Document:
From: gdnikoli@undergrad.math.uwaterloo.ca (Greg Nikolic)
Subject: Re: Who's next? Mormons and Jews?
Organization: University of Waterloo
Lines: 17

In article <C5s5n0.DyJ@world.std.com> rjk@world.std.com (Robert J. Kolker) writes:
>take their oath at the fortress. Lo Tepol Shaynit Matzadah. Matzadah will
>not fall again!

     These zealots. Holy fuck.

     Israel. Armenia. Turkey. Greece. Croatia. Serbia. Bosnia. Russia. Germany.
Iran. The Arab World.

     War.


-- 
     "Please allow me to introduce myself.               SYMPATHY 
      I'm a man of wealth and taste.                   FOR THE DEVIL
      I've been around for long, long years.            the Laibach  
      Stolen many a man's soul, and faith."               remixes


True Category: talk.politics.guns
Predicted Category: alt.atheism
