In [3]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics import confusion_matrix

In [4]:
# Load the default split once so the list of newsgroup names can be read off it.
data = fetch_20newsgroups()
categories = data.target_names

# Fetch the training and test splits, both restricted to that category list
# (training split first, then test, as separate calls).
train, test = (
    fetch_20newsgroups(subset=split_name, categories=categories)
    for split_name in ('train', 'test')
)

In [5]:
# Collect every distinct lower-cased, whitespace-delimited token that appears
# in the training documents.
vocab_set = set()
for document in train.data:
    vocab_set.update(document.lower().split())

# Turn the set into a list so each word gets a fixed column index.
# Iteration order of a set of strings can differ from one interpreter session
# to the next, so sort the words: this keeps the word -> column mapping
# identical across kernel restarts.
vocab_list = sorted(vocab_set)

In [37]:
# Word-count table with one row per category and one column per vocabulary
# word. Every cell starts at 1 rather than 0, i.e. add-one (Laplace) smoothing.
word_counts = np.ones(shape=(len(categories), len(vocab_list)), dtype=np.float64)

# Calculate the prior probabilities for each category
def prior_prob(categories, targets=None, target_names=None):
    """Estimate the prior probability of each category from document labels.

    Parameters
    ----------
    categories : sequence of str
        Category names to report priors for, in the order the result should
        follow. A name that never occurs among the labels gets probability 0.
    targets : sequence of int, optional
        Integer label of every document, indexing into ``target_names``.
        Defaults to ``train.target`` from the notebook namespace.
    target_names : sequence of str, optional
        Category name for each integer label. Defaults to
        ``train.target_names`` from the notebook namespace.

    Returns
    -------
    numpy.ndarray
        Float array of length ``len(categories)``; entry ``j`` is the fraction
        of documents whose label name equals ``categories[j]``.
    """
    # Fall back to the notebook-level training set so existing calls of the
    # form prior_prob(categories) keep working unchanged.
    if targets is None:
        targets = train.target
    if target_names is None:
        target_names = train.target_names

    labels = np.asarray(targets, dtype=np.intp)
    names = list(target_names)
    total_samples = labels.size

    # Count documents per integer label in one vectorized pass instead of
    # comparing every document against every category name.
    per_label = np.bincount(labels, minlength=len(names))

    # Aggregate by name so the lookup below is keyed the same way the
    # categories argument is.
    counts_by_name = {}
    for name, count in zip(names, per_label):
        counts_by_name[name] = counts_by_name.get(name, 0) + count

    category_counts = np.array(
        [counts_by_name.get(name, 0) for name in categories], dtype=float
    )
    category_prob = category_counts / total_samples

    return category_prob

# Compute the class priors and show them under a heading line.
category_prob = prior_prob(categories)
print("Prior Probabilities:", category_prob, sep="\n")

Prior Probabilities:
[0.04242531 0.05161747 0.05223617 0.05214778 0.05108715 0.05241294
 0.05170585 0.05250133 0.05285487 0.05276648 0.05303164 0.05258971
 0.05223617 0.05250133 0.05241294 0.05294326 0.04825879 0.04984974
 0.04109952 0.03332155]


In [38]:
def compute_likelihood(data, vocab_list, categories, targets=None, target_names=None):
    """Estimate per-category word likelihoods with add-one (Laplace) smoothing.

    Each document is lower-cased and split on whitespace; every token found in
    ``vocab_list`` increments the count for that word in the document's
    category. Counts start at 1, and each row is then normalized to sum to 1.

    Parameters
    ----------
    data : sequence of str
        Documents to count words in.
    vocab_list : sequence of str
        Vocabulary; the position of a word is its column in the result.
        Tokens not present in the vocabulary are ignored.
    categories : sequence of str
        Category names; the position of a name is its row in the result.
    targets : sequence of int, optional
        Integer label of ``data[i]``, indexing into ``target_names``.
        Defaults to ``train.target`` from the notebook namespace, which is
        only correct when ``data`` is (a prefix of) ``train.data``.
    target_names : sequence of str, optional
        Category name for each integer label. Defaults to
        ``train.target_names`` from the notebook namespace.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(categories), len(vocab_list))`` whose rows each
        sum to 1.

    Raises
    ------
    ValueError
        If a document's category name is not present in ``categories``.
    """
    # Fall back to the notebook-level training labels so existing calls of the
    # form compute_likelihood(train.data, vocab_list, categories) still work.
    if targets is None:
        targets = train.target
    if target_names is None:
        target_names = train.target_names

    num_categories = len(categories)
    num_words = len(vocab_list)

    # Build hash lookups once instead of scanning the lists for every token.
    # setdefault keeps the first position of a repeated entry, matching what
    # list.index would return.
    word_to_index = {}
    for position, word in enumerate(vocab_list):
        word_to_index.setdefault(word, position)
    category_to_index = {}
    for position, name in enumerate(categories):
        category_to_index.setdefault(name, position)

    # Initialize counts for each word in each category with Laplace smoothing
    word_counts = np.ones((num_categories, num_words))

    # Count occurrences of each word in each category
    for i, document in enumerate(data):
        category_name = target_names[targets[i]]
        if category_name not in category_to_index:
            raise ValueError(f"{category_name!r} is not in categories")
        # A row view: in-place increments below write through to word_counts.
        category_row = word_counts[category_to_index[category_name]]
        for word in document.lower().split():
            word_index = word_to_index.get(word)
            if word_index is not None:
                category_row[word_index] += 1

    # Convert counts to probabilities
    likelihood_matrix = word_counts / np.sum(word_counts, axis=1, keepdims=True)

    return likelihood_matrix

# Build the likelihood table from the training documents and show it under a
# heading line.
likelihood_matrix = compute_likelihood(train.data, vocab_list, categories)
print("Likelihood Matrix:", likelihood_matrix, sep="\n")
