In [7]:
import re
from sklearn.datasets import fetch_20newsgroups
from typing import List, Tuple, Dict, Iterable
import math
from collections import defaultdict
import numpy as np

# The four newsgroups used in this notebook.
# NOTE(review): the classifier below hard-codes target ids 0..3; confirm they
# line up with `train_data.target_names` before trusting per-class results.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

# Train and test splits are fetched with the same shuffle settings.
train_data, test_data = (
    fetch_20newsgroups(subset=split, categories=categories, shuffle=True, random_state=42)
    for split in ('train', 'test')
)

# Combined train + test corpus for the same categories.
dataset = fetch_20newsgroups(subset='all', categories=categories)

def tokenize(text):
    """Return the set of distinct tokens found in ``text``.

    The text is lowercased first; a token is then any maximal run of
    characters from ``a-z``, ``0-9`` and the apostrophe. Because the result
    is a set, a token repeated within the text appears only once.

    Args:
        text: The string to split into tokens.

    Returns:
        A ``set`` of token strings (empty when nothing matches).
    """
    return {match.group() for match in re.finditer("[a-z0-9']+", text.lower())}

In [8]:
class NaiveBayesClassifier:
    """Four-class Naive Bayes text classifier based on token presence.

    Target ids are interpreted as 0 = atheism, 1 = graphics, 2 = med,
    3 = christian. Documents are split with the module-level ``tokenize``
    function, which yields each distinct token once per document, so every
    count stored here is "number of training documents of that class that
    contain the token".

    Scoring sums ``log P(token | class)`` over the tokens present in a
    document. No class-prior term is added, and tokens absent from the
    document do not contribute.

    Attributes:
        k: Additive smoothing constant.
        tokens: Every token seen during training.
        token_atheism_counts, token_graphics_counts, token_med_counts,
        token_christian_counts: Per-class ``{token: document count}`` dicts.
        atheism_messages, graphics_messages, med_messages,
        christian_messages: Number of training documents seen per class.
    """

    # Target ids the model knows about. The position in the per-class tuples
    # returned by the helpers below equals the target id.
    _LABELS = (0, 1, 2, 3)

    def __init__(self, k=0.5):
        """Create an untrained classifier.

        Args:
            k: Smoothing constant added to every token count.
        """
        self.k = k
        self.tokens = set()
        self.token_atheism_counts = {}
        self.token_graphics_counts = {}
        self.token_med_counts = {}
        self.token_christian_counts = {}
        self.atheism_messages = self.graphics_messages = self.med_messages = self.christian_messages = 0

    def _token_count_tables(self):
        """Return the per-class token-count dicts, indexed by target id."""
        return (
            self.token_atheism_counts,
            self.token_graphics_counts,
            self.token_med_counts,
            self.token_christian_counts,
        )

    def _message_counts(self):
        """Return the per-class training-document counts, indexed by target id."""
        return (
            self.atheism_messages,
            self.graphics_messages,
            self.med_messages,
            self.christian_messages,
        )

    def train(self, dataset):
        """Accumulate document and token counts from a labelled dataset.

        Calling ``train`` again adds to the existing counts.

        Args:
            dataset: Object exposing ``data`` (sequence of document strings)
                and ``target`` (label for each document, indexable by
                position).
        """
        tables = self._token_count_tables()
        documents_per_class = [0] * len(tables)

        for position, document in enumerate(dataset.data):
            label = dataset.target[position]
            document_tokens = tokenize(document)
            # The vocabulary grows for every document, whatever its label.
            self.tokens.update(document_tokens)

            # Labels outside 0..3 are not attributed to any class.
            if label not in self._LABELS:
                continue

            slot = int(label)
            documents_per_class[slot] += 1
            counts = tables[slot]
            for token in document_tokens:
                counts[token] = counts.get(token, 0) + 1

        self.atheism_messages += documents_per_class[0]
        self.graphics_messages += documents_per_class[1]
        self.med_messages += documents_per_class[2]
        self.christian_messages += documents_per_class[3]

    def _probabilities(self, token):
        """Return the smoothed ``P(token | class)`` for each of the four classes.

        Each value is ``(count + k) / (class_documents + 2 * k)``, where
        ``count`` is how many training documents of the class contained the
        token (0 when it was never seen for that class).

        Args:
            token: The token to look up.

        Returns:
            A 4-tuple ordered (atheism, graphics, med, christian).
        """
        return tuple(
            (table.get(token, 0) + self.k) / (n_documents + 2 * self.k)
            for table, n_documents in zip(self._token_count_tables(), self._message_counts())
        )

    def predict(self, text):
        """Predict a class for every document of a labelled dataset.

        Args:
            text: Object exposing ``data`` (sequence of document strings) and
                ``target`` (label for each document, indexable by position).

        Returns:
            A tuple ``(predicted, target)`` of NumPy arrays: the predicted
            target id for each document, and the labels read from
            ``text.target`` for the same positions.
        """
        target = np.array([text.target[i] for i in range(len(text.data))])

        predictions = []
        for document in text.data:
            log_scores = [0.0] * len(self._LABELS)
            for token in tokenize(document):
                for slot, probability in enumerate(self._probabilities(token)):
                    log_scores[slot] += math.log(probability)

            # max() returns the first maximal item, so exactly one label is
            # emitted per document and ties (e.g. a document with no tokens)
            # resolve to the lowest target id. Emitting every tied label
            # would make the prediction array longer than the target array.
            predictions.append(max(self._LABELS, key=log_scores.__getitem__))

        return np.array(predictions), target
        
# Fit the classifier on the training split; k is the additive-smoothing constant
# applied to every token count.
model = NaiveBayesClassifier(k=0.5)
model.train(train_data)

In [9]:
# Accuracy on the held-out split: the fraction of test documents whose
# predicted label equals the true label.
testing = model.predict(test_data)
arr1, arr2 = testing  # arr1: predicted labels, arr2: true labels

# An element-wise comparison is only meaningful with one prediction per document.
assert arr1.shape == arr2.shape, "predict() must return exactly one label per document"

arr3 = arr1 == arr2
testvalue = arr3.mean()  # mean of a boolean array is the fraction of True entries
testvalue

0.8901464713715047