In [1]:
import pandas as pd
from collections import defaultdict
import numpy as np
from math import log
import re

In [2]:
data = [
    ("Just plan boarding", "negative"),
    ("entirely predictable & lack energy", "negative"),
    ("no surprises & very few laughs", "negative"),
    ("Very powerful", "positive"),
    ("the mist fun films of the summer", "positive"),
    ("Chinese Beijing Chinese", "C"),
    ("Chinese Chinese Shangai", "C"),
    ("Chinese Macao", "C"),
    ("Tokyo japan Chinese", "J"),
]

In [3]:
def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    words = text.split()
    return words    # spliting the words

class_word_counts = defaultdict(lambda: defaultdict(int))   # empty dict
class_counts = defaultdict(int)
vocabulary = set()   # empty set

In [4]:
for text, label in data:
    words = preprocess(text)  # spliting the words
    class_counts[label] += 1
    for word in words:
        vocabulary.add(word)     #counting of vacabulary for entire dataset
        class_word_counts[label][word] += 1   #count occurence of each word  
        
#print(word)
print(vocabulary)
print(class_word_counts)

total_samples = len(data)
class_priors = {label: count / total_samples for label, count in class_counts.items()}
print(class_priors)   # probability of each class

{'shangai', 'the', 'films', 'fun', 'no', 'powerful', 'laughs', 'boarding', 'lack', 'just', 'of', 'plan', 'summer', 'predictable', 'chinese', 'energy', 'mist', 'macao', 'japan', 'tokyo', 'very', 'few', 'beijing', 'entirely', 'surprises'}
defaultdict(<function <lambda> at 0x000001BEA7337B80>, {'negative': defaultdict(<class 'int'>, {'just': 1, 'plan': 1, 'boarding': 1, 'entirely': 1, 'predictable': 1, 'lack': 1, 'energy': 1, 'no': 1, 'surprises': 1, 'very': 1, 'few': 1, 'laughs': 1}), 'positive': defaultdict(<class 'int'>, {'very': 1, 'powerful': 1, 'the': 2, 'mist': 1, 'fun': 1, 'films': 1, 'of': 1, 'summer': 1}), 'C': defaultdict(<class 'int'>, {'chinese': 5, 'beijing': 1, 'shangai': 1, 'macao': 1}), 'J': defaultdict(<class 'int'>, {'tokyo': 1, 'japan': 1, 'chinese': 1})})
{'negative': 0.3333333333333333, 'positive': 0.2222222222222222, 'C': 0.3333333333333333, 'J': 0.1111111111111111}


In [5]:
word_likelihoods = defaultdict(lambda: defaultdict(float))

for label in class_word_counts:
    for word in vocabulary:
        word_likelihoods[label][word] = (class_word_counts[label][word] + 1) / (class_counts[label] + len(vocabulary)) 
        #  smoothing factor  1 le rhe h 
word_likelihoods    # hr class me hr word ki probabilty

defaultdict(<function __main__.<lambda>()>,
            {'negative': defaultdict(float,
                         {'shangai': 0.03571428571428571,
                          'the': 0.03571428571428571,
                          'films': 0.03571428571428571,
                          'fun': 0.03571428571428571,
                          'no': 0.07142857142857142,
                          'powerful': 0.03571428571428571,
                          'laughs': 0.07142857142857142,
                          'boarding': 0.07142857142857142,
                          'lack': 0.07142857142857142,
                          'just': 0.07142857142857142,
                          'of': 0.03571428571428571,
                          'plan': 0.07142857142857142,
                          'summer': 0.03571428571428571,
                          'predictable': 0.07142857142857142,
                          'chinese': 0.03571428571428571,
                          'energy': 0.07142857142857142,
          

In [6]:
test_data = [
    "predictable with no fun",
    "Chinese Chinese Chinese Tokyo Japan"
]  
nb_preds_scratch=[]   # hum kya predict kr rhe h 
for text in test_data:
    print(text)
    words = preprocess(text)
    max_posterior = -float('inf')
    predicted_label = tuple()
    for label in class_priors:
        posterior = log(class_priors[label])
        for word in words:
            if word in word_likelihoods[label]:
                posterior += log(word_likelihoods[label][word])
        print(label, " : ", posterior)
        if posterior > max_posterior:
            max_posterior = posterior
            predicted_label = label
    print(f"Text: '{text}' - Predicted Label: {predicted_label}"+"\n")
    nb_preds_scratch.append(predicted_label)
    
print("predicted classes are : ",nb_preds_scratch)

predictable with no fun
negative  :  -9.708931458073831
positive  :  -10.698440814229317
C  :  -11.095225819193722
J  :  -11.971514191400665
Text: 'predictable with no fun' - Predicted Label: negative

Chinese Chinese Chinese Tokyo Japan
negative  :  -17.75963483954413
positive  :  -17.98326172679792
C  :  -12.384356431859965
J  :  -15.021971364643901
Text: 'Chinese Chinese Chinese Tokyo Japan' - Predicted Label: C

predicted classes are :  ['negative', 'C']


In [7]:
sentences,labels = zip(*data)
print(sentences)
y_test = ('negative','C')

('Just plan boarding', 'entirely predictable & lack energy', 'no surprises & very few laughs', 'Very powerful', 'the mist fun films of the summer', 'Chinese Beijing Chinese', 'Chinese Chinese Shangai', 'Chinese Macao', 'Tokyo japan Chinese')


In [8]:
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [9]:
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(sentences)
X_test_vec = vectorizer.transform(test_data)

nb_model = MultinomialNB() #Train Multinomial Naive Bayes model
nb_model.fit(X_train_vec, labels)

nb_preds_sklearn = nb_model.predict(X_test_vec) # Make predictions


accuracy_scratch = accuracy_score(y_test, nb_preds_scratch) # ye scratch se jo predict kiya

accuracy_sklearn = accuracy_score(y_test, nb_preds_sklearn) # ye direct model se predict kiya

print("Accuracy of Scratch Naive Bayes:", accuracy_scratch)
print("Accuracy of scikit-learn Naive Bayes:", accuracy_sklearn)

Accuracy of Scratch Naive Bayes: 1.0
Accuracy of scikit-learn Naive Bayes: 1.0
