In [16]:
from collections import defaultdict, Counter
import math
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from nltk.corpus import movie_reviews
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd

# Naive Bayes

Naïve Bayes is a probabilistic machine learning algorithm based on Bayes' Theorem. It is called "naïve" because it assumes that the features are independent of each other, which is often not true in real-world data. Despite this simplification, it works well for many tasks, especially text classification.

$$
P(A \mid B) = \frac{P(B \mid A) \cdot P(A)}{P(B)}
$$

$P(A)$: The prior probability of \(A\), i.e., the probability of \(A\) before observing any evidence.

$P(B)$: The probability of \(B\), often calculated as:

$$
P(B) = \sum_{i} P(B \mid A_i) \cdot P(A_i)
$$

$P(A \mid B)$: The probability of event \(A\) (the hypothesis) given that \(B\) (the evidence) has occurred. This is the posterior probability.

$P(B \mid A)$: The probability of observing \(B\) given that \(A\) is true. This is the likelihood.

Because this approach is so simple, its quite fast and might fit simple text classification tasks.

In [17]:
docs = [
    "I love coding",     
    "coding is fun", 
    "I hate bugs",       
    "debugging is hard", 
]
labels = ["Positive", "Positive", "Negative", "Negative"]

In [18]:
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(docs)

In [19]:
model = MultinomialNB()
model.fit(X_train, labels)

In [20]:
test_sentence = ["I hate bugs"]
X_test = vectorizer.transform(test_sentence)
predicted_class = model.predict(X_test)[0]

print(f"The sentence '{test_sentence[0]}' is predicted to be: {predicted_class}")

The sentence 'I hate bugs' is predicted to be: Negative


## Count Vectorizer

In [21]:
texts = ["I love coding", "I love Python", "Python is great"]
vectorizer = CountVectorizer()
X_vectors = vectorizer.fit_transform(texts)

print("\nFeature names (vocabulary):")
print(vectorizer.get_feature_names_out())

print("Dense matrix:")
print(X_vectors.toarray())


Feature names (vocabulary):
['coding' 'great' 'is' 'love' 'python']
Dense matrix:
[[1 0 0 1 0]
 [0 0 0 1 1]
 [0 1 1 0 1]]


Feature names are the words in the vocabulary. Each has an index. 

Please note, the word "I" is not there. Why? Because CountVectorizer removes stop words like I 

The dense matrix represents wether the word exists in the sentence. For example in the first sentence we have:
I love coding. These are indexes:
- I is not part of the index
- love is index 3
- coding is index 0

Therefor in the first row I expect to see 1 in index 0 and 3. Zeros in all the rest. This **IS** what I get.

In [22]:
data = [
    ("Discover the best hiking trails", "Not Spam"),
    ("Win a trip to the Amazon jungle", "Spam"),
    ("Experience the beauty of forests", "Not Spam"),
    ("Exclusive safari deal for you", "Spam"),
    ("Save the whales, donate today", "Not Spam"),
    ("Free camping gear with purchase", "Spam")
]

messages, labels = zip(*data)

pipeline = make_pipeline(CountVectorizer(), MultinomialNB())

pipeline.fit(messages, labels)

In [23]:
test_message = ["Free hiking gear for you"]
predicted_class = pipeline.predict(test_message)
predicted_proba = pipeline.predict_proba(test_message)

print(f"Message: {test_message[0]}")
print(f"Predicted Class: {predicted_class[0]}")
print("Predicted Probabilities:", dict(zip(pipeline.classes_, predicted_proba[0])))

Message: Free hiking gear for you
Predicted Class: Spam
Predicted Probabilities: {'Not Spam': 0.1229815214288261, 'Spam': 0.8770184785711737}



# Naive Bayes Text Classification with a Nature Theme

### Step 1: Data Preparation

#### Training Data
| Message                             | Class    |
|-------------------------------------|----------|
| "Discover the best hiking trails"   | Not Spam |
| "Win a trip to the Amazon jungle"   | Spam     |
| "Experience the beauty of forests"  | Not Spam |
| "Exclusive safari deal for you"     | Spam     |
| "Save the whales, donate today"     | Not Spam |
| "Free camping gear with purchase"   | Spam     |


### Vocabulary
The unique words across all messages form the vocabulary:

In [24]:
bag_of_words = ['discover', 'the', 'best', 'hiking', 'trails', 'win', 'a', 'trip', 'to',
                'amazon', 'jungle', 'experience', 'beauty', 'of', 'forests', 'exclusive', 'safari',
                'deal', 'for', 'you', 'save', 'whales', 'donate', 'today', 'free', 'camping', 'gear',
                'with', 'purchase']

We use this vocabulary to calculate word probabilities for each class.

### Step 2: Word Frequency Calculation

In [25]:
spam_words = ['win', 'a', 'trip', 'to', 'amazon', 'jungle', 'exclusive', 'safari',
        'deal', 'for', 'you', 'free', 'camping', 'gear', 'with', 'purchase']

In [26]:
non_spam_words = ['discover', 'the', 'best', 'hiking', 'trails', 'experience', 'the', 'beauty', 'of', 'forests',
                'save', 'the', 'whales', 'donate', 'today']

In [27]:
len(non_spam_words)

15


### Step 3: Laplace Smoothing

To handle words that may not appear in a specific class, we apply Laplace smoothing.  
The formula is:

$$ P(\text{Word} \mid \text{Class}) = \frac{\text{Word Count in Class} + 1}{\text{Total Words in Class} + \text{Vocabulary Size}} $$

**Vocabulary Size = 28** (total unique words).

#### Example Calculations:
- For the word "win" in Spam:
$$ P(\text{win} \mid \text{Spam}) = \frac{1 + 1}{16 + 28} = \frac{2}{44} \approx 0.0455 $$

- For the word "forests" in Spam (not present):
$$ P(\text{forests} \mid \text{Spam}) = \frac{0 + 1}{16 + 28} = \frac{1}{44} \approx 0.0227 $$



### Step 4: Prior Probabilities

The prior probabilities are based on the class distribution in the dataset.

- **Spam**:
$$ P(\text{Spam}) = \frac{\text{Spam Messages}}{\text{Total Messages}} = \frac{3}{6} = 0.5 $$

- **Not Spam**:
$$ P(\text{Not Spam}) = \frac{\text{Not Spam Messages}}{\text{Total Messages}} = \frac{3}{6} = 0.5 $$



### Step 5: Classifying a New Message

Let’s classify the message: `"free hiking gear for you"`.

We calculate the posterior probabilities for both Spam and Not Spam using Bayes’ theorem:

$$ P(\text{Spam} \mid \text{Message}) \propto P(\text{Spam}) \prod_{\text{Wor d} \in \text{Message}} P(\text{Word} \mid \text{Spam}) $$

$$ P(\text{Not Spam} \mid \text{Message}) \propto P(\text{Not Spam}) \prod_{\text{Word} \in \text{Message}} P(\text{Word} \mid \text{Not Spam}) $$

Next, we compute these probabilities step-by-step.


Weird signs explanation:

The chance of the message being spam, is proportional to the chance of it being spam (with no regards to the message itself) times, times the probability of it being spam times each word's probability of being spam

Basically you take the chance of something being spam or not spam, unrelated to the message it self, lets say the chance of being spam is 0.3.

You take each word, calculate its chance of being spam, and multiply by 0.3. You sum this over all the words in the message, and you get the chance of it being spam.

In [28]:
data = [
    ("Discover the best hiking trails", "Not Spam"),
    ("Win a trip to the Amazon jungle", "Spam"),
    ("Experience the beauty of forests", "Not Spam"),
    ("Exclusive safari deal for you", "Spam"),
    ("Save the whales, donate today", "Not Spam"),
    ("Free camping gear with purchase", "Spam")
]

## Exercise

Build your own Naive Bayes model from scratch

Instructions:

1. Tokenize the data in what ever way you see fit
2. Create a data structure that for each class, holds the probability of each word in that class. For example:
- in the class spam: free: 0.3, commit: 0.03 (made up numbers)
3. Given a sentence, calculate its probability for each class and choose the higher probability as the prediction

Once its ready, use it to classify the IMDB dataset

### Solution

In [29]:
def tokenize(message):
    return message.lower().split()

In [30]:
vocabulary = set()
class_word_counts = defaultdict(Counter)
class_counts = Counter()

In [31]:
for message, label in data:
    tokens = tokenize(message)
    vocabulary.update(tokens)
    class_word_counts[label].update(tokens)
    class_counts[label] += 1

In [32]:
vocabulary = sorted(vocabulary)
vocab_size = len(vocabulary)

In [33]:
total_messages = sum(class_counts.values())
priors = {cls: count / total_messages for cls, count in class_counts.items()}

In [34]:
word_likelihoods = {}
for cls, word_count in class_word_counts.items():
    total_words_in_class = sum(word_count.values())
    word_likelihoods[cls] = {
        word: (word_count[word] + 1) / (total_words_in_class + vocab_size)
        for word in vocabulary}

In [35]:
word_likelihoods

{'Not Spam': {'a': 0.022727272727272728,
  'amazon': 0.022727272727272728,
  'beauty': 0.045454545454545456,
  'best': 0.045454545454545456,
  'camping': 0.022727272727272728,
  'deal': 0.022727272727272728,
  'discover': 0.045454545454545456,
  'donate': 0.045454545454545456,
  'exclusive': 0.022727272727272728,
  'experience': 0.045454545454545456,
  'for': 0.022727272727272728,
  'forests': 0.045454545454545456,
  'free': 0.022727272727272728,
  'gear': 0.022727272727272728,
  'hiking': 0.045454545454545456,
  'jungle': 0.022727272727272728,
  'of': 0.045454545454545456,
  'purchase': 0.022727272727272728,
  'safari': 0.022727272727272728,
  'save': 0.045454545454545456,
  'the': 0.09090909090909091,
  'to': 0.022727272727272728,
  'today': 0.045454545454545456,
  'trails': 0.045454545454545456,
  'trip': 0.022727272727272728,
  'whales,': 0.045454545454545456,
  'win': 0.022727272727272728,
  'with': 0.022727272727272728,
  'you': 0.022727272727272728},
 'Spam': {'a': 0.04347826086

In [36]:
new_message = "free hiking gear for you"
tokens = tokenize(new_message)

In [37]:
posteriors = {}
for cls in class_counts.keys():
    posterior = priors[cls]
    for token in tokens:
        if token in vocabulary:
            posterior *= word_likelihoods[cls][token]
    posteriors[cls] = posterior

In [38]:
print("Posteriors:", posteriors)
print("Predicted class:", max(posteriors, key=posteriors.get))

Posteriors: {'Not Spam': 6.063684795499563e-09, 'Spam': 3.8841932445178874e-08}
Predicted class: Spam



### Step 5: Using Logarithms for Stability

To avoid numerical underflow, we calculate probabilities in log-space. Instead of multiplying probabilities, we add their logarithms.

For example:

$$
\log P(\text{Spam} \mid \text{Message}) = \log P(\text{Spam}) + \sum_{\text{Word} \in \text{Message}} \log P(\text{Word} \mid \text{Spam})
$$

Similarly for Not Spam:

$$
\log P(\text{Not Spam} \mid \text{Message}) = \log P(\text{Not Spam}) + \sum_{\text{Word} \in \text{Message}} \log P(\text{Word} \mid \text{Not Spam})
$$

We use this approach for classification.


In [39]:
vocabulary = set()
class_word_counts = defaultdict(Counter)
class_counts = Counter()

for message, label in data:
    tokens = tokenize(message)
    vocabulary.update(tokens)
    class_word_counts[label].update(tokens)
    class_counts[label] += 1

# Calculate priors in log-space
total_messages = sum(class_counts.values())
log_priors = {cls: math.log(count / total_messages) for cls, count in class_counts.items()}

# Laplace smoothed likelihoods in log-space
log_word_likelihoods = {}
for cls, word_count in class_word_counts.items():
    total_words_in_class = sum(word_count.values())
    log_word_likelihoods[cls] = {
        word: math.log((word_count[word] + 1) / (total_words_in_class + vocab_size))
        for word in vocabulary
    }

In [40]:
new_message = "free hiking gear for you"
tokens = tokenize(new_message)

log_posteriors = {}
for cls in class_counts.keys():
    log_posterior = log_priors[cls]
    for token in tokens:
        if token in vocabulary:
            log_posterior += log_word_likelihoods[cls][token]
    log_posteriors[cls] = log_posterior

print("Log-Posteriors:", log_posteriors)
print("Predicted class:", max(log_posteriors, key=log_posteriors.get))

Log-Posteriors: {'Not Spam': -18.920948169591306, 'Spam': -17.06376544076564}
Predicted class: Spam


In [42]:
df = pd.read_csv('../datasets/IMDB Dataset.csv')

In [43]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [44]:
list_of_reviews = df.values.tolist()

In [45]:
vocabulary = set()
class_word_counts = defaultdict(Counter)
class_counts = Counter()

for message, label in list_of_reviews:
    tokens = tokenize(message)
    vocabulary.update(tokens)
    class_word_counts[label].update(tokens)
    class_counts[label] += 1

total_messages = sum(class_counts.values())
log_priors = {cls: math.log(count / total_messages) for cls, count in class_counts.items()}

log_word_likelihoods = {}
for cls, word_count in class_word_counts.items():
    total_words_in_class = sum(word_count.values())
    log_word_likelihoods[cls] = {
        word: math.log((word_count[word] + 1) / (total_words_in_class + vocab_size))
        for word in vocabulary
    }

In [46]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [47]:
df['review'][3]

"Basically there's a family where a little boy (Jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />This movie is slower than a soap opera... and suddenly, Jake decides to become Rambo and kill the zombie.<br /><br />OK, first of all when you're going to make a film you must Decide if its a thriller or a drama! As a drama the movie is watchable. Parents are divorcing & arguing like in real life. And then we have Jake with his closet which totally ruins all the film! I expected to see a BOOGEYMAN similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs. As for the shots with Jake: just ignore them."

In [48]:
new_message = df['review'][3]
tokens = tokenize(new_message)

log_posteriors = {}
for cls in class_counts.keys():
    log_posterior = log_priors[cls]
    for token in tokens:
        if token in vocabulary:
            log_posterior += log_word_likelihoods[cls][token]
    log_posteriors[cls] = log_posterior

In [49]:
print("Log-Posteriors:", log_posteriors)
print("Predicted class:", max(log_posteriors, key=log_posteriors.get))

Log-Posteriors: {'positive': -1012.5910593801633, 'negative': -998.9412189588429}
Predicted class: negative


In [72]:
def classify_sentiment():
    review = input()
    tokenized_redview = tokenize(review)

    log_posteriors = {}
    for cls in class_counts.keys():
        log_posterior = log_priors[cls]
        for token in tokenized_redview:
            if token in vocabulary:
                log_posterior += log_word_likelihoods[cls][token]
        log_posteriors[cls] = log_posterior
    print(log_posteriors)
    return max(log_posteriors, key=log_posteriors.get)
    

In [73]:
classify_sentiment()

 its okay, not the best I've seen, but quite good


{'positive': -68.72435528015218, 'negative': -68.58871296916749}


'negative'