In [1]:
from datetime import datetime, timedelta

import pandas as pd
import random

import requests

# Load the raw review data into a DataFrame.
# 'review.csv' is a relative path, so it is resolved against the current working directory.
df = pd.read_csv('review.csv')

In [2]:
# Drop the columns the classifier does not use. The columns that remain must
# match the fields of Review exactly, because each row is later unpacked with
# Review(**record).
data = df.drop(columns=["datetime", "temperature", "lat", "lon"])

In [3]:
# create a simple function to tokenize messages into distinct words
from typing import Set
import re

def tokenize(text: str) -> Set[str]:
    """Return the distinct words contained in *text*.

    The text is lowercased first; a word is any maximal run of the characters
    a-z, 0-9 and the apostrophe. Because the result is a set, repeated words
    are reported once and word order is not preserved.
    """
    lowered = text.lower()
    return {match.group() for match in re.finditer("[a-z0-9']+", lowered)}

assert tokenize("Data Science is science") == {"data", "science", "is"}

In [4]:
from typing import NamedTuple

class Review(NamedTuple):
    """One labelled review: its text plus a good/bad flag."""
    text: str               # raw review text; the classifier tokenizes it
    review_status: bool     # truthy -> counted as a good review, falsy -> counted as a bad one

In [5]:
# One dict per row of `data`, keyed by column name
df_values = data.to_dict('records')

# Unpack every row dict into a Review; the dict keys must therefore be
# exactly the Review field names.
reviews = [
    Review(**row_fields)
    for row_fields in df_values
]

In [6]:
# As our classifier needs to keep track of tokens, counts, and labels from the training data, we’ll make it a class.
from typing import List, Tuple, Dict, Iterable
import math
from collections import defaultdict

class NaiveBayesClassifier:
    """Naive Bayes classifier that scores a text as a good or a bad review.

    Every review is reduced to its set of distinct tokens (see ``tokenize``).
    Training counts, for each token, how many good and how many bad reviews
    contained it; prediction combines those counts into the probability that
    a new text is a good review.

    Attributes:
        k: smoothing pseudocount added to every token count. Any k > 0 keeps
            each estimated token probability strictly between 0 and 1, so the
            logarithms taken in ``predict`` are always defined.
        tokens: vocabulary, i.e. every token seen during training.
        token_good_reviews_count: token -> number of good reviews containing it.
        token_bad_reviews_count: token -> number of bad reviews containing it.
        good_reviews: number of good reviews seen during training.
        bad_reviews: number of bad reviews seen during training.
    """

    def __init__(self, k: float = 0.5) -> None:
        self.k = k
        self.tokens: Set[str] = set()
        self.token_good_reviews_count: Dict[str, int] = defaultdict(int)
        self.token_bad_reviews_count: Dict[str, int] = defaultdict(int)
        self.good_reviews = self.bad_reviews = 0

    def train(self, reviews: Iterable[Review]) -> None:
        """Update the counts with every review in *reviews*.

        Calling ``train`` again adds to the existing counts; it does not reset
        the model.
        """
        for review in reviews:
            # Count the review itself and pick the per-token table that
            # matches its label.
            if review.review_status:
                self.good_reviews += 1
                token_counts = self.token_good_reviews_count
            else:
                self.bad_reviews += 1
                token_counts = self.token_bad_reviews_count

            # tokenize() returns a set, so each token is counted at most once
            # per review.
            for token in tokenize(review.text):
                self.tokens.add(token)
                token_counts[token] += 1

    def _probabilities(self, token: str) -> Tuple[float, float]:
        """Return the smoothed pair (P(token | good), P(token | bad))."""
        # .get() instead of indexing: indexing a defaultdict inserts the
        # missing key, which would silently grow the count tables every time
        # predict() is called.
        good = self.token_good_reviews_count.get(token, 0)
        bad = self.token_bad_reviews_count.get(token, 0)

        p_token_good = (good + self.k) / (self.good_reviews + 2 * self.k)
        p_token_bad = (bad + self.k) / (self.bad_reviews + 2 * self.k)

        return p_token_good, p_token_bad

    def predict(self, text: str) -> float:
        """Return the estimated probability that *text* is a good review.

        Only tokens from the training vocabulary contribute; words in *text*
        that were never seen during training are ignored. The numbers of good
        and bad training reviews are used only inside the per-token
        probabilities, not as a class prior, so an untrained model returns 0.5.
        """
        text_tokens = tokenize(text)
        log_prob_if_good = log_prob_if_bad = 0.0

        # Walk the whole vocabulary: a token contributes whether it is present
        # in the text (probability p) or absent from it (probability 1 - p).
        for token in self.tokens:
            prob_if_good, prob_if_bad = self._probabilities(token)

            if token in text_tokens:
                log_prob_if_good += math.log(prob_if_good)
                log_prob_if_bad += math.log(prob_if_bad)
            else:
                log_prob_if_good += math.log(1.0 - prob_if_good)
                log_prob_if_bad += math.log(1.0 - prob_if_bad)

        # P(good) = e^g / (e^g + e^b) = 1 / (1 + e^(b - g)).
        # Exponentiating g and b separately underflows to 0.0 for large
        # vocabularies and would end in 0.0 / 0.0; only their difference is
        # exponentiated here.
        try:
            return 1.0 / (1.0 + math.exp(log_prob_if_bad - log_prob_if_good))
        except OverflowError:
            # "bad" is so much more likely that e^(b - g) exceeds the float
            # range; the probability of "good" is indistinguishable from 0.
            return 0.0

In [67]:
# Tiny hand-made training set, used only to sanity-check the classifier.
# It has its own name so that it does not overwrite `reviews` (the reviews
# built from the CSV), which a later cell splits into train and test sets.
toy_reviews = [Review("spam rules", True),
               Review("ham rules", False),
               Review("hello ham", False)]

model = NaiveBayesClassifier(k=0.5)
model.train(toy_reviews)

In [68]:
# Vocabulary: every distinct token across the three toy reviews.
assert {"spam", "ham", "rules", "hello"} == model.tokens

# Label totals: one good review and two bad ones.
assert (model.good_reviews, model.bad_reviews) == (1, 2)

# Per-token counts, split by the label of the review the token appeared in.
assert {"spam": 1, "rules": 1} == model.token_good_reviews_count
assert {"ham": 2, "hello": 1, "rules": 1} == model.token_bad_reviews_count

In [7]:
import random
from typing import TypeVar, List, Tuple
X = TypeVar('X')  # element type of the list being split

def split_data(data: List[X], prob: float) -> Tuple[List[X], List[X]]:
    """Randomly partition *data* into two lists.

    A copy of *data* is shuffled with the global ``random`` generator, so the
    caller's list is left untouched and the result is reproducible after
    ``random.seed``. The first returned list holds ``int(len(data) * prob)``
    elements; the second holds the rest.
    """
    shuffled = data[:]
    random.shuffle(shuffled)
    boundary = int(len(shuffled) * prob)
    first_part, second_part = shuffled[:boundary], shuffled[boundary:]
    return first_part, second_part

random.seed(0)      # fixed seed: makes the shuffle inside split_data, and so the split, reproducible
train_reviews, test_reviews = split_data(reviews, 0.75)   # 75% for training, the rest for testing

# Second classifier (default smoothing k=0.5), trained on the training part only
model2 = NaiveBayesClassifier()
model2.train(train_reviews)

In [9]:
from collections import Counter

# Score every held-out review: pair it with the model's probability that it is good.
predictions = [
    (review, model2.predict(review.text)) for review in test_reviews
]

# A review is predicted "good" when that probability exceeds 0.5.
# Tally each (actual review_status, predicted good) combination.
confusion_matrix = Counter()
confusion_matrix.update(
    (review.review_status, good_probability > 0.5)
    for review, good_probability in predictions
)

print(confusion_matrix)

Counter({(True, True): 38, (False, False): 4})
