**Spam Detection Using the Unigram Model with Hugging Face**

In [6]:
import pandas as pd
from collections import Counter
from transformers import AutoTokenizer

In [7]:
def get_tokenizer(model_name="bert-base-uncased"):
    """Load the pretrained tokenizer registered under ``model_name``.

    Args:
        model_name: Identifier forwarded to ``AutoTokenizer.from_pretrained``;
            defaults to ``"bert-base-uncased"``.

    Returns:
        The object returned by ``AutoTokenizer.from_pretrained``.
    """
    return AutoTokenizer.from_pretrained(model_name)

In [8]:
def build_vocabulary(corpus):
    """Count how often each whitespace-delimited word occurs in ``corpus``.

    Words are lower-cased before counting because
    ``calculate_sentence_probability`` looks words up with ``word.lower()``.
    Counting the raw casing would make every capitalised training word
    unreachable at lookup time and would split one word's count across its
    casings (e.g. "Free" vs "free").

    Args:
        corpus: Iterable of sentence strings.

    Returns:
        collections.Counter mapping each lower-cased word to its total count.
    """
    word_counts = Counter()
    for sentence in corpus:
        # split() with no argument splits on any run of whitespace.
        word_counts.update(word.lower() for word in sentence.split())
    return word_counts

In [9]:
def calculate_word_probabilities(word_counts, total_words):
    """Convert raw word counts into relative frequencies.

    Args:
        word_counts: Mapping of word -> count.
        total_words: Denominator that every count is divided by.

    Returns:
        dict mapping each word to ``count / total_words``.
    """
    probabilities = {}
    for token in word_counts:
        probabilities[token] = word_counts[token] / total_words
    return probabilities

In [10]:
def tokenize_sentence(tokenizer, sentence):
    """Split ``sentence`` into tokens by delegating to ``tokenizer.tokenize``.

    Args:
        tokenizer: Object exposing a ``tokenize(text)`` method.
        sentence: Text to tokenize.

    Returns:
        Whatever ``tokenizer.tokenize(sentence)`` returns.
    """
    tokens = tokenizer.tokenize(sentence)
    return tokens

In [11]:
def load_data(file_path):
    """Read a CSV and return its ``text`` and ``label`` columns as lists.

    Args:
        file_path: Location of the CSV, passed straight to ``pd.read_csv``.
            The file must contain ``text`` and ``label`` columns.

    Returns:
        Tuple ``(texts, labels)`` of two lists in row order.
    """
    frame = pd.read_csv(file_path)
    texts = frame['text'].tolist()
    labels = frame['label'].tolist()
    return texts, labels

In [12]:
def calculate_sentence_probability(sentence, word_probabilities, oov_probability=0):
    """Return the unigram probability of ``sentence``.

    The sentence is split on whitespace, each word is lower-cased and looked
    up in ``word_probabilities``, and the per-word probabilities are
    multiplied together.

    Args:
        sentence: Text to score.
        word_probabilities: Mapping of word -> probability; lookups use the
            lower-cased form of each word.
        oov_probability: Probability assigned to words missing from
            ``word_probabilities``. The default of 0 preserves the existing
            behaviour, where a single unknown word drives the whole product
            to zero; pass a small positive value to keep such sentences
            scorable.

    Returns:
        The product of the per-word probabilities, or 1.0 when the sentence
        contains no words (empty product).
    """
    sentence_probability = 1.0
    for word in sentence.split():
        sentence_probability *= word_probabilities.get(word.lower(), oov_probability)
    return sentence_probability


In [13]:
def classify_message(sentence, word_probabilities, threshold=0.01):
    """Label ``sentence`` as "Spam" or "Ham" from its unigram probability.

    Args:
        sentence: Text to classify.
        word_probabilities: Word -> probability mapping forwarded to
            ``calculate_sentence_probability``.
        threshold: Score that must be strictly exceeded for a "Spam" verdict.

    Returns:
        "Spam" when the sentence probability is greater than ``threshold``,
        otherwise "Ham".
    """
    score = calculate_sentence_probability(sentence, word_probabilities)
    return "Spam" if score > threshold else "Ham"

In [14]:
def train_unigram_model(training_file):
    """Estimate unigram word probabilities from the training CSV.

    Args:
        training_file: Path handed to ``load_data``.

    Returns:
        dict mapping each word to its relative frequency over all training
        texts.
    """
    # NOTE(review): the labels are loaded but never used, so spam and ham
    # texts are pooled into a single distribution — confirm this is intended.
    texts, _labels = load_data(training_file)

    # Count words, then normalise by the total number of word occurrences.
    word_counts = build_vocabulary(texts)
    word_probabilities = calculate_word_probabilities(
        word_counts, sum(word_counts.values())
    )
    return word_probabilities

In [15]:
def test_model(test_file, word_probabilities):
    """Classify every message in the test CSV and print the accuracy.

    A prediction counts as correct when it is "Spam" for a label of 1 or
    "Ham" for a label of 0.

    Args:
        test_file: Path handed to ``load_data``.
        word_probabilities: Word -> probability mapping forwarded to
            ``classify_message``.

    Returns:
        None. The result is reported with ``print``.
    """
    texts, labels = load_data(test_file)

    total = len(texts)
    if total == 0:
        # An empty test set has no accuracy; avoid ZeroDivisionError below.
        print("Accuracy: n/a (no test samples)")
        return

    correct = 0
    for text, true_label in zip(texts, labels):
        predicted_label = classify_message(text, word_probabilities)
        is_spam_hit = predicted_label == "Spam" and true_label == 1
        is_ham_hit = predicted_label == "Ham" and true_label == 0
        if is_spam_hit or is_ham_hit:
            correct += 1

    accuracy = correct / total
    print(f"Accuracy: {accuracy * 100:.2f}%")

In [17]:
# Relative paths to the training and test CSV files.
train_file = "data/spam_detection_train.csv"
test_file = "data/spam_detection_test.csv"
# Fit word probabilities on the training file, then print accuracy on the test file.
word_probabilities = train_unigram_model(train_file)
test_model(test_file, word_probabilities)

Accuracy: 50.00%
