# Imports

In [1]:
import pandas as pd
import numpy as np
from datasets import load_dataset
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Load the data and map the sentiment scores to their classes

In [2]:
# Load the 'sst' dataset with `datasets.load_dataset`; the cells below read its
# 'train', 'test' and 'validation' splits.
# NOTE(review): trust_remote_code=True presumably allows the dataset's own
# loading script to execute locally - confirm this is required and acceptable.
dataset = load_dataset('sst', trust_remote_code=True)

def score_to_class(score):
    """Map a sentiment score to one of five ordinal classes (0-4).

    Each upper bound is inclusive: a score up to 0.2 maps to 0, up to 0.4
    to 1, up to 0.6 to 2, up to 0.8 to 3, and anything else maps to 4.
    """
    upper_bounds = (0.2, 0.4, 0.6, 0.8)
    # Return the index of the first bin whose upper bound covers the score
    for class_id, bound in enumerate(upper_bounds):
        if score <= bound:
            return class_id
    # Nothing matched: the score belongs to the top class
    return len(upper_bounds)


def _prepare_split(splits, split_name):
    """Return one dataset split as a DataFrame ready for classification.

    Args:
        splits: Mapping-like dataset object indexed by split name; each split
            must provide ``to_pandas()`` with 'sentence' and 'label' columns.
        split_name: Name of the split to extract (e.g. 'train').

    Returns:
        A DataFrame with two columns: 'sentence' (lower-cased text) and
        'label' (class id produced by ``score_to_class``).
    """
    raw = splits[split_name].to_pandas()[['sentence', 'label']]
    # Build the cleaned frame in one expression instead of overwriting
    # columns of the selected subset one at a time.
    return raw.assign(
        sentence=raw['sentence'].str.lower(),
        label=raw['label'].apply(score_to_class),
    )


# Same preprocessing for every split, so the three frames stay comparable
train_data = _prepare_split(dataset, 'train')
test_data = _prepare_split(dataset, 'test')
val_data = _prepare_split(dataset, 'validation')

# Number of training sentences in class 2 (shown as the cell output)
np.sum(train_data['label'] == 2)

np.int64(1624)

# Part 1: Naïve Bayes

In [3]:
def train_naive_bayes(data: pd.DataFrame):
    """Fit a multinomial Naive Bayes model with add-one (Laplace) smoothing.

    Sentences are tokenized on whitespace with ``str.split()``; no other
    normalization is applied here.

    Args:
        data: Frame with a 'sentence' column (strings) and a 'label' column
            (class ids).

    Returns:
        A tuple ``(log_prior, log_likelihood, vocab)``:
        - ``log_prior``: dict mapping class -> log p(c), where p(c) is the
          fraction of rows carrying that label.
        - ``log_likelihood``: dict mapping (word, class) -> log p(word | class)
          for every vocabulary word and every class.
        - ``vocab``: set of all words seen in ``data``.
    """
    # Number of documents and the distinct classes present in the data
    n = len(data)
    classes = data['label'].unique().tolist()

    # Vocabulary, per-(word, class) counts and per-class total word counts.
    # Every class starts at 0 so that a class whose sentences contain no
    # tokens (e.g. only empty strings) does not raise KeyError below.
    vocab = set()
    vocab_counts = {}
    class_word_counts = {c: 0 for c in classes}
    for sentence, c in zip(data['sentence'].tolist(), data['label'].tolist()):
        for word in sentence.split():
            vocab_counts[(word, c)] = vocab_counts.get((word, c), 0) + 1
            class_word_counts[c] = class_word_counts.get(c, 0) + 1
            vocab.add(word)

    vocab_size = len(vocab)

    # Log prior p(c) and smoothed log likelihood p(w|c) for every word/class
    log_prior = {}
    log_likelihood = {}
    for c in classes:
        log_prior[c] = np.log(np.sum(data['label'] == c) / n)
        # Add-one smoothing denominator: words in the class + vocabulary size
        denominator = class_word_counts[c] + vocab_size
        for word in vocab:
            log_likelihood[(word, c)] = np.log(
                (vocab_counts.get((word, c), 0) + 1) / denominator)

    return log_prior, log_likelihood, vocab


def test_naive_bayes(test_doc: str, log_prior: dict, log_likelihood: dict, classes: list[int], vocab: set):
    # Initialize all scores to 0
    score = [0] * len(classes)
    # Loop over each class and calculate the score of this class given the test doc
    for c_i, c in enumerate(classes):
        score[c_i] = log_prior[c]  # Initialize score to log prior probability
        # Add likelihood of each word in the sentence given current class (if it was in the vocabulary)
        for word in test_doc.lower().split():
            if word in vocab:
                score[c_i] += log_likelihood[(word, c)]
    # Return class with the max score
    return classes[np.argmax(score)]

# Computing Naïve Bayes Accuracy

In [4]:
# Fit the hand-written model on the training split
log_prior, log_likelihood, vocab = train_naive_bayes(train_data)

# Class ids as they occur in the training labels
classes = train_data['label'].unique().tolist()

# Classify every test sentence, then count how many predictions match
predicted_labels = [
    test_naive_bayes(doc, log_prior, log_likelihood, classes, vocab)
    for doc in test_data['sentence']
]
total_predictions = len(test_data)
correct_predictions = sum(
    1
    for predicted, actual in zip(predicted_labels, test_data['label'])
    if predicted == actual
)

accuracy = correct_predictions / total_predictions
print(f"Accuracy of Naive Bayes classifier: {accuracy:.12f}")

Accuracy of Naive Bayes classifier: 0.401357466063


# Computing scikit-learn Accuracy

In [5]:
# Prepare training and test datasets (sentence and label)
# Prepare training and test datasets (sentence and label)
X_train, y_train = train_data['sentence'], train_data['label']
X_test, y_test = test_data['sentence'], test_data['label']

# Bag-of-words counts fed into a multinomial Naive Bayes classifier.
# NOTE: CountVectorizer applies its own tokenization rather than plain
# whitespace splitting, so its vocabulary need not match the custom model's.
pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB(alpha=1.0))  # alpha=1 is Laplace smoothing, same as in the custom model
])

# Train the pipeline on the training data
pipeline.fit(X_train, y_train)

# Make predictions on the test set
y_pred = pipeline.predict(X_test)

# accuracy_score returns a fraction in [0, 1], so it is printed without a
# percent sign, in the same format as the custom Naive Bayes accuracy.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy (scikit-learn): {accuracy:.12f}")

Accuracy (scikit-learn): 0.409049773756%
