In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from TbNB import TbNB  
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from preprocessing.nltk_pipeline import TextPreprocessor
from utils.benchmarking import run_experiment, evaluate_model
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
import pandas as pd


## Data Loading

We can easily load the data with the `datasets` package, which connects directly to the Hugging Face Hub. The `enelpol/booking_com_reviews` dataset contains 516k samples and two columns: one for positive comments and one for negative ones (which together make up a single review). The data is first split into training and test sets, so that both halves of a review stay in the same set; the two columns are then stacked into a single text column with a binary label (1 = positive, 0 = negative).

In [2]:
# Fetch the "train" split of the Booking.com reviews and materialise it as a
# pandas DataFrame.
df = load_dataset("enelpol/booking_com_reviews", split="train").to_pandas()

# Hold out 20% of the rows *before* unpacking them, which keeps the positive
# and the negative half of one original row in the same partition.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)


def _labelled(frame, column, label):
    """Return `frame[column]` as a `review_text` column next to a constant `label`."""
    return pd.DataFrame({"review_text": frame[column], "label": label})


# Label convention: 1 = text taken from Positive_Review, 0 = from Negative_Review.
# NOTE(review): rows without a positive/negative comment may hold placeholder
# text in this dataset -- TODO confirm, and filter them out if so.
train_pos = _labelled(train_df, "Positive_Review", 1)
train_neg = _labelled(train_df, "Negative_Review", 0)
test_pos = _labelled(test_df, "Positive_Review", 1)
test_neg = _labelled(test_df, "Negative_Review", 0)

# Stack both halves into one long frame, then shuffle with a fixed seed so the
# two classes are interleaved reproducibly.
train_df = (
    pd.concat([train_pos, train_neg], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
test_df = (
    pd.concat([test_pos, test_neg], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Plain Python lists of texts and labels for the downstream cells.
train_reviews, train_labels = (
    train_df["review_text"].tolist(),
    train_df["label"].tolist(),
)
test_reviews, test_labels = (
    test_df["review_text"].tolist(),
    test_df["label"].tolist(),
)


## Data Preprocessing

The custom-built `TextPreprocessor` class makes data preprocessing quick and straightforward. Its NLP cleaning operations include lowercasing, stop-word and punctuation removal, emoji conversion and stemming. Disabling superfluous operations (here, HTML and URL removal) may speed up the process.

In [3]:
# HTML and URL stripping are switched off for this run.
preprocessor = TextPreprocessor(remove_html=False, remove_urls=False)

# The preprocessor is fitted on the training texts only.
preprocessor.fit(X=train_reviews)

# Apply the fitted preprocessor to both partitions (train first, then test).
# NOTE: the results overwrite the raw lists, so running this cell a second
# time without re-running the loading cell feeds already-processed text back in.
train_reviews, test_reviews = (
    preprocessor.transform(texts) for texts in (train_reviews, test_reviews)
)

## Data Vectorization

Scikit-learn-style models only accept data that is already vectorized, so we use scikit-learn's `CountVectorizer` to transform the text into a bag-of-words (BoW) matrix. We consider both single words (unigrams) and n-grams of length 2 (bigrams) to better capture the context within a sentence.

In [4]:
# Bag-of-words term counts (binary=False) over unigrams and bigrams.
vectorizer = CountVectorizer(ngram_range=(1, 2), binary=False)

# The vocabulary is learnt from the training reviews only; the test reviews
# are projected onto that same vocabulary.
X_train_vec = vectorizer.fit_transform(train_reviews)
X_test_vec = vectorizer.transform(test_reviews)

results_it = []  # NOTE(review): not referenced anywhere else in this cell

# Run the same experiment twice, passing False and then True as the first
# argument (presumably the "iterative" switch, judging by the "Iterative"
# column of the printed table -- confirm in utils.benchmarking).
res_false, res_true = (
    run_experiment(flag, X_train_vec, train_labels, X_test_vec, test_labels)
    for flag in (False, True)
)

# One row per run, in the order False, True.
print(pd.DataFrame([res_false, res_true]))


   Iterative  Accuracy  F1-score  Train Time (s)  Predict Time (s)  Iterations
0      False  0.929994  0.932033        2.305375          0.034455           0
1       True  0.935607  0.936392        5.731552          0.058477           6


In [5]:
# Candidates to benchmark, as (display name, freshly constructed estimator) pairs.
models = [
    ("TbNB (non-iterative)", TbNB(iterative=False)),
    ("TbNB (iterative)", TbNB(iterative=True)),
    ("MultinomialNB", MultinomialNB()),
    ("BernoulliNB", BernoulliNB()),
]

# Evaluate every candidate on the same vectorized train/test data; each call
# contributes one record to `results`, in the order of `models`.
results = [
    evaluate_model(
        model,
        X_train_vec, train_labels,
        X_test_vec, test_labels,
        name=name,
    )
    for name, model in models
]

# Tabulate the records, one row per model.
df_results = pd.DataFrame(results)
print(df_results)


                  Model  Accuracy  F1-score  Train Time (s)  Predict Time (s)
0  TbNB (non-iterative)  0.929994  0.932033        2.682678          0.036560
1      TbNB (iterative)  0.935607  0.936392        4.210131          0.339915
2         MultinomialNB  0.936325  0.937131        0.606347          0.119346
3           BernoulliNB  0.933067  0.932249        0.543643          0.163990


In [6]:

# Fit a fresh iterative TbNB so that its per-iteration decisions can be inspected.
clf = TbNB(iterative=True)
clf.fit(X_train_vec, train_labels)

# One line per recorded decision: iteration number, the (start, end) range and
# the tau value, each rounded to three decimals.
for decision in clf.decisions_:
    header = f"Iter {decision.iteration}"
    details = f"range=({decision.start:.3f}, {decision.end:.3f}), tau={decision.tau:.3f}"
    print(header, details)


Iter 1 range=(-6.246, 1.998), tau=-1.361
Iter 2 range=(-2.475, -0.247), tau=-1.478
Iter 3 range=(-1.692, -1.264), tau=-1.561
Iter 4 range=(-1.604, -1.518), tau=-1.562
Iter 5 range=(-1.576, -1.549), tau=-1.573
Iter 6 range=(-1.576, -1.570), tau=-1.571


In [7]:
# Show the raw Decision records: besides iteration/start/end/tau they also
# carry the x_max_pos, x_max_neg and direction fields.
clf.decisions_

[Decision(iteration=1, start=np.float64(-6.246217114233314), end=np.float64(1.9975913158401266), tau=np.float64(-1.3609973038194232), x_max_pos=np.float64(1.8655583479911027), x_max_neg=np.float64(-3.7293386646112925), direction='r'),
 Decision(iteration=2, start=np.float64(-2.4749346804325243), end=np.float64(-0.24705597433169246), tau=np.float64(-1.4780760401651851), x_max_pos=np.float64(-0.8514154631738602), x_max_neg=np.float64(-1.904026823814093), direction='r'),
 Decision(iteration=3, start=np.float64(-1.6918429767233643), end=np.float64(-1.26409030310119), tau=np.float64(-1.5612478161079857), x_max_pos=np.float64(-1.3283174312726878), x_max_neg=np.float64(-1.3227510801644913), direction='l'),
 Decision(iteration=4, start=np.float64(-1.6044822827943053), end=np.float64(-1.5180760485661766), tau=np.float64(-1.5624468174941346), x_max_pos=np.float64(-1.536066535772834), x_max_neg=np.float64(-1.5425534902944653), direction='r'),
 Decision(iteration=5, start=np.float64(-1.57584641823