# Sentiment Analysis of text documents using TbNB

This example shows how TbNB can be used to classify documents by sentiment using a Bag-of-Words approach. The demo encodes the features as a binary document-term sparse matrix and walks through the correct procedure for training and using an (iterative) Threshold-Based Naive Bayes model.

#### Setup

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from models.tbnb import TbNB
import numpy as np 
from datasets import load_dataset
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, ComplementNB, BernoulliNB
from sklearn.pipeline import Pipeline
import time

ImportError: cannot import name 'load_dataset' from 'datasets' (unknown location)

### Import data and perform train/test split

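Here we'll employ a simple sentiment dataset containing various reviews (Yelp polarity). For training we draw a shuffled subset of 100,000 reviews from the training split, and we evaluate on the full test split. Each split is then separated into the independent variable (the review text) and the dependent variable (the sentiment label).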

In [None]:
# Load the "yelp_polarity" dataset and pick the splits used in this demo.
dataset = load_dataset("yelp_polarity")
# Shuffle the training split with a fixed seed (42), then keep the first
# 100,000 shuffled examples as the training subset.
train = dataset["train"].shuffle(seed=42).select(range(100_000))
# The test split is used in full, without shuffling or subsetting.
test = dataset["test"]


In [None]:
# Separate each split into its raw review texts and its labels; the labels are
# wrapped in NumPy arrays, the texts are kept as returned by the dataset.
X_train_text, y_train = train["text"], np.array(train["label"])
X_test_text, y_test = test["text"], np.array(test["label"])


#### Text Vectorization

We leverage scikit-learn's CountVectorizer to remove stop words and to create a BoW matrix signifying word presence/absence within each document. The vectorizer is fitted on the training data and is then used to transform both the training and the test data. As the output below indicates, CountVectorizer's output type defaults to a SciPy sparse matrix, a format especially well suited to BoW data, which allows for extremely fast computations. However, TbNB also accepts other formats for X, such as a numpy.ndarray or a pandas DataFrame. These formats are converted internally into sparse matrices.

In [None]:
# binary=True makes the vectorizer emit word presence/absence (every non-zero
# count is stored as 1) instead of raw term counts, which is the binary
# document-term matrix this demo sets out to use.
# stop_words="english" drops scikit-learn's built-in English stop-word list.
vectorizer = CountVectorizer(binary=True, stop_words="english")
# Learn the vocabulary from the training texts only, then reuse that same
# vocabulary to encode the test texts.
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

In [None]:
# Display the container type returned by the vectorizer for the training data.
type(X_train)

#### Initialize and train the TbNB Model

We instantiate the model with iterative=True, which means that calling fit will automatically estimate class priors and employ the iterative optimization algorithm described in Romano, M., Zammarchi, G., & Conversano, C. (2024). The .fit() method returns the fitted model, so .predict() can be chained directly onto it using dot notation.


In [None]:
# Create the classifier with the iterative option switched on.
model = TbNB(iterative=True)
# Fit on the training matrix and labels, then call predict() on the object
# that fit() returns to label the test matrix in a single chained expression.
y_pred = model.fit(X_train, y_train).predict(X_test)
print("Predicted labels:", y_pred)

#### Evaluate Performance and post-hoc analysis

We can evaluate the model's accuracy and other metrics using standard scikit-learn functions, as well as inspect learned attributes using the TbNB class.

In [None]:
# Per-class precision/recall/F1 summary; the two display names are applied to
# the labels in the order given.
print(
    classification_report(
        y_true=y_test,
        y_pred=y_pred,
        target_names=["Negative", "Positive"],
    )
)
# Raw confusion matrix of true labels against predicted labels.
print("Confusion matrix:\n", confusion_matrix(y_true=y_test, y_pred=y_pred))


### Inspect
Once the model is trained, its learned parameters can be accessed by name using dot notation.

In [None]:
# Print a few attributes of the fitted model.
# NOTE(review): the first label mentions "tau_" while the attribute actually
# read is `threshold_` — confirm which name the TbNB class exposes.
print("Decision threshold (tau_):", model.threshold_)
print("Number of features (words):", model.n_features_in_)
# Heading and the first ten entries of lambda_scores_ on separate lines.
print("Review scores (lambda_scores_):", model.lambda_scores_[:10], sep="\n")


#### Benchmark



In [None]:
results = []

def benchmark_model(name, clf, X_train, X_test, y_train, y_test, variant):
    """Fit and evaluate one classifier and record the outcome in ``results``.

    Args:
        name: Label stored in the "Model" column of the record.
        clf: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training feature matrix passed to ``clf.fit``.
        X_test: Test feature matrix passed to ``clf.predict``.
        y_train: Training labels passed to ``clf.fit``.
        y_test: True test labels the predictions are scored against.
        variant: Label stored in the "Vectorizer" column of the record.

    Returns:
        None. One dict holding accuracy, binary F1 and the two elapsed times
        (in seconds) is appended to the module-level ``results`` list.
    """
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # elapsed time; time.time() follows the wall clock, which can be adjusted
    # while a measurement is running and may have coarser resolution.
    start = time.perf_counter()
    clf.fit(X_train, y_train)
    train_time = time.perf_counter() - start

    start = time.perf_counter()
    preds = clf.predict(X_test)
    pred_time = time.perf_counter() - start

    acc = accuracy_score(y_test, preds)
    f1 = f1_score(y_test, preds, average="binary")

    results.append({
        "Vectorizer": variant,
        "Model": name,
        "Accuracy": acc,
        "F1-score": f1,
        "Train Time (s)": train_time,
        "Predict Time (s)": pred_time,
    })


# Two bag-of-words configurations built on raw term counts: single words only,
# and single words plus two-word sequences (ngram_range=(1, 2)).
vectorizers = {
    "Simple": CountVectorizer(binary=False, stop_words="english"),
    "N_grams": CountVectorizer(binary=False, stop_words="english", ngram_range=(1, 2)),
}

for variant, vectorizer in vectorizers.items():

    print("\n==============================")
    print(f"  Running Vectorizer: {variant}")
    print("==============================")

    # Fit the vocabulary on the training texts and reuse it for the test texts.
    X_train_vec = vectorizer.fit_transform(X_train_text)
    X_test_vec = vectorizer.transform(X_test_text)

    # Fresh, unfitted estimators are created for every vectorizer variant so
    # that no fitted state carries over from one feature space to the next.
    classifiers = [
        ("TbNB", TbNB(iterative=False)),
        ("iTbNB", TbNB(iterative=True)),
        ("BernoulliNB", BernoulliNB()),
        ("MultinomialNB", MultinomialNB()),
        ("ComplementNB", ComplementNB()),
    ]

    # Benchmark the estimators one after another, in the order listed above.
    for model_name, classifier in classifiers:
        benchmark_model(
            model_name,
            classifier,
            X_train_vec,
            X_test_vec,
            y_train,
            y_test,
            variant,
        )


# Tabulate all records: grouped by vectorizer, best accuracy first within each.
df = pd.DataFrame(results).sort_values(by=["Vectorizer", "Accuracy"], ascending=[True, False])
print("\n\n=== RISULTATI FINALI ===")
print(df.to_string(index=False))


In [None]:
import time
from preprocessing.nltk_pipeline import TextPreprocessor
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, ComplementNB, BernoulliNB

results = []

def benchmark_model(name, pipeline, X_train, X_test, y_train, y_test, variant):
    """Fit and evaluate one estimator and record the outcome in ``results``.

    Args:
        name: Label stored in the "Model" column of the record.
        pipeline: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training feature matrix passed to ``pipeline.fit``.
        X_test: Test feature matrix passed to ``pipeline.predict``.
        y_train: Training labels passed to ``pipeline.fit``.
        y_test: True test labels the predictions are scored against.
        variant: Label stored in the "Vectorizer" column of the record.

    Returns:
        None. One dict holding accuracy, binary F1 and the two elapsed times
        (in seconds) is appended to the module-level ``results`` list.
    """
    # Train. perf_counter() is a monotonic, high-resolution clock meant for
    # measuring elapsed time; time.time() follows the wall clock, which can be
    # adjusted while a measurement is running.
    start = time.perf_counter()
    pipeline.fit(X_train, y_train)
    train_time = time.perf_counter() - start

    # Predict
    start = time.perf_counter()
    preds = pipeline.predict(X_test)
    pred_time = time.perf_counter() - start

    # Metrics
    acc = accuracy_score(y_test, preds)
    f1 = f1_score(y_test, preds, average="binary")

    results.append({
        "Vectorizer": variant,
        "Model": name,
        "Accuracy": acc,
        "F1-score": f1,
        "Train Time (s)": train_time,
        "Predict Time (s)": pred_time,
    })

# Count-based vectorizer variants for the pre-cleaned text. No stop_words
# argument is passed here; presumably stop-word removal is left to the
# preprocessor configured below (remove_sw=True) — TODO confirm.
vectorizers = {
    "Simple": CountVectorizer(binary=False),
    "N_grams": CountVectorizer(binary=False, ngram_range=(1,2)),
}


# Project-local text cleaner imported from preprocessing.nltk_pipeline.
# NOTE(review): the meaning of each flag is inferred from its name only
# (lower-casing, contraction expansion, punctuation and stop-word removal,
# stemming; HTML/URL stripping switched off) — confirm against that module.
preprocessor = TextPreprocessor(
    language="english",
    remove_html=False,
    remove_urls=False,
    lower=True,
    expand_contr=True,
    remove_punct=True,
    remove_sw=True,
    stem=True
)

# fit_transform() is called on the training texts and transform() on the test
# texts, mirroring the fit-on-train / apply-to-test pattern used for the
# vectorizers.
X_train_clean = preprocessor.fit_transform(X_train_text)
X_test_clean  = preprocessor.transform(X_test_text)


for variant, vectorizer in vectorizers.items():

    print("\n==============================")
    print(f"  Running Vectorizer: {variant}")
    print("==============================")

    # Fit the vocabulary on the cleaned training texts and reuse it for the
    # cleaned test texts.
    X_train_vec = vectorizer.fit_transform(X_train_clean)
    X_test_vec = vectorizer.transform(X_test_clean)

    # Fresh, unfitted estimators are created for every vectorizer variant so
    # that no fitted state carries over from one feature space to the next.
    classifiers = [
        ("TbNB", TbNB(iterative=False)),
        ("iTbNB", TbNB(iterative=True)),
        ("BernoulliNB", BernoulliNB()),
        ("MultinomialNB", MultinomialNB()),
        ("ComplementNB", ComplementNB()),
    ]

    # Benchmark the estimators one after another, in the order listed above.
    for model_name, classifier in classifiers:
        benchmark_model(model_name, classifier,
                        X_train_vec, X_test_vec, y_train, y_test, variant)



# Tabulate all records: grouped by vectorizer, best accuracy first within each.
df = pd.DataFrame(results).sort_values(by=["Vectorizer", "Accuracy"], ascending=[True, False])
print("\n\n=== RISULTATI FINALI ===")
print(df.to_string(index=False))
