In [12]:
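# Dependencies of this notebook. Uncomment and run once if they are missing;
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
# %pip install datasets
# %pip install transformers
# %pip install torch torchvision torchaudio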

In [13]:
from datasets import load_dataset

# Load the "rotten_tomatoes" dataset through `datasets.load_dataset`.
data = load_dataset("rotten_tomatoes")

# Show the available splits (train / validation / test) and their columns.
data

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 8530
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
})

In [14]:
# Peek at training rows 0 and -1 (first and last); the result is a dict that
# maps each column name ('text', 'label') to a list with one entry per row.
data["train"][0, -1]

{'text': ['the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .',
  'things really get weird , though not particularly scary : the movie is all portent and no content .'],
 'label': [1, 0]}

In [15]:
import torch

# Environment check: report the installed PyTorch version and whether the
# MPS backend is both compiled into this build and usable on this machine.
print(torch.__version__)
print("MPS available:", torch.backends.mps.is_available())
print("MPS built:", torch.backends.mps.is_built())

2.8.0
MPS available: True
MPS built: True


In [28]:
from transformers import pipeline

# Hugging Face Hub id of the checkpoint used for both the model and its tokenizer.
model_path = "cardiffnlp/twitter-roberta-base-sentiment-latest"

# Select the accelerator explicitly instead of passing the bare index 0:
# prefer the MPS backend, then CUDA, and only then fall back to the CPU.
# (The previous `0 if mps else -1` silently ran on CPU on CUDA-only machines.)
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

pipe = pipeline(
    model=model_path,
    tokenizer=model_path,
    top_k=None,  # return the score of every label (top_k=1 would keep only the best one)
    device=device,
)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use mps:0


In [93]:
import numpy as np
from tqdm import tqdm
from transformers.pipelines.pt_utils import KeyDataset

# Run inference on every test review and reduce each result to a binary label.
def _to_binary_label(label_scores):
    """Map the pipeline result for one text to 0 (negative) or 1 (positive).

    `label_scores` is an iterable of dicts with "label" and "score" keys.
    Only the "negative" and "positive" scores are compared: any other label is
    ignored, a missing label counts as a score of 0, and a tie resolves to 0.
    """
    score_by_label = {entry["label"]: entry["score"] for entry in label_scores}
    return np.argmax([
        score_by_label.get("negative", 0),
        score_by_label.get("positive", 0),
    ])


test_texts = KeyDataset(data["test"], "text")
y_pred = [
    _to_binary_label(result)
    for result in tqdm(pipe(test_texts), total=len(data["test"]))
]

100%|██████████| 1066/1066 [00:27<00:00, 39.05it/s]


In [94]:
# Ground-truth labels of the test split (1 = positive review, 0 = negative review).
y_true = data["test"]["label"]

In [95]:
# Compare the share of positive reviews in the ground truth with the share the
# model predicted. Each line is labelled so the two cannot be confused.
for source, labels in (("actual", y_true), ("predicted", y_pred)):
    n_positive = sum(labels)
    print(f"Positive reviews ({source}): {n_positive} / {len(labels)} ({100 * n_positive / len(labels):.2f}%)")

Postive reviews: 533 / 1066 (50.00%)
Postive reviews: 457 / 1066 (42.87%)


In [96]:
import pandas

# One row per test review: the true label next to the predicted label.
report1 = pandas.DataFrame({"y_true": y_true, "y_pred": y_pred})
report1

Unnamed: 0,y_true,y_pred
0,1,1
1,1,1
2,1,0
3,1,1
4,1,1
...,...,...
1061,0,0
1062,0,0
1063,0,0
1064,0,0


In [103]:
# Boolean masks over the per-review table, one per (side, class) combination.
actual_pos, actual_neg = report1["y_true"] == 1, report1["y_true"] == 0
pred_pos, pred_neg = report1["y_pred"] == 1, report1["y_pred"] == 0

# Confusion-matrix cells, with class 1 (positive review) as the "positive" class.
TP = int((actual_pos & pred_pos).sum())
FP = int((actual_neg & pred_pos).sum())
TN = int((actual_neg & pred_neg).sum())
FN = int((actual_pos & pred_neg).sum())

# Rows are predictions, columns are ground truth.
pandas.DataFrame(
    {"actual_1": [TP, FN], "actual_0": [FP, TN]},
    index=["pred_1", "pred_0"],
)

Unnamed: 0,actual_1,actual_0
pred_1,384,73
pred_0,149,460


In [None]:
def _safe_div(numerator, denominator):
    """Divide, returning 0.0 instead of raising when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def _class_metrics(tp, fp, fn):
    """Return (precision, recall, f1, support) for a single class.

    tp, fp and fn are that class's true-positive, false-positive and
    false-negative counts. Support is the number of actual members of the
    class (tp + fn). Any ratio with a zero denominator is reported as 0.0.
    """
    precision = _safe_div(tp, tp + fp)
    recall = _safe_div(tp, tp + fn)
    f1 = _safe_div(2 * precision * recall, precision + recall)
    return precision, recall, f1, tp + fn


# Positive class (1): the confusion-matrix counts apply as they are.
TP_pos, FP_pos, TN_pos, FN_pos = TP, FP, TN, FN
precision_pos, recall_pos, f1_pos, support_pos = _class_metrics(TP_pos, FP_pos, FN_pos)

# Negative class (0): same matrix with the roles of the two classes swapped.
TP_neg, FP_neg, TN_neg, FN_neg = TN, FN, TP, FP
precision_neg, recall_neg, f1_neg, support_neg = _class_metrics(TP_neg, FP_neg, FN_neg)

total_support = support_pos + support_neg

# Accuracy: correctly classified reviews over all reviews.
accuracy = _safe_div(TP_pos + TN_pos, TP_pos + TN_pos + FP_pos + FN_pos)

# Macro average: unweighted mean of the per-class metrics.
precision_macro = (precision_pos + precision_neg) / 2
recall_macro    = (recall_pos + recall_neg) / 2
f1_macro        = (f1_pos + f1_neg) / 2

# Weighted average: per-class metrics weighted by each class's support.
precision_weighted = _safe_div(precision_pos * support_pos + precision_neg * support_neg, total_support)
recall_weighted    = _safe_div(recall_pos * support_pos + recall_neg * support_neg, total_support)
f1_weighted        = _safe_div(f1_pos * support_pos + f1_neg * support_neg, total_support)

# Per-class rows first, then the summary rows.
report2 = pandas.DataFrame({
    "precision": [precision_neg, precision_pos],
    "recall":    [recall_neg, recall_pos],
    "f1-score":  [f1_neg, f1_pos],
    "support":   [support_neg, support_pos]
}, index=["Negative Review", "Positive Review"])

report2.loc["accuracy"]     = [accuracy, accuracy, accuracy, total_support]
report2.loc["macro avg"]    = [precision_macro, recall_macro, f1_macro, total_support]
report2.loc["weighted avg"] = [precision_weighted, recall_weighted, f1_weighted, total_support]

report2.round(2)

Unnamed: 0,precision,recall,f1-score,support
Negative Review,0.76,0.86,0.81,533.0
Positive Review,0.84,0.72,0.78,533.0
accuracy,0.79,0.79,0.79,1066.0
macro avg,0.8,0.79,0.79,1066.0
weighted avg,0.8,0.79,0.79,1066.0


In [108]:
from sklearn.metrics import classification_report

# Same metrics computed by scikit-learn, printed for comparison with the
# hand-built report; target_names are given in label order (0, then 1).
print(classification_report(y_true, y_pred, target_names=["Negative Review", "Positive Review"]))

                 precision    recall  f1-score   support

Negative Review       0.76      0.86      0.81       533
Positive Review       0.84      0.72      0.78       533

       accuracy                           0.79      1066
      macro avg       0.80      0.79      0.79      1066
   weighted avg       0.80      0.79      0.79      1066

