In [1]:
import torch, evaluate

from datasets import load_dataset
from transformers import (AutoTokenizer, AutoModelForSequenceClassification)

In [2]:
# Checkpoint identifier handed to both from_pretrained() calls below
# (presumably a Hugging Face Hub repo id - confirm it is not a local path).
model_id = "dhanishetty/albert-xxlarge-v2-Merged"

In [3]:
# Class index -> sentiment name, handed to from_pretrained() below.
id2label = {0: 'Negative', 1: 'Neutral', 2: 'Positive'}
# Reverse mapping, derived from id2label so the two dictionaries cannot drift apart.
label2id = {name: index for index, name in id2label.items()}

# Build the sequence-classification model and its tokenizer from the same checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)
tokenizer = AutoTokenizer.from_pretrained(model_id, add_prefix_space=True)

In [4]:
# Load only the "test" split of the mteb/tweet_sentiment_extraction dataset.
dataset = load_dataset("mteb/tweet_sentiment_extraction", split="test")

In [5]:
# Bare expression: the notebook renders the Dataset repr (feature names and row count).
dataset

Dataset({
    features: ['id', 'text', 'label', 'label_text'],
    num_rows: 3534
})

In [6]:
# Number of leading test examples to score. The original loop covered
# indices 0..30 inclusive, i.e. 31 examples; raise this to evaluate more rows.
NUM_SAMPLES = 31

print("Trained model predictions:")
print("----------------------------")

predictions_list = []  # predicted class index per example
labels_list = []       # reference class index per example

# Inference only: disabling autograd avoids building a graph on every
# forward pass of the model.
with torch.no_grad():
    for index in range(min(NUM_SAMPLES, len(dataset))):
        # Row indexing fetches a single example instead of indexing a
        # whole column on every iteration.
        example = dataset[index]
        # tokenize text
        inputs = tokenizer.encode(example["text"], return_tensors="pt")
        # compute logits
        logits = model(inputs).logits
        # convert logits to a class index; argmax over the class axis
        # (batch size is 1 here, so .item() yields a plain int)
        predictions = logits.argmax(dim=-1)
        predictions_list.append(predictions.item())
        labels_list.append(example["label"])

print(predictions_list)
print(labels_list)

Trained model predictions:
----------------------------
[1, 2, 0, 2, 2, 2, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 2, 1, 0, 1, 0, 1, 0, 2, 0, 1, 1, 0, 1]
[1, 2, 0, 2, 2, 2, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 1, 0, 2, 1, 1, 2, 0, 1]


## Accuracy

In [7]:
# Accuracy over the examples scored above: compares predicted class indices
# against the reference labels.
accuracy_metric = evaluate.load("accuracy")
results = accuracy_metric.compute(references=labels_list, predictions=predictions_list)
# Single quotes inside the f-string keep this cell valid on Python < 3.12,
# where reusing the enclosing quote character is a SyntaxError.
print(f"Accuracy of the model is {results['accuracy']}")

Accuarcy of the model is 0.8709677419354839


## F1 Metric

In [8]:
f1_metric = evaluate.load("f1")

# Every call below scores the same predictions against the same references;
# only the `average` mode changes.
f1_inputs = {"predictions": predictions_list, "references": labels_list}

# Macro average, rounded to two decimals.
f1_macro = f1_metric.compute(average="macro", **f1_inputs)
print(round(f1_macro['f1'], 2))

# Micro average, rounded to two decimals.
f1_micro = f1_metric.compute(average="micro", **f1_inputs)
print(round(f1_micro['f1'], 2))

# Weighted average, rounded to two decimals.
f1_weighted = f1_metric.compute(average="weighted", **f1_inputs)
print(round(f1_weighted['f1'], 2))

# average=None: no averaging, one F1 score per class (printed as the raw result dict).
results = f1_metric.compute(average=None, **f1_inputs)
print(results)

0.87
0.87
0.87
{'f1': array([0.89655172, 0.8       , 0.92307692])}


## Precision

In [9]:
# Precision of the predictions under each averaging mode.
# Single quotes are used inside the f-strings so the cell also parses on
# Python < 3.12, where reusing the enclosing quote character is a SyntaxError.
precision_metric = evaluate.load("precision")
precision_macro = precision_metric.compute(predictions=predictions_list, references=labels_list, average='macro')
print(f"precision_macro score is {precision_macro['precision']}")

precision_micro = precision_metric.compute(predictions=predictions_list, references=labels_list, average='micro')
print(f"precision_micro score is {precision_micro['precision']}")

precision_weighted = precision_metric.compute(predictions=predictions_list, references=labels_list, average='weighted')
print(f"precision_weighted score is {precision_weighted['precision']}")

# average=None returns one precision value per class; the label now says so
# (it previously repeated "precision_weighted", a copy-paste error).
precision_none = precision_metric.compute(predictions=predictions_list, references=labels_list, average=None)
print(f"precision_None score is {precision_none}")

precision_macro score is 0.888888888888889
precision_micro score is 0.8709677419354839
precision_weighted score is 0.875268817204301
precision_weighted score is {'precision': array([0.86666667, 0.8       , 1.        ])}


## Recall

In [10]:
# Recall of the predictions under each averaging mode.
# Single quotes are used inside the f-strings so the cell also parses on
# Python < 3.12, where reusing the enclosing quote character is a SyntaxError.
recall_metric = evaluate.load('recall')
recall_macro = recall_metric.compute(predictions=predictions_list, references=labels_list, average='macro')
print(f"recall_macro score is {recall_macro['recall']}")

recall_micro = recall_metric.compute(predictions=predictions_list, references=labels_list, average='micro')
print(f"recall_micro score is {recall_micro['recall']}")

recall_weighted = recall_metric.compute(predictions=predictions_list, references=labels_list, average='weighted')
print(f"recall_weighted score is {recall_weighted['recall']}")

# average=None returns one recall value per class; the whole result dict is printed.
recall_none = recall_metric.compute(predictions=predictions_list, references=labels_list, average=None)
print(f"recall_None score is {recall_none}")

recall_macro score is 0.861904761904762
recall_micro score is 0.8709677419354839
recall_weighted score is 0.8709677419354839
recall_None score is {'recall': array([0.92857143, 0.8       , 0.85714286])}
