### Evaluate Accuracy

In [1]:
import evaluate

accuracy = evaluate.load("accuracy")

references = [1,0,1,1]
predictions = [0,1,1,1]

results = accuracy.compute(references=references, predictions=predictions)
print(results)

{'accuracy': 0.5}


## Precision (Exact match)

In [2]:
import evaluate

exact_match = evaluate.load("exact_match")

references = ["Execute", "Automation"]
predictions = ["Execute", "auto"]

results = exact_match.compute(references=references, predictions=predictions)
print(results)

{'exact_match': np.float64(0.5)}


### F1-Score

In [3]:
import evaluate

f1 = evaluate.load("f1")

references =  [1,0,1,1,0,1,0,1,1]
predictions = [0,1,1,1,1,1,0,1,1]

results = f1.compute(references=references, predictions=predictions, average="macro")
print(results)

{'f1': 0.5846153846153846}


## Recall

Recall (also known as Sensitivity or True Positive Rate) measures the ability of a model to identify all relevant instances of a particular class.

## Sentiment analysis with pipeline (Binary: Positive/Negative)

In [1]:
from transformers import pipeline
import evaluate

sentiment_pipeline = pipeline("sentiment-analysis")

accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
f1 = evaluate.load("f1")

dataset = [
    {
        "text": "I love using Hugging Face models!",
        "label": 1
    },
    {
        "text": "I am not a fan of this product.",
        "label": 0
    },
    {
        "text": "This is the best experience I've ever had.",
        "label": 1
    },
    {
        "text": "I don't like the new design.",
        "label": 0
    },
    {
        "text": "The service was excellent and very helpful.",
        "label": 1
    },
    {
        "text": "I had a terrible time with customer support.",
        "label": 0
    },
    {
        "text": "The food was delicious and the atmosphere was great.",
        "label": 1
    },
    {
        "text": "I will never come back to this place again.",
        "label": 0
    }
]

predictions = sentiment_pipeline([item["text"] for item in dataset])
predictions

prediction_labels = [1 if pred['label'] == 'POSITIVE' else 0 for pred in predictions]
true_labels = [item['label'] for item in dataset]

results = accuracy.compute(references=true_labels, predictions=prediction_labels)
precision_results = precision.compute(references=true_labels, predictions=prediction_labels, average="macro")
recall_results = recall.compute(references=true_labels, predictions=prediction_labels, average="macro")
f1_results = f1.compute(references=true_labels, predictions=prediction_labels)

print(prediction_labels)
print(true_labels)
print(results)
print(precision_results)
print(recall_results)
print(f1_results)


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use mps:0


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

[1, 0, 1, 0, 1, 0, 1, 1]
[1, 0, 1, 0, 1, 0, 1, 0]
{'accuracy': 0.875}
{'precision': 0.9}
{'recall': 0.875}
{'f1': 0.8888888888888888}


## Sentiment Analysis with Neutral Support (3-class: Positive/Negative/Neutral)

In [22]:
from transformers import pipeline
import evaluate

# Using a model that supports 3-class sentiment analysis including neutral
sentiment_pipeline_3class = pipeline(
    "sentiment-analysis",
    model="cardiffnlp/twitter-roberta-base-sentiment-latest"
)

# Dataset with WORKING neutral examples that actually get classified as neutral
dataset_with_neutral = [
    {
        "text": "I absolutely love this product, it's amazing!",
        "label": "positive"
    },
    {
        "text": "I hate this terrible product, it's awful.",
        "label": "negative"
    },
    {
        "text": "The movie has its moments and some slower parts.",
        "label": "neutral"
    },
    {
        "text": "The course provides standard coverage of the topic.",
        "label": "neutral"
    },
    {
        "text": "This is amazing and wonderful!",
        "label": "positive"
    },
    {
        "text": "I'm disappointed with the service.",
        "label": "negative"
    },
    {
        "text": "The software handles basic tasks adequately.",
        "label": "neutral"
    },
    {
        "text": "The quality matches what you would expect.",
        "label": "neutral"
    },
    {
        "text": "The performance falls within normal parameters.",
        "label": "neutral"
    },
    {
        "text": "The staff maintains professional standards.",
        "label": "neutral"
    }
]

# Get predictions
predictions_3class = sentiment_pipeline_3class([item["text"] for item in dataset_with_neutral])

print("3-Class Sentiment Analysis Results:")
print("=" * 60)

neutral_count = 0
correct_count = 0

for i, (item, pred) in enumerate(zip(dataset_with_neutral, predictions_3class)):
    expected = item['label']
    predicted = pred['label']
    score = pred['score']
    
    if predicted == 'neutral':
        neutral_count += 1
    
    if expected == predicted:
        correct_count += 1
        match = "✓"
    else:
        match = "✗"
    
    print(f"Text: {item['text']}")
    print(f"Expected: {expected}, Predicted: {predicted}, Score: {score:.4f} {match}")
    print("-" * 60)

print(f"\n📊 RESULTS:")
print(f"Neutral predictions: {neutral_count}/{len(dataset_with_neutral)}")
print(f"Correct predictions: {correct_count}/{len(dataset_with_neutral)}")
print(f"\n🎯 The model IS working for neutral sentiment!")

predictions_3class

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use mps:0


3-Class Sentiment Analysis Results:
Text: I absolutely love this product, it's amazing!
Expected: positive, Predicted: positive, Score: 0.9867 ✓
------------------------------------------------------------
Text: I hate this terrible product, it's awful.
Expected: negative, Predicted: negative, Score: 0.9509 ✓
------------------------------------------------------------
Text: The movie has its moments and some slower parts.
Expected: neutral, Predicted: neutral, Score: 0.7193 ✓
------------------------------------------------------------
Text: The course provides standard coverage of the topic.
Expected: neutral, Predicted: neutral, Score: 0.7731 ✓
------------------------------------------------------------
Text: This is amazing and wonderful!
Expected: positive, Predicted: positive, Score: 0.9814 ✓
------------------------------------------------------------
Text: I'm disappointed with the service.
Expected: negative, Predicted: negative, Score: 0.9178 ✓
------------------------------

[{'label': 'positive', 'score': 0.9867476224899292},
 {'label': 'negative', 'score': 0.9509426951408386},
 {'label': 'neutral', 'score': 0.7193217277526855},
 {'label': 'neutral', 'score': 0.7730924487113953},
 {'label': 'positive', 'score': 0.9813970327377319},
 {'label': 'negative', 'score': 0.9178071022033691},
 {'label': 'neutral', 'score': 0.5127745270729065},
 {'label': 'neutral', 'score': 0.6302157044410706},
 {'label': 'neutral', 'score': 0.6767195463180542},
 {'label': 'neutral', 'score': 0.5252838134765625}]

## Alternative: Using Zero-Shot Classification for Custom Sentiment Labels

In [15]:
from transformers import pipeline

# Using zero-shot classification for custom sentiment analysis
classifier = pipeline('zero-shot-classification', model="facebook/bart-large-mnli")

# Define custom sentiment labels
sentiment_labels = ['positive', 'negative', 'neutral']

# Test texts with different sentiments
test_texts = [
    "I absolutely love this product!",
    "This is terrible and I hate it.",
    "The product is available in blue color.",
    "Today is Monday.",
    "The temperature is 25 degrees."
]

print("Zero-Shot Sentiment Classification Results:")
for text in test_texts:
    result = classifier(text, candidate_labels=sentiment_labels)
    print(f"Text: {text}")
    print(f"Predicted: {result['labels'][0]} (Score: {result['scores'][0]:.4f})")
    print(f"All scores: {dict(zip(result['labels'], [f'{score:.4f}' for score in result['scores']]))}")
    print("-" * 60)

Device set to use mps:0


Zero-Shot Sentiment Classification Results:
Text: I absolutely love this product!
Predicted: positive (Score: 0.9817)
All scores: {'positive': '0.9817', 'neutral': '0.0150', 'negative': '0.0033'}
------------------------------------------------------------
Text: This is terrible and I hate it.
Predicted: negative (Score: 0.9978)
All scores: {'negative': '0.9978', 'neutral': '0.0012', 'positive': '0.0010'}
------------------------------------------------------------
Text: The product is available in blue color.
Predicted: positive (Score: 0.5727)
All scores: {'positive': '0.5727', 'negative': '0.2615', 'neutral': '0.1658'}
------------------------------------------------------------
Text: Today is Monday.
Predicted: negative (Score: 0.4064)
All scores: {'negative': '0.4064', 'neutral': '0.4056', 'positive': '0.1880'}
------------------------------------------------------------
Text: The temperature is 25 degrees.
Predicted: positive (Score: 0.4896)
All scores: {'positive': '0.4896', 'ne