## Prerequisites 

In [5]:
#pip install transformers datasets torch scikit-learn huggingface_hub langdetect accelerate tf-keras

import torch
import pandas as pd
import numpy as np

# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print("Running on:", device)


Running on: cpu


## Dataset

In [6]:
from datasets import load_dataset, DatasetDict, Dataset

# Load the three-class sentiment dataset by its Hugging Face repository id.
dataset = load_dataset("Sp1786/multiclass-sentiment-analysis-dataset")

# Bind each predefined split to its own name for the cells below.
train_ds, val_ds, test_ds = (
    dataset[split_name] for split_name in ("train", "validation", "test")
)

#DOWNSAMPLE DATASET FOR FASTER EXPERIMENTATION
# train_df = pd.DataFrame(dataset["train"])
# val_df   = pd.DataFrame(dataset["validation"])
# test_df  = pd.DataFrame(dataset["test"])

# # Reduce size due to resource constraints
# train_size = 20000
# val_size   = 5000
# test_size  = 5000

# def reduce_sample(df, n, seed=42):
#     return df.groupby("label", group_keys=False).apply(
#         lambda x: x.sample(frac=n/len(df), random_state=seed)
#     ).reset_index(drop=True).sample(frac=1, random_state=seed)

# def reduce_sample_balanced(df, n_per_class, seed=42):
#     return pd.concat([
#         df[df['label'] == l].sample(n=n_per_class//len(df['label'].unique()), random_state=seed)
#         for l in df['label'].unique()
#     ]).sample(frac=1, random_state=seed).reset_index(drop=True)

#train_small = reduce_sample(train_df, train_size)
#val_small   = reduce_sample(val_df, val_size)
#test_small  = reduce_sample(test_df, test_size)

# train_small = reduce_sample_balanced(train_df, train_size)
# val_small   = reduce_sample_balanced(val_df, val_size)
# test_small  = reduce_sample_balanced(test_df, test_size)

# train_ds = Dataset.from_pandas(train_small,preserve_index=False)
# val_ds   = Dataset.from_pandas(val_small,preserve_index=False)
# test_ds  = Dataset.from_pandas(test_small,preserve_index=False)

# dataset_reduced = DatasetDict({
#     "train": train_ds,
#     "validation": val_ds,
#     "test": test_ds
# })

In [7]:
# Show the first five training examples, each followed by a separator line.
for row_index in range(5):
    print(train_ds[row_index], "---", sep="\n")


{'id': 9536, 'text': 'Cooking microwave pizzas, yummy', 'label': 2, 'sentiment': 'positive'}
---
{'id': 6135, 'text': 'Any plans of allowing sub tasks to show up in the widget?', 'label': 1, 'sentiment': 'neutral'}
---
{'id': 17697, 'text': " I love the humor, I just reworded it. Like saying 'group therapy' instead`a 'gang banging'. Keeps my moms off my back.   Hahaha", 'label': 2, 'sentiment': 'positive'}
---
{'id': 14182, 'text': ' naw idk what ur talkin about', 'label': 1, 'sentiment': 'neutral'}
---
{'id': 17840, 'text': ' That sucks to hear. I hate days like that', 'label': 0, 'sentiment': 'negative'}
---


In [8]:
from collections import Counter

# Report how many examples each label has in every split; the headers after
# the first carry a leading newline so the splits are separated by a blank line.
for header, split_ds in (
    ("Train class counts:", train_ds),
    ("\nValidation class counts:", val_ds),
    ("\nTest class counts:", test_ds),
):
    print(header)
    print(Counter(split_ds['label']))

Train class counts:
Counter({1: 11649, 2: 10478, 0: 9105})

Validation class counts:
Counter({1: 1928, 2: 1760, 0: 1517})

Test class counts:
Counter({1: 1930, 2: 1730, 0: 1546})


## Preprocessing

In [9]:
import re
from langdetect import detect, DetectorFactory

def reduce_lengthening(text: str) -> str:
    """Shorten every run of three or more identical characters to two.

    Example: "looooove" -> "loove". Runs of one or two characters are left
    untouched. The pattern uses ``.``, so newline runs are not shortened.
    """
    repeated_run = re.compile(r'(.)\1{2,}')
    return repeated_run.sub(r'\1\1', text)

def clean_tweet(text: str, remove_hashtags=False) -> str:
    """Normalise a tweet and return it only if it is detected as English.

    Steps, in order: convert to ``str`` and lowercase; replace URLs and
    ``@mentions`` (and whole ``#hashtags`` when ``remove_hashtags`` is true)
    with a space; replace every character other than ASCII letters, digits,
    whitespace and ``. , ! ? : ; - ' "`` with a space; pass the text through
    ``reduce_lengthening``; collapse whitespace runs and strip the ends.

    Args:
        text: Raw tweet. Any object is accepted and converted with ``str()``.
        remove_hashtags: When true, drop ``#hashtag`` tokens entirely. When
            false only the ``#`` character is removed by the character filter.

    Returns:
        The cleaned text, or ``""`` when nothing is left after cleaning, when
        language detection raises, or when the detected language is not "en".
    """
    text = str(text)
    text = text.lower()
    text = re.sub(r"http\S+|www\.\S+", " ", text)
    text = re.sub(r"@\w+", " ", text)
    if remove_hashtags:
        text = re.sub(r"#\w+", " ", text)
    text = re.sub(r"[^0-9A-Za-z\s\.\,\!\?\:\;\-\'\"]", " ", text)
    text = reduce_lengthening(text)
    text = re.sub(r"\s+", " ", text).strip()

    # An empty string is returned as "" whatever the detector says, so skip
    # the detection call altogether.
    if not text:
        return ""

    try:
        lang = detect(text)
    except Exception:
        # Best effort: a tweet the detector cannot handle is treated as
        # non-English. Catching Exception (not a bare except) lets
        # KeyboardInterrupt / SystemExit stop a long-running map.
        lang = "unknown"

    if lang != "en":
        return ""
    return text


def apply_cleaning(ds: Dataset) -> Dataset:
    """Add a ``clean_text`` column and drop rows whose cleaned text is empty.

    ``clean_tweet`` is applied to the ``text`` field one example at a time;
    it returns ``""`` for tweets it rejects, and those rows are filtered out.
    """
    def add_clean_text(example):
        return {"clean_text": clean_tweet(example["text"])}

    def has_clean_text(example):
        return example["clean_text"] != ""

    with_clean_text = ds.map(add_clean_text, batched=False)
    return with_clean_text.filter(has_clean_text)

# langdetect is non-deterministic on short or ambiguous text unless its seed
# is fixed. Pin it before the first detection so the same rows survive the
# language filter on every run.
DetectorFactory.seed = 0

train_ds = apply_cleaning(train_ds)
val_ds   = apply_cleaning(val_ds)
test_ds  = apply_cleaning(test_ds)

Map: 100%|██████████| 31232/31232 [01:06<00:00, 472.61 examples/s]
Filter: 100%|██████████| 31232/31232 [00:00<00:00, 171584.24 examples/s]
Map: 100%|██████████| 5205/5205 [00:11<00:00, 465.12 examples/s]
Filter: 100%|██████████| 5205/5205 [00:00<00:00, 140510.21 examples/s]
Map: 100%|██████████| 5206/5206 [00:11<00:00, 462.15 examples/s]
Filter: 100%|██████████| 5206/5206 [00:00<00:00, 116276.41 examples/s]


In [10]:
# Show the first five training examples again, now with the `clean_text` column.
for row_index in range(5):
    example = train_ds[row_index]
    print(example)
    print("---")

{'id': 9536, 'text': 'Cooking microwave pizzas, yummy', 'label': 2, 'sentiment': 'positive', 'clean_text': 'cooking microwave pizzas, yummy'}
---
{'id': 6135, 'text': 'Any plans of allowing sub tasks to show up in the widget?', 'label': 1, 'sentiment': 'neutral', 'clean_text': 'any plans of allowing sub tasks to show up in the widget?'}
---
{'id': 17697, 'text': " I love the humor, I just reworded it. Like saying 'group therapy' instead`a 'gang banging'. Keeps my moms off my back.   Hahaha", 'label': 2, 'sentiment': 'positive', 'clean_text': "i love the humor, i just reworded it. like saying 'group therapy' instead a 'gang banging'. keeps my moms off my back. hahaha"}
---
{'id': 14182, 'text': ' naw idk what ur talkin about', 'label': 1, 'sentiment': 'neutral', 'clean_text': 'naw idk what ur talkin about'}
---
{'id': 17840, 'text': ' That sucks to hear. I hate days like that', 'label': 0, 'sentiment': 'negative', 'clean_text': 'that sucks to hear. i hate days like that'}
---


## Tokenisation and Padding

In [11]:
from transformers import AutoTokenizer

# Checkpoint whose tokenizer is loaded, and the fixed length (in tokens) that
# every example is truncated or padded to in `tokenize`.
MODEL_NAME, MAX_LENGTH = "distilbert-base-uncased", 64

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize(batch):
    """Tokenise the ``clean_text`` entries of a batch.

    The module-level ``tokenizer`` is called with truncation enabled and
    padding set to ``"max_length"``, both bounded by ``MAX_LENGTH``.
    """
    encoding_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": MAX_LENGTH,
    }
    return tokenizer(batch["clean_text"], **encoding_options)

# Tokenise every split in batches, in train / validation / test order.
train_ds, val_ds, test_ds = (
    split.map(tokenize, batched=True) for split in (train_ds, val_ds, test_ds)
)


Map: 100%|██████████| 28657/28657 [00:01<00:00, 18745.25 examples/s]
Map: 100%|██████████| 4772/4772 [00:00<00:00, 20526.35 examples/s]
Map: 100%|██████████| 4779/4779 [00:00<00:00, 22438.97 examples/s]


In [12]:
# Show the first five training examples with their tokenizer outputs attached.
for example in (train_ds[row_index] for row_index in range(5)):
    print(example)
    print("---")

{'id': 9536, 'text': 'Cooking microwave pizzas, yummy', 'label': 2, 'sentiment': 'positive', 'clean_text': 'cooking microwave pizzas, yummy', 'input_ids': [101, 8434, 18302, 10733, 2015, 1010, 9805, 18879, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}
---
{'id': 6135, 'text': 'Any plans of allowing sub tasks to show up in the widget?', 'label': 1, 'sentiment': 'neutral', 'clean_text': 'any plans of allowing sub tasks to show up in the widget?', 'input_ids': [101, 2151, 3488, 1997, 4352, 4942, 8518, 2000, 2265, 2039, 1999, 1996, 15536, 24291, 1029, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

## Load the Model

In [24]:
from transformers import AutoModelForSequenceClassification

MODEL_NAME = "distilbert-base-uncased"

# Class ids as they appear in the dataset's `label` column, paired with the
# names from its `sentiment` column. Storing the mapping in the model config
# lets the saved model report readable labels rather than generic LABEL_n.
ID2LABEL = {0: "negative", 1: "neutral", 2: "positive"}
LABEL2ID = {name: class_id for class_id, name in ID2LABEL.items()}
NUM_LABELS = len(ID2LABEL)

# The classification head on top of the pretrained encoder is newly
# initialised, so the model must be fine-tuned before it is used.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
    id2label=ID2LABEL,
    label2id=LABEL2ID,
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Trainer Args

In [25]:
from transformers import TrainingArguments

training_config = {
    # Output locations; at most two checkpoints are kept on disk.
    "output_dir": "./sentiment_model",
    "logging_dir": "./logs",
    "save_total_limit": 2,
    # Evaluate and checkpoint once per epoch, then reload the checkpoint with
    # the highest "f1" when training ends.
    "eval_strategy": "epoch",
    "save_strategy": "epoch",
    "load_best_model_at_end": True,
    "metric_for_best_model": "f1",
    "greater_is_better": True,
    # Optimisation settings.
    "learning_rate": 2e-5,
    "per_device_train_batch_size": 8,
    "num_train_epochs": 5,
    "weight_decay": 0.01,
    "warmup_steps": 500,
    "lr_scheduler_type": "linear",
}

training_args = TrainingArguments(**training_config)

## Train the Model

In [None]:
from transformers import Trainer, EarlyStoppingCallback
from sklearn.metrics import f1_score, accuracy_score

def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 from an evaluation result.

    ``eval_pred`` unpacks into ``(logits, labels)``. The predicted class is the
    argmax over the last axis of the logits. The returned dict has the keys
    ``"accuracy"`` and ``"f1"``.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    return {
        "accuracy": accuracy_score(gold, predicted),
        "f1": f1_score(gold, predicted, average="weighted"),
    }

# Early stopping with a patience of 2 evaluations on the metric named by
# `metric_for_best_model` in the training arguments.
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

# Train on the training split and evaluate on the validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,0.6134,0.61567


early stopping required metric_for_best_model, but did not find eval_f1 so early stopping is disabled


KeyError: "The `metric_for_best_model` training argument is set to 'eval_f1', which is not found in the evaluation metrics. The available evaluation metrics are: ['eval_loss']. Consider changing the `metric_for_best_model` via the TrainingArguments."

## Eval

In [None]:
from sklearn.metrics import classification_report

# Run the trained model over the held-out test split and show its summary metrics.
test_output = trainer.predict(test_ds)
print(test_output.metrics)

# Convert the per-class scores to class ids and compare them with the reference labels.
labels = test_output.label_ids
preds = np.argmax(test_output.predictions, axis=1)
report = classification_report(
    labels,
    preds,
    labels=[0,1,2],
    target_names=["negative","neutral","positive"],
)
print(report)




{'test_loss': 0.8097283244132996, 'test_runtime': 105.9566, 'test_samples_per_second': 64.696, 'test_steps_per_second': 8.088}
              precision    recall  f1-score   support

    negative       0.81      0.81      0.81      3432
     neutral       0.00      0.00      0.00         0
    positive       0.82      0.79      0.81      3423

    accuracy                           0.80      6855
   macro avg       0.55      0.53      0.54      6855
weighted avg       0.82      0.80      0.81      6855



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


## Save Model

In [None]:
import os

SAVE_DIR = "./sentiment_model"
os.makedirs(SAVE_DIR, exist_ok=True)

# Write the model first, then the tokenizer, into the same directory.
for save_artifact in (trainer.save_model, tokenizer.save_pretrained):
    save_artifact(SAVE_DIR)

print("Model & tokenizer saved to", SAVE_DIR)


Model & tokenizer saved to ./sentiment_model


## Playground

In [None]:

import torch
import torch.nn.functional as F

inputs = [
    "im feeling bad",        # Negative sentiment
    "neutral",               # Neutral sentiment
    "I absolutely love this" # Positive sentiment
]

# NOTE(review): training text went through `clean_tweet`; these raw strings do
# not. Consider cleaning them the same way for train/inference parity.

# Tokenize with the same length cap used for training, then move the tensors
# to the device the model is on (it may be on the GPU after training).
tokenized_inputs = tokenizer(
    inputs,
    padding=True,
    truncation=True,
    max_length=MAX_LENGTH,
    return_tensors="pt",
)
tokenized_inputs = tokenized_inputs.to(model.device)

# Inference only: switch off dropout and skip gradient tracking so the
# predictions are deterministic and no autograd graph is built.
model.eval()
with torch.no_grad():
    outputs = model(**tokenized_inputs)

logits = outputs.logits

# Predicted class id per input.
predictions = torch.argmax(logits, dim=1)

# Class probabilities and the probability of the winning class.
probs = F.softmax(logits, dim=-1)
confidences = torch.max(probs, dim=1).values

print("Predictions:", predictions)
print("Probabilities:\n", probs)
print("Confidences:", confidences)


Predictions: tensor([0, 2, 2])
