# 🤗 Emotion Classification

### Import Libraries

In [18]:
# 🤗 huggingface
from datasets import load_dataset
from huggingface_hub import notebook_login
from transformers import AutoModel, AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments

# pytorch
import torch

# numpy
import numpy as np

# sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, ConfusionMatrixDisplay, confusion_matrix

##### Some Default Config

In [2]:
# checkpoint name used to load the tokenizer and both models in this notebook
DISTILBERT_CKPT = 'distilbert-base-uncased'
# per-device batch size, passed to TrainingArguments for both training and evaluation
TRAINING_BATCH_SIZE = 8
# fine-tuning hyperparameters passed to TrainingArguments
LEARNING_RATE = 2e-5
WEIGHT_DECAY = 0.01

### Dataset Preparation

In [3]:
# load the 'emotion' dataset; the bare name on the last line displays its splits
emotions = load_dataset('emotion')
emotions

Using custom data configuration default
Reusing dataset emotion (C:\Users\fahrizain\.cache\huggingface\datasets\emotion\default\0.0.0\348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [4]:
# keep a handle on the training split and peek at its first example
train_ds = emotions['train']
train_ds[0]

{'text': 'i didnt feel humiliated', 'label': 0}

In [5]:
# inspect the column schema: the text column and the class-label column with its names
train_ds.features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(num_classes=6, names=['sadness', 'joy', 'love', 'anger', 'fear', 'surprise'], id=None)}

#### Tokenization

In [6]:
# load the tokenizer that belongs to the DistilBERT checkpoint
tokenizer = AutoTokenizer.from_pretrained(DISTILBERT_CKPT)

In [7]:
# inspect the tokenizer: vocabulary size, maximum sequence length and the
# names of the inputs the model expects (printed one per line)
print(tokenizer.vocab_size,
      tokenizer.model_max_length,
      tokenizer.model_input_names,
      sep='\n')

30522
512
['input_ids', 'attention_mask']


In [8]:
def tokenize_data(batch):
    """Tokenize the ``text`` column of a batch of examples.

    Padding and truncation are both enabled, so the sequences returned for
    one batch are padded to a common length and over-long inputs are cut.

    Args:
        batch: mapping with a ``'text'`` entry holding the raw strings.

    Returns:
        The tokenizer output for the batch.
    """
    texts = batch['text']
    return tokenizer(texts, truncation=True, padding=True)

In [9]:
# Tokenize every split. With batched=True and batch_size=None the progress bars
# below show a single batch per split, so padding=True in tokenize_data pads to
# the longest example of the whole split — presumably intentional so that all
# rows of a split share one length; confirm.
emotions_tokenized = emotions.map(tokenize_data, batched=True, batch_size=None)
emotions_tokenized



  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2000
    })
})

### Modelling

In [10]:
# run on the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

#### Transformers as Feature Extractors
Since we are using the transformer model only as a feature extractor, we simply forward-pass the tokenized inputs through the model. The model returns hidden states with shape `[batch_size, n_tokens, hidden_dim]`. Finally, we take the last hidden state of the `[CLS]` token (the first position) and use it as our extracted feature.

In [None]:
# Load the DistilBERT model without a classification head and move it to `device`.
# NOTE: the name `model` is rebound to the classification model in the
# fine-tuning section; extract_hidden_states reads this global, so run the
# feature-extraction cells before that rebinding.
model = AutoModel.from_pretrained(DISTILBERT_CKPT).to(device)

In [45]:
def extract_hidden_states(batch):
    """Compute the first-token vector of the last hidden state for a batch.

    Args:
        batch: mapping of column name to torch tensor; only the entries named
            in ``tokenizer.model_input_names`` are forwarded to the model.

    Returns:
        dict with the single key ``'hidden_state'`` holding a NumPy array
        (position 0 of the model's last hidden state for every example).
    """
    # keep only the model inputs and move them to the same device as the model
    model_inputs = {}
    for name, tensor in batch.items():
        if name in tokenizer.model_input_names:
            model_inputs[name] = tensor.to(device)

    # forward pass without gradient tracking
    with torch.no_grad():
        hidden = model(**model_inputs).last_hidden_state

    # position 0 along the token axis is the [CLS] token
    cls_vectors = hidden[:, 0]
    return {'hidden_state': cls_vectors.cpu().numpy()}

In [51]:
# Return the model-input columns and the label as torch tensors.
# NOTE: set_format is called without reassignment, i.e. it changes
# emotions_tokenized itself, which the Trainer cells further down also use.
emotions_tokenized.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

# apply hidden state extractor function, 32 examples per batch
emotions_hidden = emotions_tokenized.map(extract_hidden_states, batched=True, batch_size=32)

  0%|          | 0/500 [00:00<?, ?ba/s]

  0%|          | 0/63 [00:00<?, ?ba/s]

  0%|          | 0/63 [00:00<?, ?ba/s]

In [52]:
# confirm that every split gained a 'hidden_state' column
emotions_hidden

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask', 'hidden_state'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask', 'hidden_state'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask', 'hidden_state'],
        num_rows: 2000
    })
})

##### Create Feature Matrix

In [57]:
# Build the feature matrix (extracted hidden states) and the label vector
# for the training and validation splits as NumPy arrays.
X_train, y_train = (
    np.array(emotions_hidden['train'][column])
    for column in ('hidden_state', 'label')
)
X_valid, y_valid = (
    np.array(emotions_hidden['validation'][column])
    for column in ('hidden_state', 'label')
)

# sanity check: (n_examples, hidden_dim) for each split
X_train.shape, X_valid.shape

((16000, 768), (2000, 768))

In [58]:
# Fit a logistic-regression classifier on the extracted features
# (fit() returns the estimator itself, so construction and fitting can be chained)
lr_clf = LogisticRegression(max_iter=3000).fit(X_train, y_train)
# mean accuracy on the validation features
lr_clf.score(X_valid, y_valid)

0.6335

Surprisingly, our Logistic Regression model achieves an accuracy of 0.6335 on the validation set, which is not bad at all! Next, let's fine-tune our DistilBERT model.

#### Fine-Tuning Transformers
Now we are going to fine-tune our DistilBERT model. Note that here we use `AutoModelForSequenceClassification` instead of `AutoModel`. The difference is that the model is now loaded with a classification head on top of the pretrained body.

In [11]:
# Derive the number of classes from the dataset's label feature instead of
# hard-coding it, so the classification head always matches the data.
num_labels = emotions['train'].features['label'].num_classes
# Load DistilBERT with a sequence-classification head on top (the head weights
# are newly initialised, see the warning below) and move it to `device`.
# NOTE: this rebinds `model`, which previously held the feature extractor.
model = (
    AutoModelForSequenceClassification
    .from_pretrained(DISTILBERT_CKPT, num_labels=num_labels)
    .to(device)
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'pre_classi

In [12]:
def compute_metrics(pred):
    """Compute accuracy and weighted F1 for a set of predictions.

    Args:
        pred: object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the predicted class is the argmax over the
            last axis).

    Returns:
        dict with the keys ``'accuracy'`` and ``'f1'``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred, average='weighted'),
    }

In [13]:
# authenticate with the Hugging Face Hub so the trained model can be pushed later
# (notebook_login is imported in the import cell at the top of the notebook)
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [14]:
# number of training examples divided by the batch size; the training log
# below shows one loss entry per epoch with this setting
logging_steps = len(emotions_tokenized['train']) // TRAINING_BATCH_SIZE
# used as the output directory of the run
model_name = f'{DISTILBERT_CKPT}-finetuned-emotion'
training_args = TrainingArguments(
    output_dir=model_name,
    num_train_epochs=2,
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=TRAINING_BATCH_SIZE,
    per_device_eval_batch_size=TRAINING_BATCH_SIZE,
    weight_decay=WEIGHT_DECAY,
    evaluation_strategy='epoch',  # evaluate on the eval dataset after every epoch
    disable_tqdm=False,
    logging_steps=logging_steps,
    push_to_hub=True,
    log_level='error',
)

In [15]:
# Wire model, arguments, data and metric function together. Training runs on
# the 'train' split; the 'validation' split is used for evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=emotions_tokenized['train'],
    eval_dataset=emotions_tokenized['validation'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# start fine-tuning; the returned TrainOutput is displayed as the cell result
trainer.train()

e:\myproject\all-about-rnn\emoji-classification\distilbert-base-uncased-finetuned-emotion is already a clone of https://huggingface.co/affandyfahrizain/distilbert-base-uncased-finetuned-emotion. Make sure you pull the latest changes with `repo.git_pull()`.


  0%|          | 0/4000 [00:00<?, ?it/s]

{'loss': 0.4279, 'learning_rate': 1e-05, 'epoch': 1.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.20582623779773712, 'eval_accuracy': 0.9345, 'eval_f1': 0.9347081057248726, 'eval_runtime': 33.417, 'eval_samples_per_second': 59.85, 'eval_steps_per_second': 7.481, 'epoch': 1.0}
{'loss': 0.1603, 'learning_rate': 0.0, 'epoch': 2.0}


  0%|          | 0/250 [00:00<?, ?it/s]

{'eval_loss': 0.18577395379543304, 'eval_accuracy': 0.936, 'eval_f1': 0.936054890104025, 'eval_runtime': 30.7681, 'eval_samples_per_second': 65.002, 'eval_steps_per_second': 8.125, 'epoch': 2.0}
{'train_runtime': 3599.2651, 'train_samples_per_second': 8.891, 'train_steps_per_second': 1.111, 'train_loss': 0.29408922576904295, 'epoch': 2.0}


TrainOutput(global_step=4000, training_loss=0.29408922576904295, metrics={'train_runtime': 3599.2651, 'train_samples_per_second': 8.891, 'train_steps_per_second': 1.111, 'train_loss': 0.29408922576904295, 'epoch': 2.0})

In [16]:
# Predict on the validation split. The metric keys in the output carry a
# `test_` prefix although the data is the validation set; the held-out
# 'test' split is not evaluated in the cells shown here.
preds_output = trainer.predict(emotions_tokenized['validation'])
preds_output.metrics

  0%|          | 0/250 [00:00<?, ?it/s]

{'test_loss': 0.18577395379543304,
 'test_accuracy': 0.936,
 'test_f1': 0.936054890104025,
 'test_runtime': 28.9372,
 'test_samples_per_second': 69.115,
 'test_steps_per_second': 8.639}

#### Saving Model and Push to 🤗Hub

In [21]:
# upload the fine-tuned model to the Hub repository with this commit message
trainer.push_to_hub(commit_message='DistilBERT training completed!')

To https://huggingface.co/affandyfahrizain/distilbert-base-uncased-finetuned-emotion
   dea0066..7fa547b  main -> main

