In [1]:
!pip install transformers
!pip install datasets
!pip install evaluate
!pip install accelerate -U

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6
Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━

In [2]:
import torch
import numpy as np
import pandas as pd

from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
from sklearn.metrics import classification_report, confusion_matrix

In [3]:
from datasets import load_dataset

emotions_data = load_dataset("emotion")

Downloading builder script:   0%|          | 0.00/3.97k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.28k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/8.78k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/592k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.0k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.9k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/16000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [4]:
emotions_data

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [5]:
checkpoint = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [6]:
def tokenize_function(parameter):
  return tokenizer(parameter["text"], truncation=True)


tokenized_dataset = emotions_data.map(tokenize_function, batched=True)
tokenized_dataset

Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2000
    })
})

In [7]:
tokenized_dataset = tokenized_dataset.remove_columns(["text"])

In [8]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2000
    })
})

In [9]:
tokenized_dataset = tokenized_dataset.rename_column("label","labels")

In [10]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2000
    })
})

In [11]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

In [12]:
emotions_data["train"].features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=['sadness', 'joy', 'love', 'anger', 'fear', 'surprise'], id=None)}

In [13]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint , num_labels=6)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
from transformers import TrainingArguments

args = TrainingArguments(
    "test-trainer",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
)

In [20]:
from transformers import Trainer


# def compute_metrics(eval_preds):
#     metric = evaluate.load("glue", "mrpc")
#     logits, labels = eval_preds
#     predictions = np.argmax(logits, axis=-1)
#     return metric.compute(predictions=predictions, references=labels)

trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [22]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.1651,0.206233
2,0.1576,0.23758
3,0.1126,0.287753
4,0.079,0.362625
5,0.0398,0.424211
6,0.0271,0.387291
7,0.0238,0.441091
8,0.0176,0.460274
9,0.0128,0.481873
10,0.0073,0.502126


TrainOutput(global_step=20000, training_loss=0.05918940567970276, metrics={'train_runtime': 2131.4389, 'train_samples_per_second': 75.067, 'train_steps_per_second': 9.383, 'total_flos': 3390788721933216.0, 'train_loss': 0.05918940567970276, 'epoch': 10.0})

In [23]:
predictions = trainer.predict(tokenized_dataset["validation"])

In [24]:
predictions


PredictionOutput(predictions=array([[10.511386  , -2.0943708 , -2.0560052 , -1.8524439 , -1.7788653 ,
        -2.1213005 ],
       [10.583155  , -2.2817895 , -2.3974092 , -1.6013263 , -1.8832893 ,
        -1.6550405 ],
       [-2.9581704 ,  9.855459  ,  0.7294801 , -3.0464153 , -3.1738753 ,
        -2.0916147 ],
       ...,
       [-2.3869395 , 10.250868  , -1.8350575 , -2.3885796 , -2.6442866 ,
        -2.1123161 ],
       [-2.7156203 , 10.262711  , -0.46037456, -2.8368986 , -2.9450924 ,
        -2.2045145 ],
       [-2.4076722 , 10.228808  , -1.9051644 , -2.2691941 , -2.4289787 ,
        -2.184602  ]], dtype=float32), label_ids=array([0, 0, 2, ..., 1, 1, 1]), metrics={'test_loss': 0.5021255016326904, 'test_runtime': 7.745, 'test_samples_per_second': 258.232, 'test_steps_per_second': 32.279})

In [25]:
import numpy as np

preds = np.argmax(predictions.predictions, axis=-1)

In [27]:
print(confusion_matrix(preds, emotions_data["validation"]["label"]))
print(classification_report(preds, emotions_data["validation"]["label"]))

[[534   3   1  10   9   1]
 [  3 666  27   3   0   2]
 [  1  24 150   0   0   0]
 [  4   2   0 257   4   0]
 [  8   4   0   5 189   7]
 [  0   5   0   0  10  71]]
              precision    recall  f1-score   support

           0       0.97      0.96      0.96       558
           1       0.95      0.95      0.95       701
           2       0.84      0.86      0.85       175
           3       0.93      0.96      0.95       267
           4       0.89      0.89      0.89       213
           5       0.88      0.83      0.85        86

    accuracy                           0.93      2000
   macro avg       0.91      0.91      0.91      2000
weighted avg       0.93      0.93      0.93      2000



In [28]:
train_preds = trainer.predict(tokenized_dataset["train"])

In [30]:
import numpy as np

train_preds = np.argmax(train_preds.predictions, axis=-1)

In [31]:
print(confusion_matrix(train_preds, emotions_data["train"]["label"]))
print(classification_report(train_preds, emotions_data["train"]["label"]))

[[4665    0    0    2    1    0]
 [   0 5354    8    0    0    0]
 [   0    7 1296    0    0    0]
 [   1    0    0 2154    1    0]
 [   0    0    0    3 1930    2]
 [   0    1    0    0    5  570]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4668
           1       1.00      1.00      1.00      5362
           2       0.99      0.99      0.99      1303
           3       1.00      1.00      1.00      2156
           4       1.00      1.00      1.00      1935
           5       1.00      0.99      0.99       576

    accuracy                           1.00     16000
   macro avg       1.00      1.00      1.00     16000
weighted avg       1.00      1.00      1.00     16000



In [32]:
test_preds = trainer.predict(tokenized_dataset["test"])

In [33]:
test_preds = np.argmax(test_preds.predictions, axis=-1)

print(confusion_matrix(test_preds, emotions_data["test"]["label"]))
print(classification_report(test_preds, emotions_data["test"]["label"]))

[[563   3   1  11   3   2]
 [  3 657  35   3   0   3]
 [  2  27 123   1   0   0]
 [  7   2   0 252   6   0]
 [  6   0   0   8 200   9]
 [  0   6   0   0  15  52]]
              precision    recall  f1-score   support

           0       0.97      0.97      0.97       583
           1       0.95      0.94      0.94       701
           2       0.77      0.80      0.79       153
           3       0.92      0.94      0.93       267
           4       0.89      0.90      0.89       223
           5       0.79      0.71      0.75        73

    accuracy                           0.92      2000
   macro avg       0.88      0.88      0.88      2000
weighted avg       0.92      0.92      0.92      2000

