In [1]:
import torch
from datasets import list_datasets, load_dataset
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, Trainer, TrainingArguments

from sklearn.metrics import accuracy_score, f1_score

In [2]:
# DEVICE
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [3]:
# PARAMS
num_labels = 6
model_ckpt = 'distilbert-base-uncased'


# DATA
emotions = load_dataset('emotion')
emotions


# TOKENS
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)


def tokenize(batch):
    return tokenizer(batch['text'], padding=True, truncation=True)


emotions_encoded = emotions.map(tokenize, batched=True, batch_size=None)
emotions_encoded

No config specified, defaulting to: emotion/split
Found cached dataset emotion (/home/studio-lab-user/.cache/huggingface/datasets/emotion/split/1.0.0/cca5efe2dfeb58c1d098e0f9eeb200e9927d889b5a03c67097275dfb5fe463bd)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at /home/studio-lab-user/.cache/huggingface/datasets/emotion/split/1.0.0/cca5efe2dfeb58c1d098e0f9eeb200e9927d889b5a03c67097275dfb5fe463bd/cache-e5d13513aa45d6aa.arrow
Loading cached processed dataset at /home/studio-lab-user/.cache/huggingface/datasets/emotion/split/1.0.0/cca5efe2dfeb58c1d098e0f9eeb200e9927d889b5a03c67097275dfb5fe463bd/cache-aaf499e382280705.arrow


  0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2000
    })
})

In [4]:
print(emotions_encoded['train'].format)
emotions_encoded.set_format('torch')
print(emotions_encoded['train'].format)

{'type': None, 'format_kwargs': {}, 'columns': ['text', 'label', 'input_ids', 'attention_mask'], 'output_all_columns': False}
{'type': 'torch', 'format_kwargs': {}, 'columns': ['text', 'label', 'input_ids', 'attention_mask'], 'output_all_columns': False}


In [5]:
# MODEL
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt, num_labels=num_labels
).to(device)

print('model.device', model.device)


def compute_metrics(pred):
    res = {'acc': 1.0, 'F1': 2.0}
    return res

batch_size=64

training_args = TrainingArguments(
    output_dir='resultados',
    num_train_epochs=2,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    disable_tqdm=False,
    evaluation_strategy='steps',
    logging_steps=100,
    report_to='none',
)

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=emotions_encoded['train'],
    eval_dataset=emotions_encoded['validation'],
    tokenizer=tokenizer,
)

print('trainer.args.device:', trainer.args.device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier

model.device cuda:0
trainer.args.device: cuda:0


In [6]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 16000
  Num Epochs = 2
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 500
  Number of trainable parameters = 66958086
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Acc,F1
100,1.2877,0.899443,1.0,2.0
200,0.6623,0.442033,1.0,2.0
300,0.3775,0.279302,1.0,2.0
400,0.2574,0.242472,1.0,2.0
500,0.2436,0.225239,1.0,2.0


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 64
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 64
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 

TrainOutput(global_step=500, training_loss=0.5656990127563477, metrics={'train_runtime': 231.5328, 'train_samples_per_second': 138.209, 'train_steps_per_second': 2.16, 'total_flos': 720342861696000.0, 'train_loss': 0.5656990127563477, 'epoch': 2.0})

In [7]:
trainer.save_model('modelo_ejemplo1')

Saving model checkpoint to modelo_ejemplo1
Configuration saved in modelo_ejemplo1/config.json
Model weights saved in modelo_ejemplo1/pytorch_model.bin
tokenizer config file saved in modelo_ejemplo1/tokenizer_config.json
Special tokens file saved in modelo_ejemplo1/special_tokens_map.json


In [9]:
res = trainer.predict(emotions_encoded['validation'])

The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 2000
  Batch size = 64


In [10]:
res.metrics

{'test_loss': 0.2252388745546341,
 'test_acc': 1.0,
 'test_F1': 2.0,
 'test_runtime': 3.8652,
 'test_samples_per_second': 517.434,
 'test_steps_per_second': 8.279}

In [11]:
res.predictions

array([[ 4.2283382 , -0.7476997 , -1.2339997 , -0.91514856, -0.8758585 ,
        -1.5739301 ],
       [ 4.220526  , -0.9486427 , -1.7250919 , -0.74755186, -0.3006278 ,
        -1.5723747 ],
       [-1.1296707 ,  1.9486657 ,  2.7164347 , -0.9532098 , -1.519536  ,
        -1.3485329 ],
       ...,
       [-1.2038524 ,  4.4840403 ,  0.32090122, -1.4759028 , -1.5415782 ,
        -0.919959  ],
       [-1.6846102 ,  2.5648055 ,  2.5989816 , -1.1963477 , -1.5772452 ,
        -0.9975323 ],
       [-1.5571036 ,  4.433269  ,  0.39210126, -1.619088  , -1.6981349 ,
        -0.2927862 ]], dtype=float32)