In [None]:
!pip install transformers sentencepiece datasets -q

In [2]:
from datasets import load_dataset

# Load the "emotion" dataset; the first call downloads it, later calls reuse the local cache.
ds = load_dataset("emotion")
# Bare expression so the notebook displays the available splits and their row counts.
ds

Downloading builder script:   0%|          | 0.00/3.62k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.28k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/8.69k [00:00<?, ?B/s]

Downloading and preparing dataset emotion/default to /root/.cache/huggingface/datasets/emotion/default/0.0.0/348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705...


Downloading data:   0%|          | 0.00/1.66M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/204k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/207k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Dataset emotion downloaded and prepared to /root/.cache/huggingface/datasets/emotion/default/0.0.0/348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [3]:
from datasets import concatenate_datasets

# Fold the test split into the validation split so evaluation runs on a larger set.
# NOTE: after this step there is no untouched test split left for a final, unbiased score.
# The membership guard keeps the cell idempotent: re-running it once 'test' has already
# been merged and removed would otherwise raise a KeyError.
if 'test' in ds:
    ds['validation'] = concatenate_datasets([ds['validation'], ds['test']])
    del ds['test']
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 4000
    })
})

In [4]:
from transformers import AutoTokenizer

# Checkpoint used for both the tokenizer and, later, the classification model.
model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tok_func(x):
    """Tokenize the "text" field of a batch of examples.

    Sequences longer than the model's maximum length are truncated. No padding is
    applied here: padding inside a batched `map` pads every example to the longest
    one in its map chunk, which is wasted work because the `Trainer` is given the
    tokenizer and pads each batch to that batch's own longest sequence.

    Args:
        x: a batch from `Dataset.map(batched=True)`; `x["text"]` is the list of input strings.

    Returns:
        The tokenizer output for the batch (`input_ids`, `attention_mask`).
    """
    return tokenizer(x["text"], truncation=True)

# Adds the tokenizer's columns to every split while keeping the original ones.
tok_ds = ds.map(tok_func, batched=True)
tok_ds

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

  0%|          | 0/16 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 4000
    })
})

In [None]:
from huggingface_hub import notebook_login

# Authenticate with the Hugging Face Hub from inside the notebook; needed because the
# training arguments set push_to_hub=True and the last cell calls trainer.push_to_hub().
notebook_login()

In [6]:
from datasets import load_metric
import numpy as np
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Training hyperparameters (consumed by TrainingArguments below).
bs = 256      # per-device training batch size; evaluation uses bs*2
epochs = 5    # number of training epochs
lr = 1e-4     # learning rate passed to the scheduler (cosine schedule with warmup)

def compute_metrics(eval_pred):
    """Return the classification accuracy for one evaluation pass.

    Accuracy is computed directly with NumPy instead of `datasets.load_metric("accuracy")`:
    the result has the same `{"accuracy": float}` shape, but no metric script has to be
    downloaded and the deprecated `load_metric` API is never called.

    Args:
        eval_pred: a `(logits, labels)` pair; the predicted class of each row is the
            argmax over the last axis of `logits`.

    Returns:
        dict: `{"accuracy": fraction of predictions that equal the labels}`.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": float(np.mean(predictions == np.asarray(labels)))}

# Imported here so this cell is self-contained; used to seed model initialisation below.
from transformers import set_seed

# Training configuration: cosine schedule with 10% warmup, mixed precision (fp16),
# evaluation and logging once per epoch, no external experiment tracker, and the
# 'text-emotion' output directory pushed to the Hub.
args = TrainingArguments(
    'text-emotion',
    learning_rate=lr,
    warmup_ratio=0.1,
    lr_scheduler_type='cosine',
    fp16=True,
    evaluation_strategy="epoch",
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs*2,
    num_train_epochs=epochs,
    weight_decay=0.01,
    logging_strategy='epoch',
    report_to='none',
    push_to_hub=True,
)

# The classification head is newly (randomly) initialised when the model is created,
# which happens before the Trainer applies its own seed. Seeding here with the same
# value the Trainer will use makes the whole run reproducible across kernel restarts.
set_seed(args.seed)

# Six output classes, matching the LABEL_0..LABEL_5 names mapped in the inference cell.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=6)

# Passing the tokenizer lets the Trainer pad batches and save the tokenizer with the model.
trainer = Trainer(model, args, train_dataset=tok_ds['train'], eval_dataset=tok_ds['validation'],
                  tokenizer=tokenizer, compute_metrics=compute_metrics)

# Trailing semicolon suppresses the returned TrainOutput repr in the cell output.
trainer.train();

  if __name__ == '__main__':


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'pre_classifi

Epoch,Training Loss,Validation Loss,Accuracy
1,1.0232,0.242402,0.917
2,0.1925,0.159961,0.934
3,0.1134,0.141828,0.935
4,0.076,0.146129,0.931
5,0.0604,0.141419,0.93675


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 4000
  Batch size = 512
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 4000
  Batch size = 512
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples 

In [7]:
from transformers import TextClassificationPipeline

# Emoji to show for each class; the keys are the default 'LABEL_<id>' names the
# pipeline reports because the model config carries no custom label names.
label_map = {
    'LABEL_0': '🙁',
    'LABEL_1': '😃',
    'LABEL_2': '🥰',
    'LABEL_3': '😠',
    'LABEL_4': '😬',
    'LABEL_5': '😳',
}

# Wrap the fine-tuned model for single-string inference on GPU device 0.
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, device=0)

text = 'my very first nlp model'
# The pipeline returns one result dict per input; take the one for our single string.
top_prediction = pipe(text)[0]
emotion = label_map[top_prediction['label']]
print(f'{text} {emotion}')

my very first nlp model 😃


In [None]:
# Upload the trainer's final model to the Hugging Face Hub (uses the login performed earlier).
trainer.push_to_hub()