In [1]:
import torch

In [2]:
# Sanity check: confirm PyTorch can use a CUDA GPU (the pipelines further down request device 0).
torch.cuda.is_available()

True

In [3]:
# Index of the CUDA device PyTorch currently has selected.
torch.cuda.current_device()

0

In [4]:
from transformers import AutoTokenizer

In [8]:
# Name of the pretrained checkpoint; reused below for both the tokenizer and the model.
checkpoint = 'bert-base-cased'

In [9]:
# Load the tokenizer that belongs to the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading (…)/main/tokenizer.json: 436kB [00:00, 1.75MB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [10]:
# A single example sentence (a plain str, despite the plural name) used by the tokenizer demos below.
texts = 'Text for tests!'

In [18]:
# Step 1: split the text into tokens; no special tokens are added at this step.
tokens = tokenizer.tokenize(texts)
tokens

['Text', 'for', 'tests', '!']

In [19]:
# Step 2: map each token string to its integer id in the vocabulary.
ids = tokenizer.convert_tokens_to_ids(tokens)
ids

[18430, 1111, 5715, 106]

In [20]:
# encode() does both steps in one call and additionally wraps the sequence in
# special-token ids (101 at the start, 102 at the end in the recorded output).
ids_encoded = tokenizer.encode(texts)
ids_encoded

[101, 18430, 1111, 5715, 106, 102]

In [21]:
# Convert the encoded ids back to tokens to reveal the added [CLS] / [SEP] markers.
tokens_2 = tokenizer.convert_ids_to_tokens(ids_encoded)
tokens_2

['[CLS]', 'Text', 'for', 'tests', '!', '[SEP]']

In [22]:
# decode() joins the ids back into a single string, special tokens included.
tokenizer.decode(ids_encoded)

'[CLS] Text for tests! [SEP]'

In [24]:
# Calling the tokenizer directly returns input_ids, token_type_ids and attention_mask;
# return_tensors='pt' asks for PyTorch tensors instead of Python lists.
tokenizer(texts, return_tensors='pt')

{'input_ids': tensor([[  101, 18430,  1111,  5715,   106,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}

In [28]:
# Two sentences of different token lengths, used to demonstrate batch padding.
data = ['Text for tests!', 'Segundo sentence to test.']

In [29]:
# Tokenize the whole batch in one call: padding is enabled so both rows end up
# the same length, truncation is enabled for over-long inputs, and the result
# is returned as PyTorch tensors.
model_inputs = tokenizer(data, padding=True, truncation=True, return_tensors='pt')

In [30]:
# Inspect the batch: the shorter sentence is padded with id 0, and the padded
# positions are marked with 0 in attention_mask.
model_inputs

{'input_ids': tensor([[  101, 18430,  1111,  5715,   106,   102,     0,     0,     0],
        [  101, 22087, 11652,  2572,  5650,  1106,  2774,   119,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [63]:
# Prerequisite: torchinfo must be installed (run `%pip install torchinfo` once if it is missing).

In [62]:
from torchinfo import summary

# Per-layer parameter-count overview of the model.
# NOTE(review): `model` is only created further down in this file (cell In [31]); the
# execution counts show this cell was run out of order, so it raises NameError on a
# fresh Restart & Run All unless it is moved below the model-loading cell.
summary(model)

Layer (type:depth-idx)                                  Param #
BertForSequenceClassification                           --
├─BertModel: 1-1                                        --
│    └─BertEmbeddings: 2-1                              --
│    │    └─Embedding: 3-1                              22,268,928
│    │    └─Embedding: 3-2                              393,216
│    │    └─Embedding: 3-3                              1,536
│    │    └─LayerNorm: 3-4                              1,536
│    │    └─Dropout: 3-5                                --
│    └─BertEncoder: 2-2                                 --
│    │    └─ModuleList: 3-6                             85,054,464
│    └─BertPooler: 2-3                                  --
│    │    └─Linear: 3-7                                 590,592
│    │    └─Tanh: 3-8                                   --
├─Dropout: 1-2                                          --
├─Linear: 1-3                                           1,538
Total params: 10

In [65]:
# Same summary with per-layer output shapes, computed on CPU for an integer input
# of shape (16, 512). Like the cell above, it needs `model` to exist first.
summary(model, input_size=(16,512), dtypes=['torch.IntTensor'], device='cpu')

Layer (type:depth-idx)                                  Output Shape              Param #
BertForSequenceClassification                           [16, 2]                   --
├─BertModel: 1-1                                        [16, 768]                 --
│    └─BertEmbeddings: 2-1                              [16, 512, 768]            --
│    │    └─Embedding: 3-1                              [16, 512, 768]            22,268,928
│    │    └─Embedding: 3-2                              [16, 512, 768]            1,536
│    │    └─Embedding: 3-3                              [1, 512, 768]             393,216
│    │    └─LayerNorm: 3-4                              [16, 512, 768]            1,536
│    │    └─Dropout: 3-5                                [16, 512, 768]            --
│    └─BertEncoder: 2-2                                 [16, 512, 768]            --
│    │    └─ModuleList: 3-6                             --                        85,054,464
│    └─BertPooler: 2-3           

In [31]:
from transformers import AutoModelForSequenceClassification

# Load the pretrained weights into a sequence-classification model. The recorded
# warning below reports that some checkpoint weights are unused and some model
# weights are newly initialized, so the model needs fine-tuning before use.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initi

In [32]:
# Forward pass on the padded batch; ** unpacks the tokenizer output into keyword arguments.
outputs = model(**model_inputs)

In [33]:
# Inspect the full output object: loss is None because no labels were passed to the model.
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[-0.1347,  0.1550],
        [-0.0771, -0.0097]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [34]:
# Access style 1: attribute access to the logits (one row per sentence, one column per class).
outputs.logits

tensor([[-0.1347,  0.1550],
        [-0.0771, -0.0097]], grad_fn=<AddmmBackward0>)

In [35]:
# Access style 2: dict-style lookup returns the same logits tensor.
outputs['logits']

tensor([[-0.1347,  0.1550],
        [-0.0771, -0.0097]], grad_fn=<AddmmBackward0>)

In [36]:
# Access style 3: positional indexing; index 0 is the logits here because loss is None.
outputs[0]

tensor([[-0.1347,  0.1550],
        [-0.0771, -0.0097]], grad_fn=<AddmmBackward0>)

In [39]:
import numpy as np

# The logits carry a grad_fn, so detach them from autograd and move them to CPU
# before converting to a NumPy array.
outputs.logits.detach().cpu().numpy()

array([[-0.13467437,  0.15498489],
       [-0.07705739, -0.00965607]], dtype=float32)

In [42]:
from datasets import load_dataset

In [43]:
# Fetch the 'sst2' configuration of the GLUE dataset (downloaded on first use, then cached locally).
raw_datasets = load_dataset('glue', 'sst2')

Downloading builder script: 28.8kB [00:00, 28.8MB/s]
Downloading metadata: 28.7kB [00:00, 28.7MB/s]
Downloading readme: 27.9kB [00:00, ?B/s]


Downloading and preparing dataset glue/sst2 to C:/Users/ftrav/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad...


Downloading data: 100%|█████████████████████████████████████████████| 7.44M/7.44M [00:00<00:00, 56.4MB/s]
                                                                                                         

Dataset glue downloaded and prepared to C:/Users/ftrav/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad. Subsequent calls will reuse this data.


100%|█████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 124.99it/s]


In [44]:
# Overview of the splits: train / validation / test, each with sentence, label and idx columns.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [45]:
# Index by split name to get a single Dataset.
raw_datasets['train']

Dataset({
    features: ['sentence', 'label', 'idx'],
    num_rows: 67349
})

In [46]:
def tokenize_fn(batch):
    """Tokenize the ``sentence`` field of ``batch`` with truncation enabled.

    No padding is requested here; the tokenizer's result is returned unchanged.
    """
    sentences = batch['sentence']
    return tokenizer(sentences, truncation=True)

# Apply tokenize_fn to every split; batched=True passes batches of rows per call
# instead of one row at a time.
tokenized_datasets = raw_datasets.map(tokenize_fn, batched=True)

                                                                                                         

In [47]:
# map() kept the original columns and added input_ids, token_type_ids and attention_mask.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1821
    })
})

In [52]:
from datasets import load_metric

# Load the metric that goes with the GLUE 'sst2' task.
# NOTE(review): the recorded output echoes this line the way a Python warning does;
# `datasets.load_metric` is deprecated in favour of the separate `evaluate` package
# (`evaluate.load`) - confirm against the installed `datasets` version before upgrading.
metric = load_metric('glue', 'sst2')

  metric = load_metric('glue', 'sst2')
Downloading builder script: 5.76kB [00:00, ?B/s]                                                         


In [53]:
def compute_metrics(logits_and_labels):
    """Score predictions against reference labels with the loaded metric.

    Args:
        logits_and_labels: a pair that unpacks into ``(logits, labels)``.

    Returns:
        Whatever ``metric.compute`` returns for the predicted class ids
        versus the reference labels.
    """
    scores, references = logits_and_labels
    # The predicted class is the index of the largest score along the last axis.
    predicted = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted, references=references)

In [54]:
from transformers import TrainingArguments

# Training configuration: a single epoch, with evaluation and checkpoint saving
# both set to the 'epoch' strategy. Checkpoints are written under 'my_trainer'.
training_args = TrainingArguments(
    output_dir='my_trainer',
    num_train_epochs=1,
    evaluation_strategy='epoch',
    save_strategy='epoch',
)

In [55]:
from transformers import Trainer

# Wire everything together: the model, the training configuration, the tokenized
# train/validation splits, the tokenizer and the metric callback.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [56]:
# Run fine-tuning as configured above (one epoch, evaluated on the validation split).
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,0.2107,0.383028,0.904817


TrainOutput(global_step=8419, training_loss=0.21458465416678651, metrics={'train_runtime': 580.0088, 'train_samples_per_second': 116.117, 'train_steps_per_second': 14.515, 'total_flos': 1086838797485400.0, 'train_loss': 0.21458465416678651, 'epoch': 1.0})

In [57]:
# Persist the fine-tuned model to a local directory so it can be reloaded by path below.
trainer.save_model('my_saved_model_path')

In [59]:
from transformers import pipeline

# Load the fine-tuned model from disk into an inference pipeline.
# Use the first GPU when CUDA is available and fall back to CPU (-1) otherwise,
# so this cell also runs on machines without a GPU instead of failing on device 0.
clf = pipeline(
    'text-classification',
    model='my_saved_model_path',
    device=0 if torch.cuda.is_available() else -1
)

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [72]:
# The recorded result uses the generic label name 'LABEL_1'; the saved config is
# patched further down to give the classes readable names.
clf('This movie is great!')

[{'label': 'LABEL_1', 'score': 0.9982436895370483}]

In [67]:
# Optional (Unix-like shells only): inspect the saved config with `!cat my_saved_model_path/config.json`

In [69]:
import json

# Give the two output classes human-readable names by patching the saved config
# in place, so anything loaded from this directory reports 'negative'/'positive'.
config_path = 'my_saved_model_path/config.json'
with open(config_path, encoding='utf-8') as f:
    j = json.load(f)

j['id2label'] = {0: 'negative', 1: 'positive'}
# Keep the reverse mapping consistent with id2label for code that looks labels up by name.
j['label2id'] = {name: idx for idx, name in j['id2label'].items()}

# json.dump writes the integer keys of id2label as strings ("0", "1").
with open(config_path, 'w', encoding='utf-8') as f:
    json.dump(j, f, indent=2)

In [70]:
# Reload the pipeline from the same directory so it picks up the patched config.
# Use the first GPU when CUDA is available and fall back to CPU (-1) otherwise,
# so this cell also runs on machines without a GPU instead of failing on device 0.
clf_labeled = pipeline(
    'text-classification',
    model='my_saved_model_path',
    device=0 if torch.cuda.is_available() else -1
)

In [71]:
# Same input as before: the recorded score is unchanged and only the label name differs.
clf_labeled('This movie is great!')

[{'label': 'positive', 'score': 0.9982436895370483}]