<a href="https://colab.research.google.com/github/gmihaila/machine_learning_things/blob/master/learning_pytorch/pytorch_transformer_finetune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Links

https://modelzoo.co/model/pytorch-pretrained-bert

### Installs

In [0]:
from IPython.display import clear_output

!pip install pytorch-transformers
clear_output()

### Imports

In [0]:
import torch
import numpy as np
from torchsummary import summary
from pytorch_transformers import *
from keras.preprocessing.sequence import pad_sequences

# PyTorch-Transformers exposes one unified API across its architectures.
# Each row pairs a model class with its tokenizer class and the shortcut
# names of the pretrained weights registered for it.
MODELS = [
    # (model class, tokenizer class, pretrained weights shortcuts)
    (BertModel, BertTokenizer, BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys()),
    (OpenAIGPTModel, OpenAIGPTTokenizer, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP.keys()),
    (GPT2Model, GPT2Tokenizer, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys()),
    (TransfoXLModel, TransfoXLTokenizer, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP.keys()),
    (XLNetModel, XLNetTokenizer, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP.keys()),
    (XLMModel, XLMTokenizer, XLM_PRETRAINED_MODEL_ARCHIVE_MAP.keys()),
    (RobertaModel, RobertaTokenizer, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP.keys()),
    (DistilBertModel, DistilBertTokenizer, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys()),
]

# Display the weight shortcuts that can be used with the BERT classes.
list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())

Using TensorFlow backend.


['bert-base-uncased',
 'bert-large-uncased',
 'bert-base-cased',
 'bert-large-cased',
 'bert-base-multilingual-uncased',
 'bert-base-multilingual-cased',
 'bert-base-chinese',
 'bert-base-german-cased',
 'bert-large-uncased-whole-word-masking',
 'bert-large-cased-whole-word-masking',
 'bert-large-uncased-whole-word-masking-finetuned-squad',
 'bert-large-cased-whole-word-masking-finetuned-squad',
 'bert-base-cased-finetuned-mrpc']

### Load Model

In [0]:
# Checkpoint shortcut plus the model / tokenizer classes that will load it.
pretrained_weights = 'bert-base-uncased'
use_model = BertForSequenceClassification
use_tokenizer = BertTokenizer

# Tokenizer for the chosen checkpoint.
tokenizer = use_tokenizer.from_pretrained(pretrained_weights)

# Sequence-classification model with two labels; the remaining keyword
# arguments are extra options handed to `from_pretrained`.
model = use_model.from_pretrained(
    pretrained_weights,
    num_labels=2,
    output_hidden_states=True,
    output_attentions=True,
    torchscript=True,
)

print("Loaded ", pretrained_weights)

100%|██████████| 231508/231508 [00:00<00:00, 1177454.49B/s]
100%|██████████| 313/313 [00:00<00:00, 57627.72B/s]
100%|██████████| 440473133/440473133 [00:11<00:00, 38319041.81B/s]


Loaded  bert-base-uncased


In [0]:
## Print the module tree: every layer with its configured sizes.
print(repr(model))

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm(torch.Size([768]), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm(torch.Size([768]), eps=1e-12, elementwise_aff

### Example Sentence


In [0]:
# A single example; the [CLS] / [SEP] markers are written into the text by hand.
sentence = ["[CLS] This is my first test. [SEP]"]

# Tokenize every sentence in the list.
tokenized_texts = list(map(tokenizer.tokenize, sentence))

tokenized_texts

[['[CLS]', 'this', 'is', 'my', 'first', 'test', '.', '[SEP]']]

In [0]:
# Every sequence is padded (or cut) to this many ids.
MAX_LEN = 128

# Encode each sentence to ids, then pad / truncate at the END of the sequence.
input_ids = pad_sequences(
    [tokenizer.encode(sen) for sen in sentence],
    maxlen=MAX_LEN,
    dtype='long',
    padding='post',
    truncating='post',
)

# Attention mask: 1 wherever the id is positive, otherwise the id itself
# (0 for the padding positions added above).
attention_masks = np.where(input_ids > 0, 1, input_ids)

# The model consumes torch tensors, so convert the NumPy arrays.
input_ids = torch.tensor(input_ids)
attention_masks = torch.tensor(attention_masks)
# Label for the single example sentence.
label = torch.tensor([1])

input_ids, attention_masks, label

(tensor([[ 101, 2023, 2003, 2026, 2034, 3231, 1012,  102,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0]]),
 tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 

### Train model

In [0]:
# Hyper-parameters for the demo training step.
# NOTE(review): 1e-3 is far above the 2e-5 to 5e-5 range usually recommended
# for fine-tuning BERT - confirm it is intended for this demo.
lr = 1e-3
max_grad_norm = 1.0
num_total_steps = 1000
num_warmup_steps = 100
warmup_proportion = float(num_warmup_steps) / float(num_total_steps)  # 0.1 (informational only; not used below)

### In PyTorch-Transformers, the optimizer and the schedule are split and instantiated separately:
optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False)  # To reproduce BertAdam specific behavior set correct_bias=False
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=num_warmup_steps, t_total=num_total_steps)  # PyTorch scheduler
# NOTE(review): with a linear warmup the learning rate presumably starts at 0,
# so this single step may leave the weights unchanged - confirm, and run the
# step inside a loop over batches for real training.

## train sequence (a single step; a real run repeats this for every batch):
# Training mode only needs to be enabled once before the forward pass.
model.train()
optimizer.zero_grad()

# model outputs are always tuple in pytorch-transformers (see doc)
output = model(input_ids, token_type_ids=None, attention_mask=attention_masks, labels=label)

# When labels are passed, the first element of the output tuple is used as the loss.
loss = output[0]
loss.backward()

torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)  # Gradient clipping is not in AdamW anymore (so you can use amp without issue)

optimizer.step()
# Update learning rate schedule
scheduler.step()
# Clear the gradients so they do not accumulate into the next step.
optimizer.zero_grad()

### Predict

In [0]:
# Put the model in evaluation mode before predicting.
model.eval()

# No gradients are needed for inference.
with torch.no_grad():
  # NOTE: `logits` holds the whole output tuple; the class scores are element 0.
  logits = model(input_ids, token_type_ids=None, attention_mask=attention_masks)

# Predicted class per example: arg-max over the class dimension, done in torch,
# then moved to the CPU explicitly before converting to a NumPy array.
pred_flat = logits[0].argmax(dim=1).flatten().cpu().numpy()

print(pred_flat)

[0]


In [0]:
# Normalise the class scores of each example with a softmax over dimension 1.
torch.softmax(logits[0], dim=1)

tensor([[0.6654, 0.3346]])

In [0]:
# Raw (un-normalised) class scores: first element of the model output tuple.
logits[0]


tensor([[ 0.4153, -0.2721]])

### Evaluate

In [0]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
  """Return the fraction of examples whose arg-max prediction equals its label.

  Args:
    preds: 2-D scores, one row per example and one column per class. May be a
      torch tensor (on any device, with or without gradient tracking) or
      anything `np.asarray` accepts.
    labels: class indices, one per example, in any shape that flattens to one
      value per example. Torch tensor or array-like.

  Returns:
    The accuracy as a NumPy float between 0 and 1.

  Raises:
    ValueError: if the number of labels differs from the number of predictions.
  """
  def _as_numpy(values):
    # A bare `.numpy()` fails for tensors that require grad or live on a GPU,
    # so detach and move to the CPU first; non-tensors go through np.asarray.
    if torch.is_tensor(values):
      return values.detach().cpu().numpy()
    return np.asarray(values)

  pred_flat = np.argmax(_as_numpy(preds), axis=1).flatten()
  labels_flat = _as_numpy(labels).flatten()

  # Comparing arrays of different lengths would not give a per-example result.
  if pred_flat.shape != labels_flat.shape:
    raise ValueError(
        "preds and labels disagree on the number of examples: "
        "%d vs %d" % (pred_flat.shape[0], labels_flat.shape[0]))

  return np.sum(pred_flat == labels_flat) / len(labels_flat)



# Switch the model to evaluation mode before scoring the example.
model.eval()

# Forward pass without gradient tracking; `logits` receives the whole output
# tuple and its first element holds the class scores.
with torch.no_grad():
  logits = model(input_ids, attention_mask=attention_masks, token_type_ids=None)

# Share of examples whose predicted class matches `label`.
flat_accuracy(preds=logits[0], labels=label)

1.0