## Seq2seq with pre-trained BERT

This notebook shows some examples of using the pre-trained BERT model for seq2seq tasks.

In [1]:
from transformers import EncoderDecoderModel, BertTokenizer
import torch

### Example 1

Load the tokenizer.

In [2]:
# Tokenizer for the "bert-base-uncased" checkpoint; reused below to encode
# the example sentence.
tokenizer = BertTokenizer.from_pretrained(
    "bert-base-uncased",
)

Load the encoder-decoder model.

In [6]:
# Build the encoder-decoder model, initialising both the encoder and the
# decoder from the same "bert-base-uncased" checkpoint.
model = EncoderDecoderModel.from_encoder_decoder_pretrained(
    "bert-base-uncased",  # encoder checkpoint
    "bert-base-uncased",  # decoder checkpoint
)
# Bare expression so the notebook prints the module structure.
model

EncoderDecoderModel(
  (encoder): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_af

In [4]:
# Encode the example sentence (special tokens included). Wrapping the token
# ids in an outer list gives the tensor a leading dimension of size 1, the
# same result as calling .unsqueeze(0) on the flat tensor.
input_ids = torch.tensor(
    [tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)]
)

In [7]:
# Forward pass: the same token ids are supplied as both the encoder input
# and the decoder input.
outputs = model(
    input_ids=input_ids,
    decoder_input_ids=input_ids,
)

In [8]:
# Bare expression so the notebook renders the result of the forward pass.
outputs

(tensor([[[ -6.3390,  -6.3665,  -6.4600,  ...,  -5.5355,  -4.1787,  -5.8384],
          [ -6.0605,  -6.0980,  -6.1492,  ...,  -5.0190,  -3.6619,  -5.6481],
          [ -6.2835,  -6.1857,  -6.2198,  ...,  -5.8243,  -3.9650,  -4.2239],
          ...,
          [ -8.6994,  -8.6061,  -8.6930,  ...,  -8.4026,  -7.0615,  -6.1120],
          [ -7.7221,  -7.7373,  -7.7094,  ...,  -7.6440,  -6.1568,  -5.5106],
          [-13.5756, -13.0523, -12.9125,  ..., -10.4893, -11.9085,  -9.3556]]],
        grad_fn=<AddBackward0>),
 tensor([[[-0.1144,  0.1937,  0.1250,  ..., -0.3827,  0.2107,  0.5407],
          [ 0.5308,  0.3207,  0.3665,  ..., -0.0036,  0.7579,  0.0388],
          [-0.4877,  0.8849,  0.4256,  ..., -0.6976,  0.4458,  0.1231],
          ...,
          [-0.7003, -0.1815,  0.3297,  ..., -0.4838,  0.0680,  0.8901],
          [-1.0355, -0.2567, -0.0317,  ...,  0.3197,  0.3999,  0.1795],
          [ 0.6080,  0.2610, -0.3131,  ...,  0.0311, -0.6283, -0.1994]]],
        grad_fn=<NativeLayerNormB