# BERT Basic Process

In [1]:
import torch
from transformers import BertTokenizer, BertModel

# ===== 1. Setup =====
# Prefer Apple-silicon MPS, then CUDA, and fall back to the CPU.
device = (
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

In [2]:
# Load the tokenizer and the encoder weights from the same pretrained checkpoint.
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name).to(device)  # Move model to the selected device (MPS, CUDA, or CPU)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

In [3]:
# ===== 2. Tokenization =====
text = "BERT transforms NLP."
print(f"Original text: '{text}'")
print(f"Word tokenization (naive): {text.split()}")  # ['BERT', 'transforms', 'NLP.']

Original text: 'BERT transforms NLP.'
Word tokenization (naive): ['BERT', 'transforms', 'NLP.']


In [4]:
# Tokenize with BERT's WordPiece
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(device)
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [5]:
input_ids = inputs["input_ids"]
input_ids.shape

torch.Size([1, 7])

In [10]:
inputs["token_type_ids"]

tensor([[0, 0, 0, 0, 0, 0, 0]], device='mps:0')

In [11]:
inputs["attention_mask"]

tensor([[1, 1, 1, 1, 1, 1, 1]], device='mps:0')

In [7]:
tokens = tokenizer.convert_ids_to_tokens(input_ids[0])

In [8]:
print("\n=== Tokenization Details ===")
print(f"Tokens: {tokens}")  # ['[CLS]', 'bert', 'transforms', 'nl', '##p', '.', '[SEP]']
print(f"Token IDs: {input_ids[0].tolist()}")  # [101, 14324, 21743, 17953, 2361, 1012, 102]
# "NLP." is split into 'nl', '##p', '.' and [CLS]/[SEP] are added, so 3 words become 7 tokens.
print(f"Sequence length before/after tokenization: {len(text.split())} words → {len(tokens)} tokens")


=== Tokenization Details ===
Tokens: ['[CLS]', 'bert', 'transforms', 'nl', '##p', '.', '[SEP]']
Token IDs: [101, 14324, 21743, 17953, 2361, 1012, 102]
Sequence length before/after tokenization: 3 words → 7 tokens


In [12]:
# ===== 3. Model Inference =====
with torch.no_grad():
    outputs = model(**inputs)

In [13]:
outputs.keys()

odict_keys(['last_hidden_state', 'pooler_output'])

In [14]:
# ===== 4. Embedding Extraction =====
last_hidden_states = outputs.last_hidden_state  # Shape: [batch_size, seq_len, hidden_size]
pooler_output = outputs.pooler_output            # Shape: [batch_size, hidden_size]

- The `pooler_output` is a single fixed-size vector intended to summarize the entire input sequence.
- It is obtained by taking the final hidden state of the [CLS] token (the first token) and passing it through a dense layer followed by a tanh activation.
- Its shape is [batch_size, hidden_size], where:
  - batch_size: number of sequences being processed at once (1 here, because a single text is encoded)
  - hidden_size: dimension of BERT's hidden representations (768 for bert-base-uncased)

In [15]:
print("\n=== Tensor Shapes ===")
print(f"Input IDs shape: {input_ids.shape}")               # [1, 7] (batch=1, 7 tokens)
print(f"Last hidden states shape: {last_hidden_states.shape}")  # [1, 7, 768]
print(f"Pooler output (CLS) shape: {pooler_output.shape}") # [1, 768]


=== Tensor Shapes ===
Input IDs shape: torch.Size([1, 7])
Last hidden states shape: torch.Size([1, 7, 768])
Pooler output (CLS) shape: torch.Size([1, 768])


In [16]:
# ===== 5. Key Embeddings =====
cls_embedding = last_hidden_states[0, 0, :]      # [CLS] token embedding (shape: [768])
sep_embedding = last_hidden_states[0, -1, :]     # [SEP] token embedding (shape: [768])
word_bert_embedding = last_hidden_states[0, 1, :] # Embedding for "bert" (shape: [768])

In [17]:
print("\n=== Embedding Samples ===")
# Same three lines as before, emitted from one loop instead of three copies.
for label, vector in (
    ("[CLS]", cls_embedding),
    ("[SEP]", sep_embedding),
    ("Word 'bert'", word_bert_embedding),
):
    print(f"{label} embedding (first 5 dims): {vector[:5].cpu().numpy()}")


=== Embedding Samples ===
[CLS] embedding (first 5 dims): [-0.6791783  -0.27145147  0.18689598 -0.30490595 -0.05804099]
[SEP] embedding (first 5 dims): [ 0.6691261  -0.08157957 -0.3835136   0.38256446 -0.29128885]
Word 'bert' embedding (first 5 dims): [ 0.2585698  -0.14524582  0.5503362  -0.27279094 -0.12074907]


# BERT for Sentence Classification

In [28]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F

# ===== 1. Setup =====
# Accelerator priority: MPS, then CUDA, then CPU.
device = (
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

# Load pre-trained model for sentiment analysis
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)

# ===== 2. Define sample texts =====
# One clearly positive, one lukewarm, and one clearly negative review.
texts = [
    "I absolutely loved this movie! The acting was fantastic.",
    "The product was okay, but could be better.",
    "This is the worst experience I've ever had. Terrible service!",
]

# ===== 3. Process and predict =====
def get_sentiment(text):
    """Predict a star rating for a single piece of text.

    Relies on the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        text: The text to classify (a single string).

    Returns:
        tuple: ``(rating, confidence)`` where ``rating`` is a plain Python
        ``int`` equal to the predicted class index plus one (1-5 stars for
        this model) and ``confidence`` is the softmax probability of that
        class as a Python ``float``.
    """
    # Tokenize and prepare input
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)

    # Get prediction
    with torch.no_grad():
        outputs = model(**inputs)
        # Class probabilities for the first (only) sequence in the batch.
        probabilities = F.softmax(outputs.logits, dim=-1)[0]

    # Take the argmax along the class axis explicitly and convert to Python
    # scalars, so callers get an int rating instead of a 0-dim device tensor.
    class_index = int(torch.argmax(probabilities, dim=-1).item())
    confidence = probabilities[class_index].item()

    # Convert prediction to rating (1-5 stars)
    return class_index + 1, confidence

# ===== 4. Test the model =====
print("=== Sentiment Analysis Results ===")
for text in texts:
    rating, confidence = get_sentiment(text)
    # A single print with a newline separator emits the same three lines per sample.
    print(
        f"\nText: {text}",
        f"Rating: {rating} stars",
        f"Confidence: {confidence:.2%}",
        sep="\n",
    )

tokenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/669M [00:00<?, ?B/s]

=== Sentiment Analysis Results ===

Text: I absolutely loved this movie! The acting was fantastic.
Rating: 5 stars
Confidence: 93.70%

Text: The product was okay, but could be better.
Rating: 3 stars
Confidence: 77.18%

Text: This is the worst experience I've ever had. Terrible service!
Rating: 1 stars
Confidence: 97.04%


# BERT for Token Classification

In [30]:
from transformers import AutoTokenizer, BertForTokenClassification
import torch

# ===== 1. Setup =====
# Accelerator priority: MPS, then CUDA, then CPU.
device = (
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
model = BertForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english").to(device)

# The encoded batch is not moved to `device` here; a later cell does that
# right before inference.
inputs = tokenizer(
    "HuggingFace is a company based in Paris and New York",
    add_special_tokens=False,
    return_tensors="pt",
)

print(inputs.keys())

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])


In [31]:
print(inputs['input_ids'].shape)

torch.Size([1, 13])


In [32]:
print(inputs['attention_mask'])

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])


In [35]:
inputs=inputs.to(device)
with torch.no_grad():
    output = model(**inputs)

In [37]:
output.keys()

odict_keys(['logits'])

In [36]:
logits = output.logits

In [38]:
print(logits.shape)

torch.Size([1, 13, 9])


In [39]:
predicted_token_class_ids = logits.argmax(-1)
predicted_token_class_ids

tensor([[0, 6, 6, 6, 0, 0, 0, 0, 0, 8, 0, 8, 8]], device='mps:0')

In [40]:
print(predicted_token_class_ids.shape)

torch.Size([1, 13])


In [41]:
model.config.id2label

{0: 'O',
 1: 'B-MISC',
 2: 'I-MISC',
 3: 'B-PER',
 4: 'I-PER',
 5: 'B-ORG',
 6: 'I-ORG',
 7: 'B-LOC',
 8: 'I-LOC'}

In [42]:
# Note that tokens are classified rather than input words, which means that
# there might be more predicted token classes than words.
# Multiple token classes might account for the same word.
# Map each predicted class id of the first (only) sequence to its label string.
predicted_tokens_classes = [model.config.id2label[t.item()] for t in predicted_token_class_ids[0]]
predicted_tokens_classes

['O',
 'I-ORG',
 'I-ORG',
 'I-ORG',
 'O',
 'O',
 'O',
 'O',
 'O',
 'I-LOC',
 'O',
 'I-LOC',
 'I-LOC']

In [43]:
# Demonstrates the loss API: the model's own argmax predictions are reused as
# labels, so the loss shown for this cell is correspondingly small.
# NOTE: unlike the inference cell above, this forward pass is not wrapped in
# torch.no_grad().
labels = predicted_token_class_ids
loss = model(**inputs, labels=labels).loss
round(loss.item(), 2)

0.01

# BERT for Masked Language Modeling

In [45]:
from transformers import AutoTokenizer, BertForMaskedLM
import torch

# ===== 1. Setup =====
# Accelerator priority: MPS, then CUDA, then CPU.
device = (
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
model = BertForMaskedLM.from_pretrained("google-bert/bert-base-uncased").to(device)

# Encode a prompt containing one [MASK] slot and move it to the model's device.
inputs = tokenizer("The capital of France is [MASK].", return_tensors="pt").to(device)

# Inference only: no gradients are needed to read the vocabulary logits.
with torch.no_grad():
    logits = model(**inputs).logits

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of the model checkpoint at google-bert/bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with ano

In [46]:
print(logits.shape)

torch.Size([1, 9, 30522])


In [47]:
print(tokenizer.mask_token_id)

103


In [49]:
print(inputs.input_ids)

tensor([[ 101, 1996, 3007, 1997, 2605, 2003,  103, 1012,  102]],
       device='mps:0')


In [50]:
# retrieve index of [MASK]
mask_token_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
mask_token_index

tensor([6], device='mps:0')

In [51]:
predicted_token_id = logits[0, mask_token_index].argmax(axis=-1)
predicted_token_id

tensor([3000], device='mps:0')

In [52]:
print(tokenizer.decode(predicted_token_id))

paris


In [57]:
labels = tokenizer("The capital of France is paris.", return_tensors="pt")["input_ids"].to(device)
# mask labels of non-[MASK] tokens
labels = torch.where(inputs.input_ids == tokenizer.mask_token_id, labels, -100)
labels

tensor([[-100, -100, -100, -100, -100, -100, 3000, -100, -100]],
       device='mps:0')

In [58]:
print(inputs.input_ids)

tensor([[ 101, 1996, 3007, 1997, 2605, 2003,  103, 1012,  102]],
       device='mps:0')


In [59]:
outputs = model(**inputs, labels=labels)
round(outputs.loss.item(), 2)

0.88

# BERT for Question Answering

In [13]:
from transformers import AutoTokenizer, BertForQuestionAnswering
import torch

# Load the QA checkpoint and write a copy of the model under ./output.
tokenizer = AutoTokenizer.from_pretrained("deepset/bert-base-cased-squad2")
model = BertForQuestionAnswering.from_pretrained("deepset/bert-base-cased-squad2")
model.save_pretrained('./output/deepset-bert-base-cased-squad2')
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"

# Question and context are encoded together as one input.
# (This cell does not move the model or the inputs to `device`.)
inputs = tokenizer(question, text, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# Highest-scoring start and end token positions.
answer_start_index = outputs.start_logits.argmax()
answer_end_index = outputs.end_logits.argmax()

# The slice includes the end token (hence the + 1).
predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
# NOTE(review): this value is neither assigned nor the cell's last expression,
# so the decoded answer is not displayed by this cell.
tokenizer.decode(predict_answer_tokens, skip_special_tokens=True)

# target is "nice puppet"
target_start_index = torch.tensor([14])
target_end_index = torch.tensor([15])

# Passing gold start/end positions makes the forward pass also return a loss.
outputs = model(**inputs, start_positions=target_start_index, end_positions=target_end_index)
loss = outputs.loss
round(loss.item(), 2)

loading configuration file config.json from cache at /Users/kaikailiu/.cache/huggingface/hub/models--deepset--bert-base-cased-squad2/snapshots/2a255b76eb545d2f0c301feaa925c758b7b1f153/config.json
Model config BertConfig {
  "_name_or_path": "deepset/bert-base-cased-squad2",
  "architectures": [
    "BertForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "language": "english",
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "name": "Bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.35.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

loading file vocab.txt from cache at /Users/kaikailiu/.cache/huggingface/hub/models--deeps

7.41

In [14]:
import os
# Store the weights (state dict only) in the same directory used by save_pretrained.
modelfilepath = os.path.join('./output/deepset-bert-base-cased-squad2', 'savedmodel.pth')
torch.save({'model_state_dict': model.state_dict()}, modelfilepath)

In [3]:
from transformers import BertConfig, BertModel

# Initializing a BERT bert-base-uncased style configuration
configuration = BertConfig()

# Initializing a model (with random weights) from the bert-base-uncased style configuration
bertmodel = BertModel(configuration)

# Accessing the model configuration
configuration = bertmodel.config

In [9]:
from transformers.models.bert.modeling_bert import BertPreTrainedModel
from transformers.utils.generic import ModelOutput
from typing import List, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from dataclasses import dataclass

@dataclass
class QuestionAnsweringModelOutput(ModelOutput):
    """
    Output container for question answering models.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when both `start_positions` and `end_positions` are provided):
            Span extraction loss. As computed by `myBertForQuestionAnswering.forward` in this notebook, it is the
            average of the Cross-Entropy losses for the start and end positions.
        start_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Span-start scores (before SoftMax).
        end_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Span-end scores (before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    # Every field defaults to None; `loss` is only filled in when labels are supplied.
    loss: Optional[torch.FloatTensor] = None
    start_logits: torch.FloatTensor = None
    end_logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None

class myBertForQuestionAnswering(BertPreTrainedModel):
    """BERT encoder followed by a linear span-prediction head.

    The head maps each token's hidden state to ``config.num_labels`` scores,
    which ``forward`` unpacks into start logits and end logits.
    """

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        # Encoder is built without the pooling layer; only per-token states are used.
        self.bert = BertModel(config, add_pooling_layer=False)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        start_positions: Optional[torch.Tensor] = None,
        end_positions: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]:
        r"""
        Run the encoder and the span head, optionally computing the span loss.

        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Index of the first token of the labelled answer span. Values are clamped to
            `[0, sequence_length]`; a value equal to `sequence_length` is ignored by the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Index of the last token of the labelled answer span. Values are clamped to
            `[0, sequence_length]`; a value equal to `sequence_length` is ignored by the loss.

        The loss is only computed when both `start_positions` and `end_positions` are given, and is the
        mean of the start and end cross-entropy losses. Returns a `QuestionAnsweringModelOutput` when
        `return_dict` resolves to true, otherwise a plain tuple (with the loss first when it was computed).
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Per-token hidden states -> one score per label for every token.
        token_states = outputs[0]
        span_scores = self.qa_outputs(token_states)

        # Split the last axis into single-column pieces and drop that axis.
        start_logits, end_logits = (
            piece.squeeze(-1).contiguous() for piece in span_scores.split(1, dim=-1)
        )

        total_loss = None
        if not (start_positions is None or end_positions is None):
            # Labels may arrive with an extra trailing dimension; drop it.
            if start_positions.dim() > 1:
                start_positions = start_positions.squeeze(-1)
            if end_positions.dim() > 1:
                end_positions = end_positions.squeeze(-1)

            # Out-of-range positions are clamped onto `seq_len`, which the loss ignores.
            seq_len = start_logits.size(1)
            start_positions = start_positions.clamp(0, seq_len)
            end_positions = end_positions.clamp(0, seq_len)

            criterion = CrossEntropyLoss(ignore_index=seq_len)
            start_loss = criterion(start_logits, start_positions)
            end_loss = criterion(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if return_dict:
            return QuestionAnsweringModelOutput(
                loss=total_loss,
                start_logits=start_logits,
                end_logits=end_logits,
                hidden_states=outputs.hidden_states,
                attentions=outputs.attentions,
            )

        # Tuple form: logits followed by any extra encoder outputs, loss first if present.
        output = (start_logits, end_logits) + outputs[2:]
        if total_loss is None:
            return output
        return (total_loss,) + output

In [21]:
configuration.hidden_size

768

In [22]:
configuration.num_labels

2

In [17]:
# Set the vocabulary size to 28996, the value in the deepset/bert-base-cased-squad2 config,
# so the embedding matrix matches the checkpoint weights loaded in the next cell.
configuration.vocab_size = 28996
mybertqa = myBertForQuestionAnswering(config=configuration)

In [18]:
# Restore the weights written earlier with torch.save into the custom QA model.
modelfilepath=os.path.join('./output/deepset-bert-base-cased-squad2', 'savedmodel.pth')
checkpoint = torch.load(modelfilepath, map_location='cpu')
mybertqa.load_state_dict(checkpoint['model_state_dict'])
# shape[0] is the number of rows of the input-embedding weight, i.e. one row per vocabulary entry.
embedding_size = mybertqa.get_input_embeddings().weight.shape[0]
print("Embeeding size:", embedding_size) # 28996 for this checkpoint

Embeeding size: 28996


In [19]:
# Same question/context pair as before, now answered by the custom model.
question = "Who was Jim Henson?"
text = "Jim Henson was a nice puppet"

inputs = tokenizer(question, text, return_tensors="pt")
with torch.no_grad():
    outputs = mybertqa(**inputs)

# Positions with the highest start and end scores.
answer_start_index = torch.argmax(outputs.start_logits)
answer_end_index = torch.argmax(outputs.end_logits)

# Slice the answer span (end token included) and turn it back into text.
predict_answer_tokens = inputs["input_ids"][0, answer_start_index : answer_end_index + 1]
result = tokenizer.decode(predict_answer_tokens, skip_special_tokens=True)

In [20]:
result

'a nice puppet'

Reference: the Hugging Face BERT implementation (`modeling_bert.py`): https://github.com/huggingface/transformers/blob/main/src/transformers/models/bert/modeling_bert.py

In [23]:
from mybertmodel import loadsave_model, load_QAbertmodel
loadsave_model()

loading configuration file config.json from cache at /Users/kaikailiu/.cache/huggingface/hub/models--deepset--bert-base-cased-squad2/snapshots/2a255b76eb545d2f0c301feaa925c758b7b1f153/config.json
Model config BertConfig {
  "architectures": [
    "BertForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "language": "english",
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "name": "Bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.35.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

loading weights file model.safetensors from cache at /Users/kaikailiu/.cache/huggingface/hub/models--deepset--bert-base-cased-squad2/snapshots/

In [24]:
mybertqa, tokenizer = load_QAbertmodel()

loading file vocab.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json


Embeeding size: 28996


In [25]:
mybertqa

myBertForQuestionAnswering(
  (bert): myBertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, 