In [9]:
from transformers import AutoModel, AutoTokenizer

In [10]:
# Fetch the tokenizer published alongside the ELECTRA-base discriminator checkpoint.
tokenizer = AutoTokenizer.from_pretrained('google/electra-base-discriminator')

# Print, one per line, the tokenizer's model_input_names and then its
# all_special_tokens.
print(tokenizer.model_input_names, tokenizer.all_special_tokens, sep='\n')

['input_ids', 'token_type_ids', 'attention_mask']
['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']


In [11]:
# Load the pretrained checkpoint through AutoModel. The load warning printed
# below reports that the resulting object is an ElectraModel and that the
# checkpoint's `discriminator_predictions.*` weights are not used by it.
model = AutoModel.from_pretrained("google/electra-base-discriminator")
# Bare expression: the notebook displays the module tree as the cell output.
model

Some weights of the model checkpoint at google/electra-base-discriminator were not used when initializing ElectraModel: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


ElectraModel(
  (embeddings): ElectraEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): ElectraEncoder(
    (layer): ModuleList(
      (0): ElectraLayer(
        (attention): ElectraAttention(
          (self): ElectraSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): ElectraSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
            ... (remaining module tree truncated in this export)

In [12]:
# Display the configuration object attached to the loaded model
# (hidden size, number of layers/heads, vocabulary size, dropout rates, ...).
model.config

ElectraConfig {
  "_name_or_path": "google/electra-base-discriminator",
  "architectures": [
    "ElectraForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 768,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj": true,
  "transformers_version": "4.17.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [13]:
# List the dotted name of every parameter in the model. named_parameters()
# yields (name, tensor) pairs; the tensor is not needed here, hence the
# underscore-prefixed placeholder.
for name, _param in model.named_parameters():
    print(name)

embeddings.word_embeddings.weight
embeddings.position_embeddings.weight
embeddings.token_type_embeddings.weight
embeddings.LayerNorm.weight
embeddings.LayerNorm.bias
encoder.layer.0.attention.self.query.weight
encoder.layer.0.attention.self.query.bias
encoder.layer.0.attention.self.key.weight
encoder.layer.0.attention.self.key.bias
encoder.layer.0.attention.self.value.weight
encoder.layer.0.attention.self.value.bias
encoder.layer.0.attention.output.dense.weight
encoder.layer.0.attention.output.dense.bias
encoder.layer.0.attention.output.LayerNorm.weight
encoder.layer.0.attention.output.LayerNorm.bias
encoder.layer.0.intermediate.dense.weight
encoder.layer.0.intermediate.dense.bias
encoder.layer.0.output.dense.weight
encoder.layer.0.output.dense.bias
encoder.layer.0.output.LayerNorm.weight
encoder.layer.0.output.LayerNorm.bias
encoder.layer.1.attention.self.query.weight
encoder.layer.1.attention.self.query.bias
encoder.layer.1.attention.self.key.weight
encoder.layer.1.attention.self.key.bias
... (remaining parameter names truncated in this export)

In [14]:
# Encode one short sentence; return_tensors='pt' asks the tokenizer for
# PyTorch tensors.
text = "i love you"
encoded_input = tokenizer(text, return_tensors='pt')

# Print, in order: the whole encoding, its keys, and the shape of the
# input_ids tensor ([1, 5] for this sentence).
for shown in (
    encoded_input,
    encoded_input.keys(),
    encoded_input['input_ids'].shape,
):
    print(shown)

{'input_ids': tensor([[ 101, 1045, 2293, 2017,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}
dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
torch.Size([1, 5])


In [15]:
# Forward pass: the entries of the tokenizer's encoding are unpacked and passed
# to the model as keyword arguments.
# NOTE(review): the displayed output carries a grad_fn, so autograd is recording
# this call. If the result is only inspected, consider running it under
# torch.no_grad() — confirm first that no later cell needs gradients.
result = model(**encoded_input)
# Bare expression: the notebook displays the returned output object.
result

BaseModelOutputWithPastAndCrossAttentions(last_hidden_state=tensor([[[-0.1023,  0.0133, -0.1458,  ..., -0.5055, -0.1431,  0.1421],
         [-0.5768, -0.0330,  0.5017,  ..., -0.1052,  0.0763,  0.1921],
         [-0.4427,  0.0047,  0.0832,  ..., -0.3500, -0.4566,  0.1960],
         [ 0.1296, -0.0593,  0.3210,  ..., -0.1299, -0.3550,  0.6285],
         [-0.1023,  0.0133, -0.1458,  ..., -0.5055, -0.1431,  0.1421]]],
       grad_fn=<NativeLayerNormBackward0>), past_key_values=None, hidden_states=None, attentions=None, cross_attentions=None)

In [16]:
# Pull the last_hidden_state tensor out of the model output once, then show
# its values followed by its shape.
last_hidden = result.last_hidden_state
print(last_hidden)
# Shape is [1, 5, 768] for this input (input_ids length is limited to at most 512).
print(last_hidden.shape)


tensor([[[-0.1023,  0.0133, -0.1458,  ..., -0.5055, -0.1431,  0.1421],
         [-0.5768, -0.0330,  0.5017,  ..., -0.1052,  0.0763,  0.1921],
         [-0.4427,  0.0047,  0.0832,  ..., -0.3500, -0.4566,  0.1960],
         [ 0.1296, -0.0593,  0.3210,  ..., -0.1299, -0.3550,  0.6285],
         [-0.1023,  0.0133, -0.1458,  ..., -0.5055, -0.1431,  0.1421]]],
       grad_fn=<NativeLayerNormBackward0>)
torch.Size([1, 5, 768])
