In [14]:
import torch
from transformers import BertConfig, BertModel, AutoModel, AutoTokenizer

# Create a model with randomly initialized parameters

In [2]:
# Build a BERT model straight from a default configuration. No checkpoint is
# loaded in this cell, so the parameters hold their random initial values.
config = BertConfig()
model = BertModel(config=config)

In [3]:
# Show the hyperparameters stored in the default BertConfig built above.
print(config)

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.31.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



# Create a model from a pretrained checkpoint

In [4]:
# Checkpoint identifier shared by the model-loading and tokenizer-loading cells below.
checkpoint = "bert-base-cased"

In [5]:
# Replace the randomly initialized model with one loaded from `checkpoint`,
# naming the BERT model class explicitly.
model = BertModel.from_pretrained(pretrained_model_name_or_path=checkpoint)

## Model-agnostic code

In [6]:
# Same checkpoint, loaded through AutoModel so the code does not hard-code the
# BERT class (the "model-agnostic" variant).
model = AutoModel.from_pretrained(pretrained_model_name_or_path=checkpoint)

# Save the model

In [9]:
# Write the currently loaded model to a directory relative to this notebook.
model.save_pretrained(save_directory="../temp/02_models_bert/")

# Perform inference

In [13]:
# Load the tokenizer from the same checkpoint as the model so that the
# vocabulary matches the model's embeddings.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [31]:
# A small batch of sentences with different lengths.
raw_inputs = [
    "Hi, there!",
    "Cool.",
    "Nice job!",
]

# Tokenize the whole batch at once: pad so the sequences can be stacked,
# truncate anything too long, and return PyTorch tensors ("pt").
inputs = tokenizer(
    raw_inputs,
    padding=True,
    truncation=True,
    return_tensors="pt",
)

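Without attention masks: only `input_ids` is passed to the model, so the padding positions added by the tokenizer are processed like real tokens.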

In [32]:
# Pass only the token ids: no attention mask reaches the model, so the padding
# added by the tokenizer is not masked out.
# This is inference only, so disable gradient tracking to avoid building an
# autograd graph that is never used.
with torch.no_grad():
    outputs = model(inputs.input_ids)

In [33]:
# Display the result of the forward pass that was run without attention masks.
print(outputs)

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.6236,  0.3565,  0.5802,  ..., -0.3294,  0.5388, -0.2190],
         [ 0.6428, -0.4094,  0.7732,  ..., -0.3426,  0.3734, -0.1282],
         [ 0.6879,  0.2467,  1.0143,  ...,  0.1481, -0.2785, -0.0918],
         [ 0.7638, -0.3586,  0.2692,  ..., -0.2426,  0.5224,  0.2166],
         [ 0.6864, -0.1121,  0.6731,  ...,  0.1787,  0.3986,  0.0682],
         [ 1.1965,  0.3015, -0.1025,  ...,  0.4769,  1.0894, -0.0892]],

        [[ 0.2361,  0.1350,  0.0641,  ..., -0.0991,  0.2830, -0.0489],
         [ 0.1867, -0.3937,  0.8346,  ..., -0.3495,  0.2754,  0.2584],
         [ 0.2668,  0.3711,  0.5993,  ...,  0.2455,  0.1631,  0.0813],
         [ 0.7885,  0.7260, -0.1358,  ..., -0.1239,  0.7309, -0.4783],
         [-0.4073,  0.1227,  0.4143,  ...,  0.3392,  0.1903,  0.5902],
         [-0.2325, -0.0753,  0.5208,  ..., -0.3401,  0.0299,  0.4585]],

        [[ 0.5610,  0.3537,  0.1751,  ..., -0.2953,  0.3454, -0.1724],
         [

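With attention masks: the full tokenizer output is passed to the model. Compared with the run above, the first (unpadded) sequence gives the same values, while the shorter, padded sequences change.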

In [34]:
# Unpack everything the tokenizer returned as keyword arguments, so the model
# receives the attention mask along with the token ids.
# This is inference only, so disable gradient tracking to avoid building an
# autograd graph that is never used.
with torch.no_grad():
    outputs = model(**inputs)

In [35]:
# Display the result of the forward pass that was run with attention masks.
print(outputs)

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.6236,  0.3565,  0.5802,  ..., -0.3294,  0.5388, -0.2190],
         [ 0.6428, -0.4094,  0.7732,  ..., -0.3426,  0.3734, -0.1282],
         [ 0.6879,  0.2467,  1.0143,  ...,  0.1481, -0.2785, -0.0918],
         [ 0.7638, -0.3586,  0.2692,  ..., -0.2426,  0.5224,  0.2166],
         [ 0.6864, -0.1121,  0.6731,  ...,  0.1787,  0.3986,  0.0682],
         [ 1.1965,  0.3015, -0.1025,  ...,  0.4769,  1.0894, -0.0892]],

        [[ 0.3128,  0.1718,  0.2099,  ..., -0.0721,  0.4919, -0.1383],
         [ 0.1545, -0.3757,  0.7187,  ..., -0.3130,  0.2822,  0.1883],
         [ 0.4123,  0.3721,  0.5484,  ...,  0.0788,  0.5681, -0.2757],
         [ 0.8356,  0.3964, -0.4121,  ...,  0.1838,  1.6365, -0.4806],
         [-0.0193,  0.1555, -0.0885,  ...,  0.0891,  0.4179,  0.2057],
         [-0.2468,  0.0290,  0.3317,  ..., -0.1685,  0.3317,  0.2554]],

        [[ 0.5204,  0.2707,  0.1696,  ..., -0.1170,  0.3442, -0.0916],
         [