In [1]:
# Print the version of the `python` executable found on the shell PATH.
# NOTE(review): `!python` is resolved by the shell and may not be the interpreter
# running this kernel — `import sys; sys.version` would confirm the kernel's version.
!python --version


Python 3.9.21


In [2]:
# Trivial arithmetic cell: the value of the last expression is echoed as the cell output.
2+2

4

In [2]:
from transformers import BertConfig
from transformers import BertModel

# Start from a BertConfig created with no arguments, i.e. the library's default settings.
config = BertConfig()

# Instantiate the BERT architecture described by that config.
# No pretrained checkpoint is loaded here, so this model is untrained.
model = BertModel(config=config)


In [3]:
# Show the configuration's fields (hidden size, number of layers/heads, vocab size, ...).
print(config)

BertConfig {
  "_attn_implementation_autoset": true,
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.47.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



In [4]:
# The model built from a bare config above is untrained: it would have to be trained
# from scratch and would output gibberish most of the time until then.
# To save that time and energy, we load a pretrained model instead.

from transformers import BertModel

# Model card with more details: https://huggingface.co/google-bert/bert-base-cased
# NOTE: this rebinds `model`, replacing the untrained instance created earlier.
model = BertModel.from_pretrained("bert-base-cased")
# The files downloaded below are the checkpoint's config (config.json) and its pretrained
# weights (model.safetensors), originally trained by the authors of the model.
# The download location can be changed via the HF_HOME environment variable.

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

In [5]:
# After any extra training, the model can be saved as shown below.
# The target directory comes from the MODEL_DIR environment variable when it is set,
# otherwise it falls back to ~/dev/models. This avoids a hardcoded, user-specific
# absolute path, so the notebook also runs on machines other than the author's
# (where the fallback resolves to the same location as before).
import os
from pathlib import Path

MODEL_DIR = Path(os.environ.get("MODEL_DIR", Path.home() / "dev" / "models"))

# save_pretrained writes the model's config and weights into the given directory.
model.save_pretrained(MODEL_DIR / "my-bert-base-cased")

In [6]:
"""
models only understand encoded tokens and not sentences, hence the first task is to convert a string into tokens

for example - my name is khan ====> [[1234,7865,3579,658]]
and these encoded tokens will be fed to the model for training or to get answers.
for more information please refer to this link - https://huggingface.co/learn/nlp-course/chapter2/4?fw=pt
"""

'\nmodels only understand encoded tokens and not sentences, hence the first task is to convert a string into tokens\n\nfor example - my name is khan ====> [[1234,7865,3579,658]]\nand these encoded tokens will be fed to the model for training or to get answers.\nfor more information please refer to this link - https://huggingface.co/learn/nlp-course/chapter2/4?fw=pt\n'

In [8]:
# Now let's look at how a tokenizer is loaded. The two approaches shown (BertTokenizer
# in this cell, AutoTokenizer in the following one) produce the same result;
# they differ only in flexibility.

from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
# Calling the tokenizer on a sentence returns its input_ids, token_type_ids and attention_mask.
tokenizer("Using a Transformer network is simple")


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

{'input_ids': [101, 7993, 170, 13809, 23763, 2443, 1110, 3014, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [9]:
# AutoTokenizer: load the tokenizer for the same checkpoint through the generic Auto class.

from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
tokenizer("Using a Transformer network is simple")
# Nothing is downloaded this time (presumably the files fetched for BertTokenizer are reused):
# AutoTokenizer starts from the same checkpoint, and the output below is also the same.

{'input_ids': [101, 7993, 170, 13809, 23763, 2443, 1110, 3014, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}