#### A Closer Look at Creating and Using a Model

In [1]:
# A BERT model is built from a configuration object, so create the default
# BertConfig first and print it to inspect the architecture settings.
from transformers import BertConfig
from transformers import BertModel

config = BertConfig()
print(config)

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.21.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



In [2]:
# Built from a bare config, so the weights are randomly initialised:
# the model runs, but its output is gibberish.
model = BertModel(config=config)

In [5]:
import os

# This checkpoint can be used for MLM (Masked Language Modeling) and
# NSP (Next Sentence Prediction).
checkpoint = 'bert-base-cased'

# Rebind `model` to a BertModel initialised from the checkpoint.
model = BertModel.from_pretrained(checkpoint)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
# Save into a dedicated "model" sub-directory of the working directory rather than
# dumping the files straight into the working directory itself; "model" is the
# folder the loading cell reads from.
# save_pretrained creates the directory if it does not exist yet.
model.save_pretrained(os.path.join(os.getcwd(), "model"))

In [14]:
from transformers import AutoModel

# Load the model back from disk. Use only relative paths.
# The path is written with a forward slash so it resolves on Windows, Linux and macOS;
# the previous '.\model' only resolved on Windows and contained the invalid escape '\m'.
# local_files_only=True keeps the lookup on disk instead of contacting the Hub.
# NOTE(review): this expects the saved files to live in ./model — confirm the save step writes there.
model = AutoModel.from_pretrained('./model', local_files_only=True)

In [15]:
import torch
from transformers import AutoTokenizer

# Three short sentences that are tokenized together as one batch.
sequences = [
    "Hello!",
    "Cool.",
    "Nice!",
]

# Tokenizer belonging to the same checkpoint as the model; padding=True asks it
# to pad the batch.
tknzr = AutoTokenizer.from_pretrained(checkpoint)
encoded = tknzr(sequences, padding=True)

In [17]:
# Pull the token ids (one list per sentence) out of the tokenizer output.
encoded_seq = encoded["input_ids"]
print(encoded_seq)

[[101, 8667, 106, 102], [101, 13297, 119, 102], [101, 8835, 106, 102]]


In [18]:
# Convert the id lists to a tensor, and also pass the attention mask the tokenizer
# produced: the batch was tokenized with padding=True, and without the mask any
# padded positions would be attended to like real tokens.
model_inputs = torch.tensor(encoded_seq)
attention_mask = torch.tensor(encoded['attention_mask'])

# Inference only, so skip gradient tracking.
with torch.no_grad():
    output = model(model_inputs, attention_mask=attention_mask)
print(output.keys())

odict_keys(['last_hidden_state', 'pooler_output'])


##### Tokenizers

In [27]:
from transformers import AutoTokenizer

# Example sentence for the tokenizer walkthrough, and the tokenizer of the checkpoint.
sentence = "Using a Transformer network is simple"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [29]:
# Calling the tokenizer object directly runs its full pipeline on the sentence.
encoded = tokenizer(text=sentence)
print(encoded)

{'input_ids': [101, 7993, 170, 13809, 23763, 2443, 1110, 3014, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}


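`tokenizer(sentence)` calls the tokenizer's `__call__` method, which is very powerful and performs several operations under the hood.
These include:
1. tokenization (splitting the text into tokens)
2. conversion of the tokens to integer IDs (including the special tokens — IDs 101 and 102 in the output above)
3. building the attention mask (and token type IDs) that the model uses alongside the input IDs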

In [30]:
# First step on its own: split the sentence into (sub-word) tokens.
sentence = "Using a Transformer network is simple"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

tokens = tokenizer.tokenize(sentence)
print(tokens)

['Using', 'a', 'Trans', '##former', 'network', 'is', 'simple']


In [31]:
# Second step: map every token to its index in the vocabulary.
ids = tokenizer.convert_tokens_to_ids(tokens=tokens)
print(ids)

[7993, 170, 13809, 23763, 2443, 1110, 3014]


Decoding goes the other way, i.e., from vocabulary indices back to a string.

In [32]:
# Turn the vocabulary indices back into text.
decoded_string = tokenizer.decode(
    [7993, 170, 13809, 23763, 2443, 1110, 3014]
)
print(decoded_string)

Using a Transformer network is simple
