In [2]:
#Masked Language modelling : perform word prediction



from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

text = '2023 will be a great year for all of us'

# Encode one sentence into model-ready tensors.
# NOTE: padding="max_length" with no explicit `max_length` pads up to the
# maximum input length the tokenizer knows for this model, so the resulting
# tensors are much longer than the sentence itself.
encoding = tokenizer.encode_plus(
    text,
    add_special_tokens = True,
    truncation = True,
    padding = "max_length",
    return_attention_mask = True,
    return_tensors = "pt",
)
# `encoding` behaves like a dictionary with the keys input_ids,
# attention_mask and token_type_ids; uncomment to inspect it:
# print(encoding)

# return_tensors="pt" yields batched tensors; [0] selects the only row.
# The ids are stored as `input_ids` (not `input`) so the Python builtin
# `input()` is not shadowed for the rest of the kernel session.
input_ids = encoding["input_ids"][0]
attention_mask = encoding["attention_mask"][0]
#tokenizer.mask_token

In [3]:
from transformers import BertTokenizer, BertForMaskedLM
from torch.nn import functional as F
import torch


tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# return_dict=True makes the model return a ModelOutput object (with named
# fields such as `.logits`) instead of a plain tuple. It is the default in
# transformers v4 and later; it is passed explicitly so that `output.logits`
# below also works on older versions, where omitting it would raise an
# AttributeError at runtime.
model = BertForMaskedLM.from_pretrained('bert-base-uncased', return_dict = True)

# Masked language modelling: put the tokenizer's mask token at the position
# where a word should be predicted. The model then scores every token in its
# vocabulary as a replacement for that position.
text = "The Opera House in Australia is in , " + tokenizer.mask_token + " city"

# Named `inputs` rather than `input` so the Python builtin is not shadowed.
inputs = tokenizer.encode_plus(text, return_tensors = "pt")

# Position(s) of the mask token within the encoded sequence.
# torch.where(condition) returns a tuple with one index tensor per dimension;
# [0] extracts the tensor for the (only) sequence dimension so that a plain
# tensor, not a tuple, is used for indexing below.
mask_index = torch.where(inputs["input_ids"][0] == tokenizer.mask_token_id)[0]

# Inference only: skip building the autograd graph.
with torch.no_grad():
    output = model(**inputs)

# Logits are the raw prediction scores of the language-modelling head, i.e.
# one score per vocabulary token and input position, before softmax.
logits = output.logits
# print(logits.shape) -> (1, 12, 30522)

# Softmax over the vocabulary axis turns the scores into a probability
# distribution per position; more probable tokens are better candidates
# for replacing the mask token.
softmax = F.softmax(logits, dim = -1)

# Keep only the distribution(s) at the masked position(s):
# shape (number_of_masks, vocab_size).
mask_word = softmax[0, mask_index, :]

# torch.topk returns both the k largest values and their indices; the indices
# are the vocabulary ids of the 10 most probable tokens. [0] selects the
# first (here: only) masked position.
top_10 = torch.topk(mask_word, 10, dim = 1).indices[0]

# Substitute each candidate token for the mask token and show the sentence.
for token in top_10:
    word = tokenizer.decode([token])
    new_sentence = text.replace(tokenizer.mask_token, word)
    print(new_sentence)

# Single most probable token id for each masked position, decoded to text.
top_word = torch.argmax(mask_word, dim=1)
print(tokenizer.decode(top_word))


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


The Opera House in Australia is in , sydney city
The Opera House in Australia is in , melbourne city
The Opera House in Australia is in , brisbane city
The Opera House in Australia is in , adelaide city
The Opera House in Australia is in , the city
The Opera House in Australia is in , canberra city
The Opera House in Australia is in , auckland city
The Opera House in Australia is in , hobart city
The Opera House in Australia is in , griffith city
The Opera House in Australia is in , hume city
sydney


Next Sentence Prediction
Next Sentence Prediction is the task of predicting whether one sentence follows another sentence.

BertForNextSentencePrediction
It returns logits (torch.FloatTensor of shape (batch_size, 2)) – Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax).


In [4]:

from transformers import BertTokenizer, BertForNextSentencePrediction
import torch
from torch.nn import functional as F
# The two sentences whose relationship is scored.
prompt = "I came back from Office in the evening"
next_sentence = "I opened my Beer after Office"

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')

# Encoding the pair in one call lets the BERT tokenizer place the [SEP]
# token between the two sentences automatically.
encoding = tokenizer.encode_plus(
    prompt,
    next_sentence,
    return_tensors='pt',
)

# The first element of the model output holds the next-sentence scores
# (before softmax); normalise them into probabilities along dim 1.
outputs = model(**encoding)[0]
softmax = torch.softmax(outputs, dim = 1)
print(softmax)

# The printed tensor has two values per sentence pair:
#   - the first is the probability that the second sentence continues the first
#   - the second is the probability that the second sentence is a random
#     sequence, i.e. not a good continuation of the first.

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForNextSentencePrediction: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tensor([[9.9998e-01, 1.5085e-05]], grad_fn=<SoftmaxBackward0>)


In [5]:
#Question Answering
from transformers import BertTokenizer, BertForQuestionAnswering
import torch

# A tiny context and a question about it.
example_text = "GPT-3 came in 2020"
example_question = "When did GPT-3 come"

tokenizer = BertTokenizer.from_pretrained("deepset/bert-base-cased-squad2")
model = BertForQuestionAnswering.from_pretrained("deepset/bert-base-cased-squad2")

# Handing the tokenizer two sequences (question first, then context) makes it
# build a single paired input out of them.
tokenized_inputs = tokenizer(
    example_question,
    example_text,
    return_tensors="pt",
)
print(tokenized_inputs)


{'input_ids': tensor([[  101,  1332,  1225, 15175,  1942,   118,   124,  1435,   102, 15175,
          1942,   118,   124,  1338,  1107, 12795,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


BERT QA places the question before the context.

Tokenizer returns 3 tensors for us.

“input_ids” are the token ids of the tokenized text.
“token_type_ids”: to understand them, first note that the purpose of some models is to do classification on pairs of sentences, or question answering.
https://huggingface.co/docs/transformers/v4.20.1/en/glossary#token-type-ids

These require two different sequences to be joined in a single “input_ids” entry, which usually is performed with the help of special tokens, such as the classifier ([CLS]) and separator ([SEP]) tokens. For example, the BERT model builds its two sequence input as such:

[CLS] SEQUENCE_A [SEP] SEQUENCE_B [SEP]
We used our tokenizer to automatically generate such a sentence by passing the two sequences to tokenizer as two arguments

BERT has token type IDs (also called segment IDs). They are represented as a binary mask identifying the two types of sequence in the model.

Here, those two types of sequence are the question and the context. Token type 0 marks the question part and token type 1 marks the context.

The model will tell you at what start and end position of the input_ids the answer to the question will be located.



In [6]:

# Context passage the answer is extracted from.
text = "The Vatican Apostolic Library (), more commonly called the Vatican Library or simply the Vat, is the library of the Holy See, located in Vatican City. Formally established in 1475, although it is much older, it is one of the oldest libraries in the world and contains one of the most significant collections of historical texts. It has 75,000 codices from throughout history, as well as 1.1 million printed books, which include some 8,500 incunabula.   The Vatican Library is a research library for history, law, philosophy, science and theology. The Vatican Library is open to anyone who can document their qualifications and research needs. Photocopies for private study of pages from books published between 1801 and 1990 can be requested in person or by mail.   In March 2014, the Vatican Library began an initial four-year project of digitising its collection of manuscripts, to be made available online.   The Vatican Secret Archives were separated from the library at the beginning of the 17th century; they contain another 150,000 items.   Scholars have traditionally divided the history of the library into five periods, Pre-Lateran, Lateran, Avignon, Pre-Vatican and Vatican.   The Pre-Lateran period, comprising the initial days of the library, dated from the earliest days of the Church. Only a handful of volumes survive from this period, though some are very significant."

question = "When was the Vat formally opened?"

tokenizer = BertTokenizer.from_pretrained("deepset/bert-base-cased-squad2")
model = BertForQuestionAnswering.from_pretrained("deepset/bert-base-cased-squad2")

# Question first, context second.
tokenized_inputs = tokenizer(
    question,
    text,
    return_tensors="pt",
)

# Inference only, so no gradients are tracked.
with torch.no_grad():
    outputs = model(**tokenized_inputs)

# start_logits / end_logits: torch.FloatTensor of shape
# (batch_size, sequence_length) holding the span-start and span-end scores
# (before SoftMax). The highest-scoring position of each is taken as the
# boundary of the predicted answer span.
answer_start_index = outputs.start_logits.argmax()
answer_end_index = outputs.end_logits.argmax()

# The end position is inclusive, hence the + 1 in the slice.
answer_span = slice(answer_start_index, answer_end_index + 1)
predict_answer_tokens = tokenized_inputs.input_ids[0, answer_span]

# Last expression of the cell: the decoded answer text is displayed.
tokenizer.decode(predict_answer_tokens)

'1475'