In [1]:
# Sample of BertModel

#!pip install transformers

### Install pytorch
#https://pytorch.org/get-started/locally/
#https://pytorch.org/hub/huggingface_pytorch-transformers/

In [None]:
### Attention Is All You Need
#https://arxiv.org/abs/1706.03762

### BERT: Pre-training of Deep Bidirectional Transformers 
### for Language Understanding
#BERT(Bidirectional Encoder Representations from Transformers)
#https://arxiv.org/abs/1810.04805

In [2]:
from transformers import BertTokenizer, BertModel
import torch
import torch.nn as nn

BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "bert-base-uncased",
    "bert-large-uncased",
    "bert-base-cased",
    "bert-large-cased",
    "bert-base-multilingual-uncased",
    "bert-base-multilingual-cased",
    "bert-base-chinese",
    # See all BERT models at https://huggingface.co/models?filter=bert
]

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Corpus = [
    "I am not smart but not a fool.",
    "She is very pretty but not my girl friend."
]

inputs = tokenizer(Corpus, return_tensors="pt",
                    max_length=15, padding=True, truncation=True)

input_ids = inputs['input_ids']
token_type_ids = inputs['token_type_ids']
attention_mask = inputs['attention_mask']

In [3]:
input_ids

tensor([[ 101, 1045, 2572, 2025, 6047, 2021, 2025, 1037, 7966, 1012,  102,    0],
        [ 101, 2016, 2003, 2200, 3492, 2021, 2025, 2026, 2611, 2767, 1012,  102]])

In [4]:
token_type_ids

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [5]:
attention_mask

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])

In [6]:
vocab = tokenizer.vocab
print("Dictionary size：", len(vocab))

Dictionary size： 30522


In [7]:
vocab['i'], vocab['am'], vocab['.'], vocab['[PAD]']

(1045, 2572, 1012, 0)

In [8]:
index_word = {index:word for word, index in vocab.items()}

index_word[101], index_word[1045], index_word[2572], "...", \
index_word[1037], index_word[7966], index_word[1012], index_word[102], index_word[0]

('[CLS]', 'i', 'am', '...', 'a', 'fool', '.', '[SEP]', '[PAD]')

In [9]:
index_word[101], index_word[2016], index_word[2003], "...", \
index_word[2026], index_word[2611], index_word[2767], index_word[1012], index_word[102]

('[CLS]', 'she', 'is', '...', 'my', 'girl', 'friend', '.', '[SEP]')

In [10]:
# return_dict = True

model = BertModel.from_pretrained('bert-base-uncased', return_dict=True)

outputs = model(input_ids, token_type_ids, attention_mask)

outputs    # type(outputs) = transformers.modeling_outputs.BaseModelOutputWithPooling

BaseModelOutputWithPooling(last_hidden_state=tensor([[[-0.1046,  0.1310, -0.0827,  ..., -0.0219,  0.2136,  0.2302],
         [-0.5176,  0.1067, -0.2878,  ..., -0.6373,  0.3295,  0.0065],
         [-0.4615,  0.0065, -0.4400,  ..., -0.8397,  0.2282,  0.0466],
         ...,
         [ 0.6535,  0.1053, -0.4081,  ...,  0.0280, -0.3853, -0.3480],
         [ 0.5521,  0.0689, -0.2095,  ...,  0.0432, -0.4177, -0.3825],
         [ 0.1398,  0.1865,  0.3135,  ...,  0.2208,  0.1608, -0.0346]],

        [[-0.2987, -0.3447, -0.4963,  ..., -0.1937,  0.3750,  0.5720],
         [-0.2361, -0.9089, -0.0500,  ..., -0.4122,  0.2629,  0.2248],
         [-0.0292, -0.6746, -0.2885,  ..., -0.7879,  0.0873,  0.5768],
         ...,
         [-0.5918, -1.5879,  0.3443,  ...,  0.6541, -0.0065,  0.4885],
         [ 0.5235, -0.1575, -0.3188,  ..., -0.0518, -0.2579, -0.4995],
         [ 0.4285, -0.1204, -0.0739,  ..., -0.0077, -0.3160, -0.5173]]],
       grad_fn=<NativeLayerNormBackward>), pooler_output=tensor([[-0.60

In [11]:
sequence_output = outputs.last_hidden_state
pooler_output = outputs.pooler_output
hidden_states = outputs.hidden_states
attentions = outputs.attentions

pooler_output

tensor([[-0.6052, -0.1877,  0.5065,  ...,  0.5284, -0.4884,  0.7288],
        [-0.4933, -0.3037, -0.4900,  ..., -0.1944, -0.5672,  0.8081]],
       grad_fn=<TanhBackward>)

In [12]:
sequence_output.size()

torch.Size([2, 12, 768])

In [13]:
pooler_output.size()

torch.Size([2, 768])

In [14]:
'''
class BertPooler(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states):
        # We "pool" the model by simply taking the hidden state corresponding
        # to the first token.
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output
'''

model.pooler

BertPooler(
  (dense): Linear(in_features=768, out_features=768, bias=True)
  (activation): Tanh()
)

In [15]:
model.pooler(sequence_output)

tensor([[-0.6052, -0.1877,  0.5065,  ...,  0.5284, -0.4884,  0.7288],
        [-0.4933, -0.3037, -0.4900,  ..., -0.1944, -0.5672,  0.8081]],
       grad_fn=<TanhBackward>)

In [16]:
# return_dict = False

model = BertModel.from_pretrained('bert-base-uncased', return_dict=False)

outputs = model(input_ids, token_type_ids, attention_mask)

sequence_output, pooled_output = outputs   # type(outputs) = tuple

pooled_output

tensor([[-0.6052, -0.1877,  0.5065,  ...,  0.5284, -0.4884,  0.7288],
        [-0.4933, -0.3037, -0.4900,  ..., -0.1944, -0.5672,  0.8081]],
       grad_fn=<TanhBackward>)

In [17]:
#Bert For Sequence Classification
#https://huggingface.co/transformers/model_doc/bert.html#bertforsequenceclassification

from transformers import BertTokenizer, BertForSequenceClassification
import torch

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', return_dict=True, num_labels=2)

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1

outputs = model(**inputs, labels=labels)

loss = outputs.loss
logits = outputs.logits

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [18]:
model.train()

optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
#criterion = nn.CrossEntropyLoss()

EPOCHS = 6  
for epoch in range(EPOCHS):
    optimizer.zero_grad()
    
    inputs = tokenizer(["Hello, my dog is cute",
                        "Sorry, your key is lost"], return_tensors="pt")
    labels = torch.tensor([1, 
                           0]).unsqueeze(0)  # Batch size 1
    
    # forward
    outputs = model(**inputs, labels=labels)

    
    logits = outputs.logits
    
    loss = outputs.loss # the model will return if labels were given.
    #loss = criterion(logits, labels)

    # backward
    loss.backward()
    optimizer.step()

    print(logits, loss)

tensor([[-0.6229,  0.0841],
        [-0.4527,  0.5469]], grad_fn=<AddmmBackward>) tensor(0.8569, grad_fn=<NllLossBackward>)
tensor([[-0.5297,  0.3236],
        [-0.5164,  0.2491]], grad_fn=<AddmmBackward>) tensor(0.7512, grad_fn=<NllLossBackward>)
tensor([[-0.0234,  0.1586],
        [-0.4600,  0.2539]], grad_fn=<AddmmBackward>) tensor(0.8594, grad_fn=<NllLossBackward>)
tensor([[-0.3886,  0.3053],
        [-0.2438,  0.1043]], grad_fn=<AddmmBackward>) tensor(0.6438, grad_fn=<NllLossBackward>)
tensor([[-0.5933,  0.6831],
        [ 0.2813, -0.3358]], grad_fn=<AddmmBackward>) tensor(0.3388, grad_fn=<NllLossBackward>)
tensor([[-0.2984,  0.4017],
        [ 0.1415, -0.1080]], grad_fn=<AddmmBackward>) tensor(0.4897, grad_fn=<NllLossBackward>)


In [19]:
# Define a custom pytorch model 
class Bert_CNN_Tabular_Classification(nn.Module):
    
    def __init__(self):
        super(Bert_CNN_Tabular_Classification, self).__init__()
        
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.3)
        bert_out_features = 768
        for param in self.bert.parameters():
            param.requires_grad = False
        
        cnn_out_features = 500
        self.cnn = models.resnet50(pretrained=True)
        for param in self.cnn.parameters():
            param.requires_grad = False
        self.cnn.fc = nn.Linear(self.cnn.fc.in_features, cnn_out_features)        
        
        tabular_features = 381
        self.classifier = nn.Linear(bert_out_features + cnn_out_features + tabular_features, 5)

    def forward(
        self,
        input_tabular=None,
        input_image=None,        
        input_ids=None,
        attention_mask=None,
        token_type_ids=None):
        
        sequence_output, pooled_output = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )

        text_output = self.dropout(pooled_output)
        
        image_output = self.cnn(input_image)
    
        total = torch.cat([text_output, image_output, input_tabular.float()], axis = 1)
        
        logits = self.classifier(total)

        return logits

In [20]:
# AutoModelForQuestionAnswering

from transformers import AutoTokenizer, AutoModelForQuestionAnswering

tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
model = AutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")

text = r"""
🤗 Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides general-purpose
architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet…) for Natural Language Understanding (NLU) and Natural
Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between
TensorFlow 2.0 and PyTorch.
"""

questions = [
    "How many pretrained models are available in Transformers?",
    "What does Transformers provide?",
    "Transformers provides interoperability between which frameworks?",
]

for question in questions:
    inputs = tokenizer.encode_plus(question, text, add_special_tokens=True, return_tensors="pt")
    input_ids = inputs["input_ids"].tolist()[0]

    text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
    answer_start_scores, answer_end_scores = model(**inputs)

    answer_start = torch.argmax(
        answer_start_scores
    )  # Get the most likely beginning of answer with the argmax of the score
    answer_end = torch.argmax(answer_end_scores) + 1  # Get the most likely end of answer with the argmax of the score

    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))

    print(f"Question: {question}")
    print(f"Answer: {answer}\n")

Question: How many pretrained models are available in Transformers?
Answer: over 32 +

Question: What does Transformers provide?
Answer: general - purpose architectures

Question: Transformers provides interoperability between which frameworks?
Answer: tensorflow 2 . 0 and pytorch



In [21]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-distilled-squad")
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased-distilled-squad")

text = r"""
The Amazon rainforest (Portuguese: Floresta Amazônica or Amazônia; Spanish: Selva Amazónica, Amazonía or usually Amazonia; 
French: Forêt amazonienne; Dutch: Amazoneregenwoud), also known in English as Amazonia or the Amazon Jungle, is a moist 
broadleaf forest that covers most of the Amazon basin of South America. This basin encompasses 7,000,000 square kilometres
(2,700,000 sq mi), of which 5,500,000 square kilometres (2,100,000 sq mi) are covered by the rainforest. This region includes
territory belonging to nine nations. The majority of the forest is contained within Brazil, with 60% of the rainforest, 
followed by Peru with 13%, Colombia with 10%, and with minor amounts in Venezuela, Ecuador, Bolivia, Guyana, Suriname and 
French Guiana. States or departments in four nations contain "Amazonas" in their names. The Amazon represents over half of 
the planet's remaining rainforests, and comprises the largest and most biodiverse tract of tropical rainforest in the world,
with an estimated 390 billion individual trees divided into 16,000 species
"""

questions = [
    "Which name is also used to describe the Amazon rainforest in English?"
]

for question in questions:
    inputs = tokenizer.encode_plus(question, text, add_special_tokens=True, return_tensors="pt")
    input_ids = inputs["input_ids"].tolist()[0]

    text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
    answer_start_scores, answer_end_scores = model(**inputs)

    answer_start = torch.argmax(
        answer_start_scores
    )  # Get the most likely beginning of answer with the argmax of the score
    answer_end = torch.argmax(answer_end_scores) + 1  # Get the most likely end of answer with the argmax of the score

    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))

    print(f"Question: {question}")
    print(f"Answer: {answer}\n")

Question: Which name is also used to describe the Amazon rainforest in English?
Answer: amazonia



In [22]:
from transformers import pipeline

# Allocate a pipeline for sentiment-analysis
classifier = pipeline('sentiment-analysis')

classifier('We are very happy to include pipeline into the transformers repository.')

[{'label': 'POSITIVE', 'score': 0.9978193640708923}]

In [23]:
from transformers import pipeline

# Allocate a pipeline for question-answering
question_answerer = pipeline('question-answering')

question_answerer({
    'question': 'What is the name of the repository ?',
    'context': 'Pipeline have been included in the huggingface/transformers repository'
})

{'score': 0.5135967135429382,
 'start': 35,
 'end': 59,
 'answer': 'huggingface/transformers'}

In [24]:
#https://pytorch.org/hub/pytorch_fairseq_translation/

from transformers import pipeline

translator = pipeline("translation_en_to_de")

print(translator("Hugging Face is a technology company based in New York and Paris", max_length=40))

[{'translation_text': 'Hugging Face ist ein Technologieunternehmen mit Sitz in New York und Paris.'}]
