In [1]:
!pip install transformers -q

[K     |████████████████████████████████| 4.9 MB 7.3 MB/s 
[K     |████████████████████████████████| 163 kB 62.7 MB/s 
[K     |████████████████████████████████| 6.6 MB 37.5 MB/s 
[?25h

In [3]:
from transformers import DistilBertTokenizer, DistilBertModel
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertModel.from_pretrained("distilbert-base-uncased")
text = "some text for model"
encoded_input = tokenizer(text, return_tensors="pt")
output = model(**encoded_input) # This is a keyword argument for model
output

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BaseModelOutput(last_hidden_state=tensor([[[-0.3349, -0.1658, -0.0110,  ..., -0.2789,  0.0950,  0.4630],
         [-0.0397, -0.0176, -0.3552,  ..., -0.2689,  0.1300,  0.5941],
         [-0.0757,  0.3424,  0.0660,  ..., -0.5298, -0.2513,  0.1726],
         [ 0.0395, -0.0122,  0.2089,  ..., -0.3721,  0.1763,  0.1254],
         [-0.2027, -0.2934, -0.1556,  ..., -0.5440, -0.0524, -0.0880],
         [ 0.9069,  0.1240, -0.5695,  ...,  0.0775, -0.7602, -0.2034]]],
       grad_fn=<NativeLayerNormBackward0>), hidden_states=None, attentions=None)

In [4]:
model

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0): TransformerBlock(
        (attention): MultiHeadSelfAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): Linear(i

In [5]:
output.last_hidden_state.shape # The output contains six 768 dimensional vectors. The six comes for the vector for each token (include SOS, EOS)

torch.Size([1, 6, 768])

In [6]:
encoded_input #Here you can see the six token tensors.

{'input_ids': tensor([[ 101, 2070, 3793, 2005, 2944,  102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}

In [7]:
#Auto models behave the same way, but you do not need to remember the name of each library
from transformers import AutoTokenizer, AutoModel
model = AutoModel.from_pretrained('distilbert-base-uncased')
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
text = "some text for model"
encoded_input = tokenizer(text, return_tensors="pt")
output = model(**encoded_input)
output

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

BaseModelOutput(last_hidden_state=tensor([[[-0.3349, -0.1658, -0.0110,  ..., -0.2789,  0.0950,  0.4630],
         [-0.0397, -0.0176, -0.3552,  ..., -0.2689,  0.1300,  0.5941],
         [-0.0757,  0.3424,  0.0660,  ..., -0.5298, -0.2513,  0.1726],
         [ 0.0395, -0.0122,  0.2089,  ..., -0.3721,  0.1763,  0.1254],
         [-0.2027, -0.2934, -0.1556,  ..., -0.5440, -0.0524, -0.0880],
         [ 0.9069,  0.1240, -0.5695,  ...,  0.0775, -0.7602, -0.2034]]],
       grad_fn=<NativeLayerNormBackward0>), hidden_states=None, attentions=None)

In [8]:
#There are also classes defined for specific tasks
from transformers import AutoTokenizer, AutoModelForSequenceClassification
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")
model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")

Downloading:   0%|          | 0.00/747 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [9]:
model 
#One can see the last layer has a ClassificationHead. This is the New Head which comes from the Transfer Learning. It replaces the last
#layers of the original models to do an specific task.

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

In [10]:
text = "I enjoyed movie"
encoded_input = tokenizer(text, return_tensors="pt")
output = model(**encoded_input)
output
#We observe 3 logits, corresponding to the 3 neurons in the final layer (see out_features in out_proj in printed model). 
#Each logit is the prediction for each of the tokens. 
#These logits are unnormalized

SequenceClassifierOutput(loss=None, logits=tensor([[-2.3958, -0.2310,  3.1936]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [14]:
# Now we want to obtain the logits. We move to post-processing
import numpy as np
logits = output.logits.detach().numpy() 
print (logits)
#pytorch uses a computation graph for gradient calculation. When you call detach, you remove the graph
#when you use .numpy, we are no longer using tensors because we want to do calculations
y_pred = np.argmax(logits) #Highest value of the 3 output neurons
y_pred # This is the index of the max logit
#One can visit https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment
#And see that depending on the index, the output is negative (0), neutral (1) or positive (2)

[[-2.3957672  -0.23097067  3.193643  ]]


2

In [12]:
# Normalize score. We can use softmax to normalize the output logits. Normally dim=-1 is the innermost dimension.
print(output.logits.softmax(dim=-1).tolist())

[[0.0036063247825950384, 0.03142129257321358, 0.9649723768234253]]


In [13]:
#Prediction on GPU. We need to pass both model and input to GPU
model = model.to('cuda') #pass our model to gpu
text = "I enjoyed movie"
encoded_input = tokenizer(text, return_tensors="pt").to('cuda')
output = model(**encoded_input)
logits = output.logits.detach().cpu().numpy()
y_pred = np.argmax(logits)
y_pred

RuntimeError: ignored

In [15]:
#We can do this also with multiple examples.
model = model
text = ["I enjoyed movie", "movie was waste of time"]
encoded_input = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
#Padding and truncation are important, because when you have multiple examples you have different lengths!
#To make sure they have the same length, we use padding: sentence shorter compared to maximum length, add some padding
# If something longer than maximum length, we truncate it
output = model(**encoded_input)
logits = output.logits.detach().cpu().numpy()
print(logits)
y_pred = np.argmax(logits, axis=-1) #We need to specify predictions along a specific axis. Apply argmax along row logits
y_pred

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[[-2.3957672  -0.23097067  3.1936426 ]
 [ 2.9759662  -0.35204977 -2.6223187 ]]


array([2, 0])

In [16]:
#On dimensions and axis:
#https://www.sharpsightlabs.com/blog/numpy-argmax/
#"Here, we’re applying np.argmax along axis-1. Remember: for 2D Numpy arrays, axis-1 points horizontally across the columns."