## 5.2 The Next Sentence Prediction Task

In [1]:
from transformers import BertForNextSentencePrediction, BertTokenizer
import torch

In [2]:
# Load the tokenizer and the NSP-headed BERT model from the same
# 'bert-base-uncased' checkpoint, so both share one vocabulary.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_nsp = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')

In [3]:
# Bare expression as the cell's last line: displays the model's module tree
bert_nsp

BertForNextSentencePrediction(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [4]:
# Two consecutive sentences: `text` is passed to the tokenizer as sentence A
# and `text2` as sentence B in the next cell.
text = "Deliver huge improvements to your machine learning pipelines without spending hours fine-tuning parameters!"
text2 = "This book's practical case-studies reveal feature engineering techniques that upgrade your data wrangling-and your ML results."

In [5]:
# Passing two texts encodes them together as one sentence pair;
# return_tensors='pt' asks for PyTorch tensors. The result holds
# input_ids, token_type_ids and attention_mask.
inputs = tokenizer(text, text2, return_tensors='pt')
inputs

{'input_ids': tensor([[  101,  8116,  4121,  8377,  2000,  2115,  3698,  4083, 13117,  2015,
          2302,  5938,  2847,  2986,  1011, 17372, 11709,   999,   102,  2023,
          2338,  1005,  1055,  6742,  2553,  1011,  2913,  7487,  3444,  3330,
          5461,  2008, 12200,  2115,  2951, 23277,  5654,  2989,  1011,  1998,
          2115, 19875,  3463,  1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [6]:
inputs.input_ids  # vocabulary IDs for sentence A and sentence B, joined into a single row

tensor([[  101,  8116,  4121,  8377,  2000,  2115,  3698,  4083, 13117,  2015,
          2302,  5938,  2847,  2986,  1011, 17372, 11709,   999,   102,  2023,
          2338,  1005,  1055,  6742,  2553,  1011,  2913,  7487,  3444,  3330,
          5461,  2008, 12200,  2115,  2951, 23277,  5654,  2989,  1011,  1998,
          2115, 19875,  3463,  1012,   102]])

In [7]:
inputs.token_type_ids  # segment IDs: 0 marks sentence A positions, 1 marks sentence B positions

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])

In [8]:
inputs.attention_mask  # all ones: every position is attended to (nothing is masked out here)

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])

In [9]:
# Forward pass without labels: the output carries the two NSP logits only,
# and its `loss` field is None.
outputs = bert_nsp(**inputs)
outputs

NextSentencePredictorOutput(loss=None, logits=tensor([[ 6.1279, -5.7186]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [10]:
# Supply a label so the model also returns a loss.
# Label 0 asserts that sentence B really does follow sentence A; the logits
# already favour index 0, so the resulting loss is close to zero.
outputs = bert_nsp(**inputs, labels=torch.tensor([0], dtype=torch.long))
outputs

NextSentencePredictorOutput(loss=tensor(7.1525e-06, grad_fn=<NllLossBackward0>), logits=tensor([[ 6.1279, -5.7186]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [11]:
# Same forward pass, but with label 1, i.e. telling the model that sentence B
# does NOT come after sentence A. This contradicts the logits, so the loss
# is high (about 11.8).
outputs = bert_nsp(**inputs, labels=torch.tensor([1], dtype=torch.long))
outputs

NextSentencePredictorOutput(loss=tensor(11.8466, grad_fn=<NllLossBackward0>), logits=tensor([[ 6.1279, -5.7186]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

## 5.3 Fine-tuning BERT to solve NLP tasks

In [16]:
from transformers import pipeline, BertForQuestionAnswering, BertForTokenClassification, BertForSequenceClassification

In [14]:
# num_labels=2 by default
# Asking for 3 labels sizes the classification head accordingly. The head is
# not part of the pretrained checkpoint, so it is newly initialised (see the
# load-time warning) and must be trained before its predictions mean anything.
bert_seq = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
bert_seq

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [15]:
# Inspect the classification head: its out_features reflects num_labels=3
bert_seq.classifier

Linear(in_features=768, out_features=3, bias=True)

In [17]:
# An already fine-tuned classifier hosted on the HuggingFace model repository;
# the same repo id supplies both the weights and the tokenizer.
finbert = pipeline(
    task='text-classification',
    model="ProsusAI/finbert",
    tokenizer="ProsusAI/finbert",
)

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [18]:
# Upbeat market news: the pipeline returns the top label together with its score
finbert('Stocks rallied and the British pound gained')

[{'label': 'positive', 'score': 0.6949422955513}]

In [19]:
# A lukewarm statement, for contrast with the previous example
finbert('The stock did ok')

[{'label': 'neutral', 'score': 0.806524395942688}]

In [20]:
# The classification head of the model wrapped by the pipeline: three output classes
finbert.model.classifier

Linear(in_features=768, out_features=3, bias=True)

In [22]:
# BERT with a token classification head on top of the base checkpoint.
# As the load-time warning states, the head's weights are newly initialised
# rather than pretrained.
bert_tc = BertForTokenClassification.from_pretrained('bert-base-uncased')
bert_tc

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, el

In [None]:
# Model card: https://huggingface.co/savasy/bert-base-turkish-ner-cased
custom_module = 'savasy/bert-base-turkish-ner-cased'
# The same hub id is used for both the model weights and the tokenizer
ner = pipeline(task='ner', tokenizer=custom_module, model=custom_module)

# English translation: "Hi! I am Sinan. I come from San Francisco"
sequence = "Merhaba! Benim adim Sinan. San Francisco'dan geliyorum"
ner(sequence)

In [23]:
# BERT with a question-answering head on top of the base checkpoint.
# The `qa_outputs` layer is newly initialised (see the load-time warning),
# so this model needs fine-tuning before it can answer questions.
bert_qa = BertForQuestionAnswering.from_pretrained('bert-base-uncased')
bert_qa

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elem

In [24]:
# Inspect the QA head: a single linear layer with 2 outputs
# (presumably one score each for answer-span start and end -- confirm in the HF docs)
bert_qa.qa_outputs

Linear(in_features=768, out_features=2, bias=True)

In [None]:
# Question-answering pipeline; model and tokenizer both come from the same
# hub repository, pinned to revision "v1.0".
model_name = "deepset/roberta-base-squad2"
qa = pipeline(
    task="question-answering",
    model=model_name,
    tokenizer=model_name,
    revision="v1.0",
)

In [None]:
# (question, context) pair: the answer is looked up inside the context text
sequence = "Where is Socrates living these days?", "Socrates lives in California but Morris lives in Boston"
question, context = sequence
# Use the pipeline's documented keyword arguments instead of relying on
# positional unpacking of the tuple.
qa(question=question, context=context)

In [None]:
# Same context as before, but the question asks about "Matt", who is never
# mentioned in it -- a probe of how the pipeline handles an unanswerable question.
sequence = "Where is Matt living these days?", "Socrates lives in California but Morris lives in Boston"
question, context = sequence
# Use the pipeline's documented keyword arguments instead of relying on
# positional unpacking of the tuple.
qa(question=question, context=context)