In [1]:
%pip install torch transformers

Note: you may need to restart the kernel to use updated packages.


In [2]:
import torch
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = [
    "I've been waiting for a Hugging Face course my whole life.",
    "This course is amazing!",
]
batch = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")

batch["labels"] = torch.tensor([1, 1])

optimizer = AdamW(model.parameters())
loss = model(**batch).loss
loss.backward()
optimizer.step()

  from .autonotebook import tqdm as notebook_tqdm
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [58]:
from datasets import load_dataset

raw_datasets = load_dataset('glue', 'mrpc')
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [4]:
raw_train_dataset = raw_datasets["train"]
raw_train_dataset[0]

{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
 'label': 1,
 'idx': 0}

In [5]:
raw_train_dataset.features

{'sentence1': Value('string'),
 'sentence2': Value('string'),
 'label': ClassLabel(names=['not_equivalent', 'equivalent']),
 'idx': Value('int32')}

In [6]:
print('sentence1:', raw_train_dataset[15]['sentence1'])
print('sentence2:', raw_train_dataset[15]['sentence2'])

print('label:', raw_train_dataset.features['label'].names[raw_train_dataset[15]['label']])

sentence1: Rudder was most recently senior vice president for the Developer & Platform Evangelism Business .
sentence2: Senior Vice President Eric Rudder , formerly head of the Developer and Platform Evangelism unit , will lead the new entity .
label: not_equivalent


In [7]:
print('sentence1:', raw_train_dataset[15]['sentence1'])
print('sentence2:', raw_train_dataset[15]['sentence2'])

print('label:', raw_train_dataset.features['label'].names[raw_train_dataset[87]['label']])

sentence1: Rudder was most recently senior vice president for the Developer & Platform Evangelism Business .
sentence2: Senior Vice President Eric Rudder , formerly head of the Developer and Platform Evangelism unit , will lead the new entity .
label: equivalent


In [8]:
from transformers import AutoTokenizer

checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenized_sentences_1 = tokenizer(raw_datasets['train'][0]['sentence1'])
tokenized_sentences_2 = tokenizer(raw_datasets['train'][0]['sentence2'])

In [9]:
inputs = tokenizer('This is the first sentence.', 'This is the second one.')
inputs

{'input_ids': [101, 2023, 2003, 1996, 2034, 6251, 1012, 102, 2023, 2003, 1996, 2117, 2028, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [10]:
inputs_15 = tokenizer(raw_train_dataset[15]['sentence1'])
inputs_15

{'input_ids': [101, 24049, 2001, 2087, 3728, 3026, 3580, 2343, 2005, 1996, 9722, 1004, 4132, 9340, 12439, 2964, 2449, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [11]:
tokenizer.decode(inputs_15['input_ids'])

'[CLS] rudder was most recently senior vice president for the developer & platform evangelism business. [SEP]'

In [12]:
inputs_15 = tokenizer(raw_train_dataset[15]['sentence2'])
inputs_15

{'input_ids': [101, 3026, 3580, 2343, 4388, 24049, 1010, 3839, 2132, 1997, 1996, 9722, 1998, 4132, 9340, 12439, 2964, 3131, 1010, 2097, 2599, 1996, 2047, 9178, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [13]:
tokenizer.decode(inputs_15['input_ids'])

'[CLS] senior vice president eric rudder, formerly head of the developer and platform evangelism unit, will lead the new entity. [SEP]'

In [14]:
inputs_15 = tokenizer(raw_train_dataset[15]['sentence1'], raw_train_dataset[15]['sentence2'])
inputs_15

{'input_ids': [101, 24049, 2001, 2087, 3728, 3026, 3580, 2343, 2005, 1996, 9722, 1004, 4132, 9340, 12439, 2964, 2449, 1012, 102, 3026, 3580, 2343, 4388, 24049, 1010, 3839, 2132, 1997, 1996, 9722, 1998, 4132, 9340, 12439, 2964, 3131, 1010, 2097, 2599, 1996, 2047, 9178, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [15]:
tokenizer.decode(inputs_15['input_ids'])

'[CLS] rudder was most recently senior vice president for the developer & platform evangelism business. [SEP] senior vice president eric rudder, formerly head of the developer and platform evangelism unit, will lead the new entity. [SEP]'

In [16]:
tokenizer.convert_ids_to_tokens(inputs['input_ids'])

['[CLS]',
 'this',
 'is',
 'the',
 'first',
 'sentence',
 '.',
 '[SEP]',
 'this',
 'is',
 'the',
 'second',
 'one',
 '.',
 '[SEP]']

In [17]:
print('token_type_id: token')
for token, token_type_id in zip(tokenizer.convert_ids_to_tokens(inputs['input_ids']), inputs['token_type_ids']):
    print(f'{token_type_id}: {token}')

token_type_id: token
0: [CLS]
0: this
0: is
0: the
0: first
0: sentence
0: .
0: [SEP]
1: this
1: is
1: the
1: second
1: one
1: .
1: [SEP]


In [59]:
tokenized_datasets = tokenizer(
    raw_datasets['train'][0]['sentence1'],
    raw_datasets['train'][0]['sentence2'],
    padding=True,
    truncation=True,
)
tokenized_datasets

{'input_ids': [101, 2572, 3217, 5831, 5496, 2010, 2567, 1010, 3183, 2002, 2170, 1000, 1996, 7409, 1000, 1010, 1997, 9969, 4487, 23809, 3436, 2010, 3350, 1012, 102, 7727, 2000, 2032, 2004, 2069, 1000, 1996, 7409, 1000, 1010, 2572, 3217, 5831, 5496, 2010, 2567, 1997, 9969, 4487, 23809, 3436, 2010, 3350, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [21]:
def tokenize_function(example):
    return tokenizer(example['sentence1'], example['sentence2'], truncation=True)

In [24]:
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets

Map: 100%|██████████| 3668/3668 [00:00<00:00, 28278.63 examples/s]


DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

In [27]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [26]:
samples = tokenized_datasets['train'][:8]
samples = { k: v for k, v in samples.items() if k not in ['idx', 'sentence1', 'sentence2']}
[len(x) for x in samples['input_ids']]

[50, 59, 47, 67, 59, 50, 62, 32]

In [28]:
batch = data_collator(samples)
{ k: v.shape for k, v in batch.items() }

{'input_ids': torch.Size([8, 67]),
 'token_type_ids': torch.Size([8, 67]),
 'attention_mask': torch.Size([8, 67]),
 'labels': torch.Size([8])}

GLUE SST-2 dataset

In [60]:
raw_datasets = load_dataset('glue', 'sst2')
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [53]:
raw_datasets['train'][0:3]

{'sentence': ['hide new secretions from the parental units ',
  'contains no wit , only labored gags ',
  'that loves its characters and communicates something rather beautiful about human nature '],
 'label': [0, 0, 1],
 'idx': [0, 1, 2]}

In [55]:
def single_tokenize_function(example):
    return tokenizer(example['sentence'], truncation=True)

In [61]:
tokenized_datasets = raw_datasets.map(single_tokenize_function, batched=True)
tokenized_datasets

Map: 100%|██████████| 67349/67349 [00:00<00:00, 71528.69 examples/s]


DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1821
    })
})

In [46]:
def single_token_length_printer(samples):
    print([len(x) for x in samples['input_ids']])

In [63]:
samples = tokenized_datasets['train'][:3]
samples = { k: v for k, v in samples.items() if k not in ['idx', 'sentence']}
samples

{'label': [0, 0, 1],
 'input_ids': [[101, 5342, 2047, 3595, 8496, 2013, 1996, 18643, 3197, 102],
  [101, 3397, 2053, 15966, 1010, 2069, 4450, 2098, 18201, 2015, 102],
  [101,
   2008,
   7459,
   2049,
   3494,
   1998,
   10639,
   2015,
   2242,
   2738,
   3376,
   2055,
   2529,
   3267,
   102]],
 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [64]:
single_token_length_printer(samples)

[10, 11, 15]


In [42]:
def batch_printer(batch):
    print({ k: v.shape for k, v in batch.items() })

In [66]:
batch = data_collator(samples)
batch_printer(batch)

{'input_ids': torch.Size([3, 15]), 'token_type_ids': torch.Size([3, 15]), 'attention_mask': torch.Size([3, 15]), 'labels': torch.Size([3])}
