In [None]:
!pip install transformers

In [1]:
from transformers import AutoTokenizer

In [2]:
# Name of the pretrained checkpoint; the same name is reused later to load the model.
checkpoint = "bert-base-uncased"

# Load the tokenizer that belongs to this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [3]:
# Show the tokenizer's repr: checkpoint name, vocabulary size, maximum sequence
# length, whether it is the fast (performance-optimized) implementation, and
# which side padding/truncation is applied on.
tokenizer

PreTrainedTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [4]:
# Calling the tokenizer directly returns input ids, token type ids and an attention mask.
# The attention mask tells which tokens should enter into the attention computation.
tokenizer('hello world')

{'input_ids': [101, 7592, 2088, 102], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]}

In [5]:
# Split the text into token strings (no ids and, per the recorded output, no special tokens).
tokens = tokenizer.tokenize('hello world')
tokens

['hello', 'world']

In [6]:
# Map each token string to its integer id.
ids = tokenizer.convert_tokens_to_ids(tokens)
ids

[7592, 2088]

In [7]:
# Inverse mapping: turn the ids back into a list of token strings.
tokenizer.convert_ids_to_tokens(ids)

['hello', 'world']

In [8]:
# Convert the ids to tokens and join them into a single string.
tokenizer.decode(ids)

'hello world'

In [9]:
# encode() returns the list of token ids including the start/stop special tokens.
# Note: this rebinds `ids`, replacing the special-token-free list computed earlier.
ids = tokenizer.encode("hello world")
ids

[101, 7592, 2088, 102]

In [10]:
# ids => tokens; the start/stop ids map to BERT's [CLS] and [SEP] special tokens.
tokenizer.convert_ids_to_tokens(ids)

['[CLS]', 'hello', 'world', '[SEP]']

In [11]:
# Decoding the full id list shows the true input to the BERT model, special tokens included.
tokenizer.decode(ids)

'[CLS] hello world [SEP]'

In [12]:
# Keep the tokenizer output for "hello world"; it is passed to the model further down.
model_inputs = tokenizer("hello world")
model_inputs

{'input_ids': [101, 7592, 2088, 102], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]}

In [13]:
# Multiple-sentence tokenization: passing a list yields one id list per sentence.
# NOTE(review): "linke" is presumably a typo for "like"; it is left unchanged here
# because the recorded output of this cell reflects it.
data = ["I linke cats.", "Do you like cats too?"]
tokenizer(data)

{'input_ids': [[101, 1045, 4957, 2063, 8870, 1012, 102], [101, 2079, 2017, 2066, 8870, 2205, 1029, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1]]}

In [14]:
from transformers import AutoModelForSequenceClassification

In [15]:
# Load the checkpoint as a sequence-classification model.
# The model head has random weights, so it has to be trained before use.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint) 

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [16]:
# Deliberate failure: `model_inputs` still holds plain Python lists at this point,
# while the model takes PyTorch tensors (return_tensors='pt').
# The error is caught and printed so the demonstration stays visible without
# aborting a "Restart Kernel -> Run All".
try:
    outputs = model(**model_inputs)
except AttributeError as err:
    print(f"{type(err).__name__}: {err}")

AttributeError: 'list' object has no attribute 'size'

In [None]:
# Re-tokenize with return_tensors='pt' so the inputs are formatted as PyTorch tensors.
model_inputs = tokenizer("hello world", return_tensors='pt')
model_inputs

In [None]:
# Forward pass. By default the framework assumes we want a binary classifier.
# The logits are meaningless at this point because the head has random weights.
outputs = model(**model_inputs) 
outputs

In [None]:
# Reload the checkpoint, this time requesting a classification head with three labels.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=3,
)

In [None]:
# Forward pass through the three-label model: the logits are now 3-valued.
outputs = model(**model_inputs) 
outputs

In [None]:
# Access the logits through attribute lookup.
outputs.logits

In [None]:
# Access the logits through dictionary-style key lookup.
outputs['logits']

In [None]:
# Positional access to the first output field.
# NOTE(review): presumably the same logits tensor as the two forms above - confirm.
outputs[0]

In [None]:
# Convert the logits into a NumPy array so metrics can be computed:
# remove the gradient, move to CPU, then convert to a NumPy array.
outputs.logits.detach().cpu().numpy()

In [None]:
data = [
    "I like cats.",
    "Do you like cats too?"
]

# Deliberate failure: asking for PyTorch tensors on a batch without padding
# returns an error. It is caught and printed so the demonstration stays visible
# without aborting a "Restart Kernel -> Run All".
# NOTE(review): transformers is expected to raise ValueError here - confirm for
# the installed version; any other exception type still propagates.
try:
    model_inputs = tokenizer(data, return_tensors='pt')
except ValueError as err:
    print(f"{type(err).__name__}: {err}")
else:
    # Only reached if the call unexpectedly succeeds.
    print(model_inputs)

In [None]:
# Tokenize the batch again, now with padding and truncation switched on,
# and request PyTorch tensors.
model_inputs = tokenizer(
    data,
    padding=True,
    truncation=True,
    return_tensors='pt',
)
model_inputs

In [None]:
# Token ids of the batch.
model_inputs['input_ids']

In [None]:
# Attention mask of the batch: which tokens should enter into the attention computation.
model_inputs['attention_mask']

In [None]:
# Run the model on the whole batch and show its output.
outputs = model(**model_inputs) 
outputs