In [None]:
!pip install transformers

In [None]:
from transformers import AutoTokenizer

In [None]:
# Identifier of the pretrained checkpoint; the same name is reused further down
# when the model is loaded, so tokenizer and model stay matched.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
tokenizer
# The repr lists: checkpoint name, vocabulary size, maximum sequence length,
# whether the performance-optimized ("fast") implementation is used, and which
# side padding/truncation is applied on.

In [None]:
tokenizer('hello world')
# Calling the tokenizer directly returns the full encoding for the text.
# attention_mask => marks which tokens should enter into the attention computation.

In [None]:
tokens = tokenizer.tokenize('hello world')
tokens
# Splits the text into a list of token strings (not ids yet).

In [None]:
ids = tokenizer.convert_tokens_to_ids(tokens)
ids
# Maps each token string to its integer id.

In [None]:
tokenizer.convert_ids_to_tokens(ids) 
# Inverse mapping: integer ids back to the list of token strings.

In [None]:
tokenizer.decode(ids) 
# Converts the ids back to tokens and joins them into a single string.

In [None]:
ids = tokenizer.encode("hello world")
ids
# encode() goes straight from text to a list of token ids and also adds the
# special start/stop tokens.
# Note: this rebinds `ids`, so the cells below work on the version that
# includes the special tokens.

In [None]:
tokenizer.convert_ids_to_tokens(ids) 
# ids => tokens; the list now includes BERT's special [CLS] and [SEP] tokens.

In [None]:
tokenizer.decode(ids) 
# Decoded text including the special tokens: this is the true input sequence
# the BERT model receives.

In [None]:
# Full encoding for a single sentence. No return_tensors is requested here,
# so the values are not PyTorch tensors yet.
model_inputs = tokenizer("hello world")
model_inputs

In [None]:
# Sample batch of two sentences (typo "linke" fixed so it matches the same
# sample data used later in the notebook).
data = [
    "I like cats.",
    "Do you like cats too?",
]
# Passing a list of sentences tokenizes all of them in one call.
tokenizer(data)

In [None]:
from transformers import AutoModelForSequenceClassification

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint) 
# The classification head of this model has random weights, so it has to be
# trained before its predictions are meaningful.

In [None]:
# Expected failure, shown on purpose: model_inputs was built without
# return_tensors, but the model takes PyTorch tensors ("pt").
# The error is caught and displayed so that Restart & Run All can continue
# past this cell instead of stopping here.
try:
    outputs = model(**model_inputs)
except Exception as err:  # exact exception type depends on the library version
    print(f"{type(err).__name__}: {err}")

In [None]:
# Request PyTorch tensors ("pt") from the tokenizer so the encoding can be
# passed straight to the model.
model_inputs = tokenizer("hello world", return_tensors="pt")
model_inputs

In [None]:
# No number of labels was specified when loading the model, so the framework
# assumes we want a binary classifier.
outputs = model(**model_inputs) 
outputs
# The logits are meaningless at this point because the head still has random weights.

In [None]:
# Reload the model with a 3-label classification head instead of the default.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)

In [None]:
# Run the forward pass again with the reloaded model; the logits now carry
# three values, one per label.
outputs = model(**model_inputs) 
outputs
# 3-valued logits 

In [None]:
# Access the logits as an attribute of the output object.
outputs.logits

In [None]:
# The same logits, accessed dictionary-style by key.
outputs['logits']

In [None]:
# Positional access to the first output field. Presumably this is the logits
# again, since no labels were passed and so no loss precedes them — verify
# against the two cells above.
outputs[0]

In [None]:
# Convert the logits into a NumPy array, e.g. to compute metrics:
# drop the gradient tracking, move the tensor to the CPU, then convert.
outputs.logits.detach().cpu().numpy()

In [None]:
data = [
    "I like cats.",
    "Do you like cats too?",
]

# Expected failure, shown on purpose: the two sentences tokenize to different
# lengths, and without padding they cannot be packed into one tensor.
# The error is caught and displayed so that Restart & Run All can continue
# past this cell instead of stopping here.
try:
    model_inputs = tokenizer(data, return_tensors="pt")
except ValueError as err:
    print(f"ValueError: {err}")
else:
    # Only reached if the call unexpectedly succeeds; show the encoding as before.
    display(model_inputs)

In [None]:
# Enable padding (and truncation) so every sentence in the batch ends up with
# the same length and the batch can be returned as PyTorch tensors.
model_inputs = tokenizer(
    data,
    padding=True,
    truncation=True,
    return_tensors="pt",
)
model_inputs

In [None]:
# Token ids of the batch as produced by the tokenizer.
model_inputs['input_ids']

In [None]:
# Mask telling the model which positions should enter into the attention computation.
model_inputs['attention_mask']

In [None]:
# Forward pass on the current model_inputs; the output should hold one row of
# logits per input sentence.
outputs = model(**model_inputs) 
outputs