In [75]:
import torch
from transformers import AutoTokenizer, AutoModel

In [18]:
# Identifier of the pretrained checkpoint; the model is loaded from this same name further down.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
# Load the tokenizer associated with that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [99]:
# Two example sentences of different lengths, reused throughout the notebook.
sequence1, sequence2 = (
    "I've been waiting for a HuggingFace course my whole life.",
    "So have I!",
)

In [100]:
# Encode a single sentence; the returned mapping carries `input_ids` and `attention_mask`.
prep_inputs = tokenizer(sequence1)

In [101]:
# Show the encoding of the single sentence: flat lists, one id and one mask value per token.
prep_inputs

{'input_ids': [101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [102]:
# Batch both sentences so they can be tokenized in a single call.
sequences = [sequence1, sequence2]

In [103]:
# padding=True pads every sequence to the length of the longest one in the batch.
prep_inputs = tokenizer(sequences, padding=True)

In [104]:
# The shorter sentence is right-padded with id 0, and its attention mask is 0 at those padded positions.
prep_inputs

{'input_ids': [[101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102], [101, 2061, 2031, 1045, 999, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]}

# Padding options

In [105]:
# Each call below overwrites prep_inputs; only the last result survives the cell.
# Pad to the longest sequence in the batch (same effect as padding=True)
prep_inputs = tokenizer(sequences, padding="longest")

# Pad up to the model's maximum input length
prep_inputs = tokenizer(sequences, padding="max_length")

# Pad up to an explicitly specified length (32 tokens here)
prep_inputs = tokenizer(sequences, padding="max_length", max_length=32)

# Truncation

In [106]:
# Truncate sequences that are longer than the model max length
prep_inputs = tokenizer(sequences, truncation=True)

# Truncate sequences that are longer than the specified max length (8 tokens here)
prep_inputs = tokenizer(sequences, truncation=True, max_length=8)

In [107]:
# Only the first sentence exceeded 8 tokens and was cut; no padding was requested, so the two rows differ in length.
prep_inputs

{'input_ids': [[101, 1045, 1005, 2310, 2042, 3403, 2005, 102], [101, 2061, 2031, 1045, 999, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]]}

# Return tensors

In [108]:
# padding=True keeps the batch rectangular so it can be returned as a tensor/array.
# "pt" returns PyTorch tensors (requires PyTorch to be installed).
prep_inputs = tokenizer(sequences, padding=True, return_tensors="pt")
# "tf" returns TensorFlow tensors; left commented out because it won't run if TensorFlow is not installed
# prep_inputs = tokenizer(sequences, padding=True, return_tensors="tf")
# "np" returns NumPy arrays; this last call is the value prep_inputs keeps after the cell.
prep_inputs = tokenizer(sequences, padding=True, return_tensors="np")

# Special tokens

In [109]:
# Manual route: split the sentence into tokens, then map each token to its vocabulary id.
tokens = tokenizer.tokenize(sequence1)
ids = tokenizer.convert_tokens_to_ids(tokens)
# One-step route through the tokenizer call, kept for comparison with `ids`.
prep_inputs = tokenizer(sequence1)

In [110]:
# Compare the ids from the one-step tokenizer call with the ids built manually from tokens.
print(prep_inputs["input_ids"])
print(ids)

[101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102]
[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012]


In [111]:
# Decoding reveals the special tokens added by the tokenizer call: [CLS] at the start, [SEP] at the end.
print(tokenizer.decode(prep_inputs["input_ids"]))
print(tokenizer.decode(ids))

[CLS] i've been waiting for a huggingface course my whole life. [SEP]
i've been waiting for a huggingface course my whole life.


In [112]:
# Tokenize the batch as padded PyTorch tensors, ready to be passed to the model.
prep_inputs = tokenizer(sequences, padding=True, return_tensors="pt")
# Load the model for the same checkpoint the tokenizer was created from.
model = AutoModel.from_pretrained(checkpoint)

In [113]:
# Forward pass for inspection only: disable autograd so no computation graph
# is built and no gradient buffers are kept alive for the outputs.
with torch.no_grad():
    # Unpack the encoding so each entry (input_ids, attention_mask) becomes a keyword argument.
    res = model(**prep_inputs)

In [114]:
# Shape of the model's first output; per the result below: 2 sequences x 16 padded tokens x 768 features.
res[0].shape

torch.Size([2, 16, 768])