In [7]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# AutoTokenizer: splits the input text into tokens and encodes them as ids.
# AutoModelForSequenceClassification: loads the specified model and puts a fully
# connected layer on top of the [CLS] output for text classification.

model_name = "bert-base-chinese" # any other checkpoint name can be used here

# NOTE(review): the load warning reports that some BertForSequenceClassification
# weights are not initialized from the checkpoint - presumably the new
# classification layer, so the model would need fine-tuning before its
# predictions are meaningful (confirm).
pt_model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [9]:
# Encode a single string; the result holds three parallel lists.
inputs = tokenizer("me")
inputs
# input_ids: one vocabulary id per token, wrapped in the special start/end ids (101 ... 102).
# token_type_ids: segment id of each token; all 0 because only one sequence is encoded.
# attention_mask: 1 marks a real token; padded positions (none for a single input) would be 0.

{'input_ids': [101, 791, 1921, 1921, 3698, 2523, 1962, 8024, 852, 3209, 1921, 833, 678, 7433, 511, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [12]:
# Encode two sentences together as one rectangular batch.
pt_batch = tokenizer(
    ["hello", "now"],
    padding=True,  # pad the shorter sequence up to the longest one in the batch
    truncation=True,  # cut any sequence that exceeds max_length
    max_length=512,  # presumably the model's maximum input length - TODO confirm
    return_tensors="pt"  # return PyTorch tensors instead of plain Python lists
)

In [13]:
# Print every field of the batch as plain nested lists so the padding
# (0 ids and 0 mask entries) is easy to read.
# Tensor.tolist() converts directly; going through .numpy() first is unnecessary
# and would raise for tensors that do not live on the CPU.
for key, value in pt_batch.items():
    print(f"{key}: {value.tolist()}")

input_ids: [[101, 791, 1921, 678, 7433, 102, 0, 0, 0, 0], [101, 3209, 1921, 678, 7433, 1400, 1921, 678, 7433, 102]]
token_type_ids: [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
attention_mask: [[1, 1, 1, 1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]


In [15]:
from transformers import BertTokenizer
# NOTE(review): this rebinds `tokenizer`, replacing the one loaded from `model_name`
# earlier; any earlier cell re-run after this point would use the
# bert-base-cased vocabulary instead.
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

sequence = "A Titan RTX has 24GB of VRAM"
# tokenize() only splits the text into token strings; it does not map them to ids.
tokenized_sequence = tokenizer.tokenize(sequence)

In [16]:
# Show the tokens; a "##" prefix marks a piece that continues the previous token
# (e.g. "RTX" is split into 'R', '##T', '##X').
tokenized_sequence

['A',
 'Titan',
 'R',
 '##T',
 '##X',
 'has',
 '24',
 '##GB',
 'of',
 'V',
 '##RA',
 '##M']

In [17]:
# Calling the tokenizer itself (rather than .tokenize) returns the ids together with
# token_type_ids and attention_mask. This rebinds the `inputs` name used earlier.
inputs = tokenizer(sequence)

In [18]:
# 14 ids for the 12 tokens: the special ids 101 and 102 are added at the two ends.
inputs

{'input_ids': [101, 138, 18696, 155, 1942, 3190, 1144, 1572, 13745, 1104, 159, 9664, 2107, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [19]:
from transformers import BertTokenizer
# Loads the same bert-base-cased tokenizer again; an earlier cell already created it.
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

# Two sentences of clearly different length.
sequence_a = "This is a short sequence."
sequence_b = "This is a rather long sequence. It is at least longer than the sequence A."

# Keep only the token ids of each encoding.
encoded_sequence_a = tokenizer(sequence_a)["input_ids"]
encoded_sequence_b = tokenizer(sequence_b)["input_ids"]

In [20]:
# 8 ids: 101 and 102 enclose the six ids of the sentence itself.
encoded_sequence_a

[101, 1188, 1110, 170, 1603, 4954, 119, 102]

In [4]:
from transformers import BertTokenizer
# Loads the bert-base-cased tokenizer (same checkpoint as in the earlier cells).
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
sequence_a = "HuggingFace is based in NYC. So my"
# NOTE(review): sequence_b is defined but never passed to the tokenizer in this cell.
sequence_b = "Where is HuggingFace based?"

encoded_dict = tokenizer(sequence_a)
# decode() turns the ids back into text, including the special tokens that were added.
decoded = tokenizer.decode(encoded_dict["input_ids"])

In [5]:
# The decoded text shows the [CLS] and [SEP] markers placed around the sentence.
decoded

'[CLS] HuggingFace is based in NYC. So my [SEP]'

In [6]:
# Full encoding: input_ids, token_type_ids and attention_mask, 13 entries each.
encoded_dict

{'input_ids': [101, 20164, 10932, 2271, 7954, 1110, 1359, 1107, 17520, 119, 1573, 1139, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}