In [1]:
from transformers import AutoTokenizer

In [5]:
# Load the pre-trained BERT tokenizer for the checkpoint named below
MODEL_NAME = "google-bert/bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [22]:
# Sample text that the tokenizer will pre-process in the cells below
texte = "Can I find information about SALOME platform?"

## Utilisation globale pour la tokenization

In [20]:
# One-shot usage: calling the tokenizer directly on the text returns the whole
# encoding (input_ids, token_type_ids, attention_mask) in a single step.
encoding = tokenizer(texte)
print(encoding)

{'input_ids': [101, 2064, 1045, 2424, 2592, 2055, 16183, 8462, 4132, 1029, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


## Séparation des étapes pour la tokenization

In [21]:
# Step 1: split the text into tokens (no special tokens are added at this stage)
tokens = tokenizer.tokenize(texte)
print(tokens)

['can', 'i', 'find', 'information', 'about', 'sal', '##ome', 'platform', '?']


In [6]:
# Step 2: map each token to its ID, as defined by the tokenizer's vocabulary
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)

[2064, 1045, 2424, 2592, 2055, 16183, 8462, 4132, 1029]


In [7]:
# Step 3: add the IDs of the special tokens around the sequence
# (101 at the start and 102 at the end in the printed result).
# NOTE: despite its name, `input_ids` holds the whole encoding returned by
# prepare_for_model (keys: input_ids, token_type_ids, attention_mask), which
# is why the following cells index it with input_ids["input_ids"].
input_ids = tokenizer.prepare_for_model(ids)
print(input_ids)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': [101, 2064, 1045, 2424, 2592, 2055, 16183, 8462, 4132, 1029, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [8]:
# Show the tokens, special tokens included.
# convert_ids_to_tokens maps each ID straight back to its vocabulary entry.
# This avoids a decode() -> tokenize() round trip, which rebuilds a string and
# re-splits it, and is therefore not guaranteed to return the original tokens.
print(tokenizer.convert_ids_to_tokens(input_ids["input_ids"]))

['[CLS]', 'can', 'i', 'find', 'information', 'about', 'sal', '##ome', 'platform', '?', '[SEP]']


## Décodage avec le tokenizer

In [9]:
# Turn the IDs back into a single string (special tokens are kept in the result)
tokenizer.decode(input_ids["input_ids"])

'[CLS] can i find information about salome platform? [SEP]'

## Prétraitement de plusieurs textes

In [10]:
# Batch of sentences of different lengths, reused by the padding,
# truncation and tensor sections below
batch_sentences = [
    "Can I find information about SALOME platform?",
    "Where is located CEA Research Center?",
    "Is it good?"
]

In [11]:
# Encode the whole batch in one call; without padding, each sentence keeps
# its own length, so the resulting lists are ragged
encoded_inputs = tokenizer(batch_sentences)
print(encoded_inputs)

{'input_ids': [[101, 2064, 1045, 2424, 2592, 2055, 16183, 8462, 4132, 1029, 102], [101, 2073, 2003, 2284, 8292, 2050, 2470, 2415, 1029, 102], [101, 2003, 2009, 2204, 1029, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]]}


## Padding

**Padding** makes the batch rectangular: a special padding token is appended to the shorter sentences so that every sequence has the same length as the longest one:

In [12]:
# padding=True pads every sentence up to the length of the longest one in the batch
encoded_inputs = tokenizer(batch_sentences, padding=True)

# Final model inputs for every sentence
print(encoded_inputs)

print("\n")

# Tokens of every sentence, padding included.
# Looping over the batch (instead of hard-coding indices 0, 1, 2) keeps this cell
# valid when batch_sentences changes size, and convert_ids_to_tokens reads the
# tokens straight from the IDs rather than decoding to a string and re-tokenizing.
for sentence_ids in encoded_inputs["input_ids"]:
    print(tokenizer.convert_ids_to_tokens(sentence_ids))

{'input_ids': [[101, 2064, 1045, 2424, 2592, 2055, 16183, 8462, 4132, 1029, 102], [101, 2073, 2003, 2284, 8292, 2050, 2470, 2415, 1029, 102, 0], [101, 2003, 2009, 2204, 1029, 102, 0, 0, 0, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0], [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]]}


['[CLS]', 'can', 'i', 'find', 'information', 'about', 'sal', '##ome', 'platform', '?', '[SEP]']
['[CLS]', 'where', 'is', 'located', 'ce', '##a', 'research', 'center', '?', '[SEP]', '[PAD]']
['[CLS]', 'is', 'it', 'good', '?', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']


## Truncation

With **truncation**, a sequence is cut down to `max_length` tokens when that argument is given, or otherwise to the maximum length accepted by the model:

In [13]:
# truncation=True with max_length=5 cuts every sentence down to 5 IDs;
# the special tokens count towards that limit (see the printed result).
encoded_inputs = tokenizer(batch_sentences, padding=True, max_length=5, truncation=True)

# Final model inputs for every sentence
print(encoded_inputs)

print("\n")

# Tokens of every sentence after padding and truncation.
# Looping over the batch (instead of hard-coding indices 0, 1, 2) keeps this cell
# valid when batch_sentences changes size, and convert_ids_to_tokens reads the
# tokens straight from the IDs rather than decoding to a string and re-tokenizing.
for sentence_ids in encoded_inputs["input_ids"]:
    print(tokenizer.convert_ids_to_tokens(sentence_ids))

{'input_ids': [[101, 2064, 1045, 2424, 102], [101, 2073, 2003, 2284, 102], [101, 2003, 2009, 2204, 102]], 'token_type_ids': [[0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1], [1, 1, 1, 1, 1], [1, 1, 1, 1, 1]]}


['[CLS]', 'can', 'i', 'find', '[SEP]']
['[CLS]', 'where', 'is', 'located', '[SEP]']
['[CLS]', 'is', 'it', 'good', '[SEP]']


## Construction des tenseurs

Pour pouvoir utiliser ces inputs dans un modèle, il faut les placer dans un tenseur. Pour cela, on utilise l'argument `return_tensors` :

In [14]:
# Same batch encoding, returned as PyTorch tensors ("pt") instead of Python lists
encoded_inputs = tokenizer(
    batch_sentences,
    padding=True,
    truncation=True,
    return_tensors="pt",
)

# Final model inputs for every sentence
print(encoded_inputs)

{'input_ids': tensor([[  101,  2064,  1045,  2424,  2592,  2055, 16183,  8462,  4132,  1029,
           102],
        [  101,  2073,  2003,  2284,  8292,  2050,  2470,  2415,  1029,   102,
             0],
        [  101,  2003,  2009,  2204,  1029,   102,     0,     0,     0,     0,
             0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]])}


## Décodage avec le tokenizer pour un batch

In [16]:
# batch_decode turns every row of IDs back into a string (one string per
# sentence, not a token list); padding positions show up as [PAD]
decoded_batch = tokenizer.batch_decode(encoded_inputs["input_ids"])
print(decoded_batch)

['[CLS] can i find information about salome platform? [SEP]', '[CLS] where is located cea research center? [SEP] [PAD]', '[CLS] is it good? [SEP] [PAD] [PAD] [PAD] [PAD] [PAD]']
