In [7]:
from tokenizers import Tokenizer, normalizers, models

In [8]:
# Build the tokenizer used by the normalizer demos: a Tokenizer wrapping a
# BPE model that is configured with '[UNK]' as its unknown token.
tokenizer = Tokenizer(model=models.BPE(unk_token='[UNK]'))
# Sample sentence. It is not used by the normalizer cells that follow;
# presumably later cells consume it -- TODO confirm.
# NOTE(review): 'occurence' is a misspelling of 'occurrence'; left as-is
# because this is runtime text that other cells' outputs may depend on.
string = 'This pre-tokenizer splits tokens on spaces, and also on punctuation. Each occurence of a punctuation character will be treated separately.'

### Strip

In [9]:
# Strip normalizer: `left=True` and `right=True` request stripping of
# whitespace on both ends of the input.
tokenizer.normalizer = normalizers.Strip(left=True, right=True)
# The sample input starts with a space; the output shown below has it removed.
tokenizer.normalizer.normalize_str(' my love is coding!')

'my love is coding!'

### Lowercase

In [10]:
# Lowercase normalizer: replaces the tokenizer's current normalizer with one
# that lowercases its input.
tokenizer.normalizer = normalizers.Lowercase()
# The sample input has a capital 'M'; the output shown below is all lowercase.
tokenizer.normalizer.normalize_str('My love is coding!')

'my love is coding!'

### BertNormalizer

In [17]:
# Normalizes raw text before it is given to a BERT model. This includes
# cleaning the text, handling accents and Chinese characters, and lowercasing.
tokenizer.normalizer = normalizers.BertNormalizer(
    # Whether to clean the text by removing control characters and replacing every whitespace character with a plain space.
    clean_text=True,
    # Whether to handle Chinese characters by putting spaces around them.
    handle_chinese_chars=True,
    # Whether to strip all accents. If this option is not specified (i.e. None), it is determined by the value of `lowercase` (as in the original BERT).
    strip_accents=True,
    # Whether to lowercase.
    lowercase=True)
# The sample input mixes a leading space, upper-case letters, a newline and
# Chinese characters, so the effect of each option is visible in the output below.
tokenizer.normalizer.normalize_str(' My Love\nIs 打游戏!')

' my love is  打  游  戏 !'

### Sequence

In [18]:
# Sequence chains several normalizers into one; they run one after another in
# the order given. Here Strip runs first, then a BertNormalizer created with
# its default options.
tokenizer.normalizer = normalizers.Sequence([normalizers.Strip(),
                                             normalizers.BertNormalizer()])
# Same sample input as the BertNormalizer cell; in the output shown below the
# leading space is gone as well.
tokenizer.normalizer.normalize_str(' My Love\nIs 打游戏!')

'my love is  打  游  戏 !'