# Tokenization Benchmark

- Sur la base d'un court texte, ce notebook compare les méthodes de tokenisation les plus courantes, issues de bibliothèques (Gensim, NLTK, spaCy, TextBlob, Keras, Moses, Transformers) ou construites à partir d'expressions régulières (regex).

In [1]:
# Single benchmark text fed to every tokenizer in the cell below. It mixes elisions
# (C'est, l'on), HTML-like tags, hashtags, a double space, parentheses and brackets,
# an amount with a currency sign, an emoticon, an emoji, a date, URLs, an @mention
# and an e-mail address.
sample_text = "C'est 1 test pour mieux <h1> tokenizer <h1> les textes en  #NLP ou #TALN (Natural Language Processing ou traitement automatique du language naturel) sans dépenser 100$ ;-) ! J'espère que l'on trouvera la solution #optimale ou à défaut la meilleure [possible] et aller 120 fois + vite. Depuis le 18/10/2021, je travaille sur les Regex et j'utilise https://towardsdatascience.com/ ou des bit.ly. Merci @fchollet (françois.chollet@google.com) pour le livre sur le Deep Learning & Jens Albrecht, Sidharth Ramachandran, Christian Winkler pour l'excellent ouvrage NLP pic.twitter.com/abc sans oublier Laurence Moroney 😜."

In [13]:
# Gensim: word-level tokens, with deacc=True so accent marks are removed.
# Scratch pattern left by the author, kept for reference only:
#   r'\w+(?:\'\w+){0,2}|[^\w\s]'
print("Tokens Gensim Token Level")

from gensim.utils import tokenize

gensim_tokens = [word for word in tokenize(sample_text, deacc=True)]
# One print with three trailing newlines emits the same bytes as print(x); print("\n").
print(gensim_tokens, end="\n\n\n")

# NLTK word_tokenize, first without an explicit language ...
print("Tokens NTLK Word_tokenize")
from nltk.tokenize import word_tokenize

tokens = word_tokenize(sample_text)
print(tokens, end="\n\n\n")

# ... then with the French model requested explicitly.
print("Tokens NTLK Word_tokenize French Language")
tokens = word_tokenize(text=sample_text, language='french')
print(tokens, end="\n\n\n")

# NLTK: tokenizer object loaded directly from its resource path.
print("Tokens NTLK Pickle French")
import nltk.data

# Load the tokenizer stored under this NLTK resource name.
tokenizer = nltk.data.load('tokenizers/punkt/french.pickle')
punkt_output = tokenizer.tokenize(sample_text)
print(punkt_output, end="\n\n\n")

# NLTK WhitespaceTokenizer demo.
print("Tokens Regex WhitespaceTokenizer")
from nltk.tokenize import WhitespaceTokenizer

tokenizer = WhitespaceTokenizer()
whitespace_output = tokenizer.tokenize(sample_text)
print(whitespace_output, end="\n\n\n")

from nltk.tokenize import sent_tokenize

# Split into sentences once with the French model and reuse the result for both views.
# The original second view called sent_tokenize without language='french', so its
# sentence boundaries could differ from the ones printed just above.
french_sentences = sent_tokenize(sample_text, language='french')

print("Tokens NTLK Quatro Sentence Level")
print(french_sentences)
print("\n")

# Each sentence is then tokenized into words, also with the French model.
print("Tokens NTLK Quatro Sentence /Token Level")
print([word_tokenize(sentence, language='french') for sentence in french_sentences])
print("\n")

# NLTK TweetTokenizer demo.
print("Tokens NTLK TweetTokenizer")
from nltk.tokenize import TweetTokenizer

# Both names refer to the same TweetTokenizer instance, as in the original cell.
tweet_tokenizer = tokenizer = TweetTokenizer()
print(tokenizer.tokenize(sample_text), end="\n\n\n")
      
# NLTK sent_tokenize called without an explicit language.
print("Tokens NLTK Sent_tokenize")
from nltk.tokenize import sent_tokenize

# Despite its name, `tokenizer` holds the value returned by sent_tokenize here,
# not a tokenizer object.
tokenizer = sent_tokenize(text=sample_text)
print(tokenizer, end="\n\n\n")

# NLTK WordPunctTokenizer demo (the author marked this one as OK).
print("Tokens WordPunctTokenizer")  # OK
from nltk.tokenize import WordPunctTokenizer

tokenizer = WordPunctTokenizer()
wordpunct_output = tokenizer.tokenize(sample_text)
print(wordpunct_output, end="\n\n\n")

# NLTK MWETokenizer: merges registered multi-word expressions in an already
# tokenized text, so it is applied to the output of word_tokenize.
print ("Tokens NWET Tokenizer")
from nltk.tokenize import MWETokenizer
tokenizer = MWETokenizer()
tokenizer.add_mwe(('Deep', 'Learning'))
tokenizer.add_mwe(('Natural', 'Language', 'Processing'))
# Fixed quoting: the original passed "du', 'language" as one single string, so this
# four-word expression could never match the separate tokens 'du' and 'language'.
tokenizer.add_mwe(('traitement', 'automatique', 'du', 'language'))
print(tokenizer.tokenize(word_tokenize(sample_text)))
print("\n")

import spacy

# spaCy: run the French pipeline and keep the text of every token.
print("Tokens Spacy Tokenizer Level")
from spacy.tokenizer import Tokenizer

nlp_fr = spacy.load('fr_core_news_sm')
doc = nlp_fr(sample_text)
tokens = [spacy_token.text for spacy_token in doc]
print(tokens, end="\n\n\n")
      
# spaCy: add a 'sentencizer' component to the pipeline, re-process the text and
# collect the text of each sentence span.
print("Tokens Spacy Sentence Level")
nlp_fr.add_pipe('sentencizer')
doc = nlp_fr(sample_text)
sents_list = [sentence_span.text for sentence_span in doc.sents]
print(sents_list, end="\n\n\n")

from nltk.tokenize import RegexpTokenizer

# Regex labelled "Scikit Learn" by the author: runs of two or more word characters
# delimited by word boundaries.
print("Tokens Scikit Learn")
tokenizer_scikit = RegexpTokenizer(pattern=r'(?u)\b\w\w+\b')
scikit_output = tokenizer_scikit.tokenize(sample_text)
print(scikit_output, end="\n\n\n")

import re
# Regex A: a word, or a '$' followed by digits/dots, or any other run of
# non-whitespace characters.
print ("Tokens Regex A")
# Raw string: the pattern value is unchanged, but '\w', '\$', '\d', '\.' and '\S' are
# no longer invalid escape sequences in a normal string literal (these emit a warning
# on current Python versions).
tokenizer_a = RegexpTokenizer(r'(\w+|\$[\d\.]+|\S+)')
print(tokenizer_a.tokenize(sample_text))
print("\n")
      
# Regex B: a word character followed by an apostrophe, or a word, or any single
# character that is neither a word character nor whitespace.
print("Tokens Regex B")
tokenizer_b = RegexpTokenizer(pattern=r'''\w'|\w+|[^\w\s]''')
regex_b_output = tokenizer_b.tokenize(sample_text)
print(regex_b_output, end="\n\n\n")
      
# Regex C: runs of at least three ASCII letters or hyphens.
print ("Tokens Regex C")
# Raw string: same pattern value, but '\-' is no longer an invalid escape sequence in
# a normal string literal.
tokenizer_c = RegexpTokenizer(r'[a-zA-Z\-][a-zA-Z\-]{2,}')
print(tokenizer_c.tokenize(sample_text))
print("\n")
      
# Regex D: a word, or '#' followed by one digit, or a '?' or '!' character.
print ("Tokens Regex D")
# Raw string: same pattern value, but '\w', '\d' and '\?' are no longer invalid escape
# sequences in a normal string literal.
tokenizer_d = RegexpTokenizer(r'(\w+|#\d|\?|!)')
print(tokenizer_d.tokenize(sample_text))
print("\n")
      
# Regex E: split the text on runs of whitespace with the standard `re` module.
print("Tokens Regex E")
espace = r"\s+"
whitespace_split = re.compile(espace).split(sample_text)
print(whitespace_split, end="\n\n\n")

# Regex F: same alternatives as Regex A, applied through NLTK's regexp_tokenize helper.
print ("Tokens Regex F")
from nltk.tokenize import regexp_tokenize
# Raw string: same pattern value, without invalid escape sequences in a normal
# string literal.
pattern = r'\w+|\$[\d\.]+|\S+'
print(regexp_tokenize(sample_text, pattern))
print("\n")
            
# Regex H: a word, or '#' followed by one digit, or a '?' or '!' character.
print("Tokens Regex H")
tokenizer = RegexpTokenizer(pattern=r"(\w+|#\d|\?|!)")
regex_h_output = tokenizer.tokenize(sample_text)
print(regex_h_output, end="\n\n\n")
      
# Regex I: runs of word characters and apostrophes, found with the standard `re` module.
print ("Tokens Regex I")
# Raw string: same pattern value, but '\w' is no longer an invalid escape sequence in
# a normal string literal.
print(re.findall(r"[\w']+", sample_text))
print("\n")
      
# Regex J: sentence-level split on '.', '!' or '?' when followed by a space.
print("Tokens Regex J Sentence Level")
sentences = re.split('[.!?] ', sample_text)
print(sentences, end="\n\n\n")
      
# Regex K: uses the same pattern as the "Regex B" section of this cell.
print("Tokens Regex K")
tokenizer = RegexpTokenizer(pattern=r'''\w'|\w+|[^\w\s]''')
regex_k_output = tokenizer.tokenize(sample_text)
print(regex_k_output, end="\n\n\n")
         
# TextBlob: wrap the text in a TextBlob and print its `words` attribute.
print("Tokens TextBlob")
from textblob import TextBlob

blob = TextBlob(sample_text)
print(blob.words, end="\n\n\n")
      
# Keras: text_to_word_sequence helper with its default arguments.
print("Tokens Keras")
from keras.preprocessing.text import text_to_word_sequence

keras_words = text_to_word_sequence(sample_text)
print(keras_words, end="\n\n\n")

# Keras Tokenizer fitted on the sample text.
print ("Tokens Keras + paramètre longueur tokens")
from keras.preprocessing.text import Tokenizer
# The first Tokenizer argument is num_words; it is passed by keyword for clarity.
# The value used here is the number of characters of the sample, as in the original.
ntoken = Tokenizer(num_words=len(sample_text))
# fit_on_texts expects a collection of texts. The original passed the bare string,
# which is iterated character by character, so the fitted vocabulary was built from
# single characters instead of words.
ntoken.fit_on_texts([sample_text])
# The printed tokens still come from text_to_word_sequence, as in the original cell.
print(text_to_word_sequence(sample_text))
print("\n")

# Same word-sequence helper, shown under the TensorFlow label.
# NOTE(review): the Tokenizer imported below is not used in this section, and
# text_to_word_sequence is the name imported earlier in the cell - confirm whether the
# tensorflow.keras variant was intended.
print("Tokens Keras Tensorflow")
from tensorflow.keras.preprocessing.text import Tokenizer

tokens = text_to_word_sequence(sample_text)
print(tokens, end="\n\n\n")

# Moses tokenizer (wrapper around the Moses perl scripts).
print ("Tokens Moses")
from mosestokenizer import MosesTokenizer, MosesDetokenizer
# The sample is French, so request the French rules instead of relying on the default
# language; this matches the French settings used for NLTK and spaCy in this cell.
tokenizer = MosesTokenizer('fr')
try:
    print(tokenizer(sample_text))
finally:
    # Close the tokenizer so its helper perl process is not left running.
    tokenizer.close()
print("\n")

# Hugging Face tokenizer of the "tblard/tf-allocine" checkpoint: sub-word tokens and
# their vocabulary ids.
print("Tokens Regex Bert")
from tensorflow import keras
# NOTE(review): only AutoTokenizer is used in the visible part of this cell; the other
# names are imported exactly as before in case later cells rely on them.
from transformers import (
    AutoTokenizer,
    TFAutoModelForSequenceClassification,
    BertModel,
    AdamW,
    get_linear_schedule_with_warmup,
    BertTokenizer,
    BertConfig,
    BertForSequenceClassification,
    pipeline,
)

tokenizer = AutoTokenizer.from_pretrained("tblard/tf-allocine")
tokens = tokenizer.tokenize(sample_text)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

# Each report line is followed by the same blank lines as the original print("\n") calls.
print(f' Sentence: {sample_text}', end="\n\n\n")
print(f'   Tokens: {tokens}', end="\n\n\n")
print(f'Token IDs: {token_ids}', end="\n\n\n")


Tokens Gensim Token Level
['C', 'est', 'test', 'pour', 'mieux', 'h', 'tokenizer', 'h', 'les', 'textes', 'en', 'NLP', 'ou', 'TALN', 'Natural', 'Language', 'Processing', 'ou', 'traitement', 'automatique', 'du', 'language', 'naturel', 'sans', 'depenser', 'J', 'espere', 'que', 'l', 'on', 'trouvera', 'la', 'solution', 'optimale', 'ou', 'a', 'defaut', 'la', 'meilleure', 'possible', 'et', 'aller', 'fois', 'vite', 'Depuis', 'le', 'je', 'travaille', 'sur', 'les', 'Regex', 'et', 'j', 'utilise', 'https', 'towardsdatascience', 'com', 'ou', 'des', 'bit', 'ly', 'Merci', 'fchollet', 'francois', 'chollet', 'google', 'com', 'pour', 'le', 'livre', 'sur', 'le', 'Deep', 'Learning', 'Jens', 'Albrecht', 'Sidharth', 'Ramachandran', 'Christian', 'Winkler', 'pour', 'l', 'excellent', 'ouvrage', 'NLP', 'pic', 'twitter', 'com', 'abc', 'sans', 'oublier', 'Laurence', 'Moroney']


Tokens NTLK Word_tokenize
["C'est", '1', 'test', 'pour', 'mieux', '<', 'h1', '>', 'tokenizer', '<', 'h1', '>', 'les', 'textes', 'en', '#'

stdbuf was not found; communication with perl may hang due to stdio buffering.


["C'", 'est', '1', 'test', 'pour', 'mieux', '<', 'h1', '>', 'tokenizer', '<', 'h1', '>', 'les', 'textes', 'en', ' ', '#', 'NLP', 'ou', '#', 'TALN', '(', 'Natural', 'Language', 'Processing', 'ou', 'traitement', 'automatique', 'du', 'language', 'naturel', ')', 'sans', 'dépenser', '100', '$', ';-)', '!', "J'", 'espère', 'que', "l'", 'on', 'trouvera', 'la', 'solution', '#', 'optimale', 'ou', 'à', 'défaut', 'la', 'meilleure', '[', 'possible', ']', 'et', 'aller', '120', 'fois', '+', 'vite', '.', 'Depuis', 'le', '18/10/2021', ',', 'je', 'travaille', 'sur', 'les', 'Regex', 'et', "j'", 'utilise', 'https://towardsdatascience.com/', 'ou', 'des', 'bit.ly', '.', 'Merci', '@fchollet', '(', 'françois.chollet@google.com', ')', 'pour', 'le', 'livre', 'sur', 'le', 'Deep', 'Learning', '&', 'Jens', 'Albrecht', ',', 'Sidharth', 'Ramachandran', ',', 'Christian', 'Winkler', 'pour', "l'", 'excellent', 'ouvrage', 'NLP', 'pic.twitter.com/abc', 'sans', 'oublier', 'Laurence', 'Moroney', '😜', '.']


Tokens Spacy S