In [1]:
# Sample strings used to compare tokenizers. Each one exercises a different
# case: a URL, a slash-joined pair, a run of emoji, a possessive, an @-mention,
# two contractions and a hyphenated word.
compare_list = [
    "https://t.co/9z2J3P33Uc",
    "laugh/cry",
    "😬😭😓🤢🙄😱",
    "world's problems",
    "@datageneral",
    "It's interesting",
    "don't spell my name right",
    "all-nighter",
]

In [2]:
# WORD_TOKENIZE: NLTK's general-purpose word tokenizer.
# On compare_list it breaks the URL at ':', detaches clitics ("'s", "n't") and
# separates '@' from the handle, but keeps 'laugh/cry', 'all-nighter' and the
# emoji run as single tokens.

from nltk.tokenize import word_tokenize
word_tokens = []
for sent in compare_list:
    # Tokenize once and reuse the result for both display and storage.
    tokens = word_tokenize(sent)
    print(tokens)
    word_tokens.append(tokens)

['https', ':', '//t.co/9z2J3P33Uc']
['laugh/cry']
['😬😭😓🤢🙄😱']
['world', "'s", 'problems']
['@', 'datageneral']
['It', "'s", 'interesting']
['do', "n't", 'spell', 'my', 'name', 'right']
['all-nighter']


In [3]:
# WORDPUNCTTOKENIZER: splits every run of punctuation into its own token,
# e.g. '://', "'", '/' and '-' all become separate tokens on compare_list.
# The emoji run is kept together as one token.

from nltk.tokenize import WordPunctTokenizer
punct_tokenizer = WordPunctTokenizer()
punct_tokens = []
for sent in compare_list:
    # Tokenize once and reuse the result for both display and storage.
    tokens = punct_tokenizer.tokenize(sent)
    print(tokens)
    punct_tokens.append(tokens)

['https', '://', 't', '.', 'co', '/', '9z2J3P33Uc']
['laugh', '/', 'cry']
['😬😭😓🤢🙄😱']
['world', "'", 's', 'problems']
['@', 'datageneral']
['It', "'", 's', 'interesting']
['don', "'", 't', 'spell', 'my', 'name', 'right']
['all', '-', 'nighter']


In [4]:
# REGEXPTOKENIZER: lets you control tokenization with a pattern; tokens are
# whatever the pattern matches. [\w']+ matches one or more word characters
# (alphanumerics and underscore) or apostrophes, so contractions stay whole
# while other punctuation and the emoji run are dropped.

from nltk.tokenize import RegexpTokenizer
# Raw string: in a plain literal "\w" is an invalid escape sequence, which
# triggers a warning on current Python versions.
match_tokenizer = RegexpTokenizer(r"[\w']+")
match_tokens = []
for sent in compare_list:
    # Tokenize once and reuse the result for both display and storage.
    tokens = match_tokenizer.tokenize(sent)
    print(tokens)
    match_tokens.append(tokens)

['https', 't', 'co', '9z2J3P33Uc']
['laugh', 'cry']
[]
["world's", 'problems']
['datageneral']
["It's", 'interesting']
["don't", 'spell', 'my', 'name', 'right']
['all', 'nighter']


In [5]:
# RegexpTokenizer can also work by matching the gaps. With gaps=True the
# pattern describes the separators instead of the tokens. \s+ matches one or
# more whitespace characters, so the text is split on whitespace only.

# Raw string: in a plain literal "\s" is an invalid escape sequence, which
# triggers a warning on current Python versions.
space_tokenizer = RegexpTokenizer(r"\s+", gaps=True)
space_tokens = []
for sent in compare_list:
    # Tokenize once and reuse the result for both display and storage.
    tokens = space_tokenizer.tokenize(sent)
    print(tokens)
    space_tokens.append(tokens)

['https://t.co/9z2J3P33Uc']
['laugh/cry']
['😬😭😓🤢🙄😱']
["world's", 'problems']
['@datageneral']
["It's", 'interesting']
["don't", 'spell', 'my', 'name', 'right']
['all-nighter']


In [6]:
# TWEETTOKENIZER: a tokenizer built for tweets. On compare_list it keeps the
# URL, the @-mention, contractions and the hyphenated word intact, splits the
# emoji run into individual emoji, and separates 'laugh/cry' at the slash.
from nltk.tokenize import TweetTokenizer
tweet_tokenizer = TweetTokenizer()
tweet_tokens = []
for sent in compare_list:
    # Tokenize once and reuse the result for both display and storage.
    tokens = tweet_tokenizer.tokenize(sent)
    print(tokens)
    tweet_tokens.append(tokens)

['https://t.co/9z2J3P33Uc']
['laugh', '/', 'cry']
['😬', '😭', '😓', '🤢', '🙄', '😱']
["world's", 'problems']
['@datageneral']
["It's", 'interesting']
["don't", 'spell', 'my', 'name', 'right']
['all-nighter']


In [7]:
# Put everything together.
# Rather than reading each tokenizer's printed output separately, gather the
# results into a single pandas DataFrame: one column per tokenizer and one row
# per string in compare_list.
import pandas as pd

tokenizers = {
    'word_tokenize': word_tokens,
    'WordPunctTokenize': punct_tokens,
    'RegrexTokenizer for matching': match_tokens,
    'RegrexTokenizer for white space': space_tokens,
    'TweetTokenizer': tweet_tokens,
}
df = pd.DataFrame.from_dict(tokenizers)

In [8]:
# Render the tokenizer comparison frame (df) as the cell's output.
display(df)


Unnamed: 0,word_tokenize,WordPunctTokenize,RegrexTokenizer for matching,RegrexTokenizer for white space,TweetTokenizer
0,"[https, :, //t.co/9z2J3P33Uc]","[https, ://, t, ., co, /, 9z2J3P33Uc]","[https, t, co, 9z2J3P33Uc]",[https://t.co/9z2J3P33Uc],[https://t.co/9z2J3P33Uc]
1,[laugh/cry],"[laugh, /, cry]","[laugh, cry]",[laugh/cry],"[laugh, /, cry]"
2,[😬😭😓🤢🙄😱],[😬😭😓🤢🙄😱],[],[😬😭😓🤢🙄😱],"[😬, 😭, 😓, 🤢, 🙄, 😱]"
3,"[world, 's, problems]","[world, ', s, problems]","[world's, problems]","[world's, problems]","[world's, problems]"
4,"[@, datageneral]","[@, datageneral]",[datageneral],[@datageneral],[@datageneral]
5,"[It, 's, interesting]","[It, ', s, interesting]","[It's, interesting]","[It's, interesting]","[It's, interesting]"
6,"[do, n't, spell, my, name, right]","[don, ', t, spell, my, name, right]","[don't, spell, my, name, right]","[don't, spell, my, name, right]","[don't, spell, my, name, right]"
7,[all-nighter],"[all, -, nighter]","[all, nighter]",[all-nighter],[all-nighter]


In [9]:
# A bare trailing expression shows the frame as the cell's output.
# NOTE(review): this prints the same table as the preceding display(df) cell.
df

Unnamed: 0,word_tokenize,WordPunctTokenize,RegrexTokenizer for matching,RegrexTokenizer for white space,TweetTokenizer
0,"[https, :, //t.co/9z2J3P33Uc]","[https, ://, t, ., co, /, 9z2J3P33Uc]","[https, t, co, 9z2J3P33Uc]",[https://t.co/9z2J3P33Uc],[https://t.co/9z2J3P33Uc]
1,[laugh/cry],"[laugh, /, cry]","[laugh, cry]",[laugh/cry],"[laugh, /, cry]"
2,[😬😭😓🤢🙄😱],[😬😭😓🤢🙄😱],[],[😬😭😓🤢🙄😱],"[😬, 😭, 😓, 🤢, 🙄, 😱]"
3,"[world, 's, problems]","[world, ', s, problems]","[world's, problems]","[world's, problems]","[world's, problems]"
4,"[@, datageneral]","[@, datageneral]",[datageneral],[@datageneral],[@datageneral]
5,"[It, 's, interesting]","[It, ', s, interesting]","[It's, interesting]","[It's, interesting]","[It's, interesting]"
6,"[do, n't, spell, my, name, right]","[don, ', t, spell, my, name, right]","[don't, spell, my, name, right]","[don't, spell, my, name, right]","[don't, spell, my, name, right]"
7,[all-nighter],"[all, -, nighter]","[all, nighter]",[all-nighter],[all-nighter]
