This notebook outlines several methods for tokenizing text into words (and sentences), including:

* whitespace
* nltk (Penn Treebank tokenizer)
* nltk (Twitter-aware)
* spaCy
* custom regular expressions

highlighting differences between them.

In [1]:
import nltk, re, json
import spacy
from collections import Counter

In [2]:
# spaCy lemmatization needs tagger but disable the rest
nlp = spacy.load('en', disable=['tagger,ner,parser'])
nlp.remove_pipe('tagger')
nlp.remove_pipe('ner')
nlp.remove_pipe('parser');

In [3]:
def read_tweets_from_json(filename):
    tweets=[]
    with open(filename, encoding="utf-8") as file:
        data=json.load(file)
        for tweet in data:
            tweets.append(tweet["text"])
    return tweets        

trump_tweets.json comes from the Trump Twitter collection here (downloaded 1/19/19)
http://www.trumptwitterarchive.com/archive

In [4]:
filename="../data/trump_tweets.json"

In [5]:
tweets=read_tweets_from_json(filename)

In [6]:
tweets[:5]

['Mexico is doing NOTHING to stop the Caravan which is now fully formed and heading to the United States. We stopped the last two - many are still in Mexico but can’t get through our Wall, but it takes a lot of Border Agents if there is no Wall. Not easy!',
 'Many people are saying that the Mainstream Media will have a very hard time restoring credibility because of the way they have treated me over the past 3 years (including the election lead-up), as highlighted by the disgraceful Buzzfeed story &amp; the even more disgraceful coverage!',
 'The Economy is one of the best in our history, with unemployment at a 50 year low, and the Stock Market ready to again break a record (set by us many times) - &amp; all you heard yesterday, based on a phony story, was Impeachment. You want to see a Stock Market Crash, Impeach Trump!',
 '.@newtgingrich just stated that there has been no president since Abraham Lincoln who has been treated worse or more unfairly by the media than your favorite Presi

In [7]:
whitespace_tokens=[]
for tweet in tweets:
    whitespace_tokens.append(tweet.split())

In [11]:
whitespace_tokens[:5] #punctuations attached to the words (including can't)
                     #except for - because it is space wrapped 

[['Mexico',
  'is',
  'doing',
  'NOTHING',
  'to',
  'stop',
  'the',
  'Caravan',
  'which',
  'is',
  'now',
  'fully',
  'formed',
  'and',
  'heading',
  'to',
  'the',
  'United',
  'States.',
  'We',
  'stopped',
  'the',
  'last',
  'two',
  '-',
  'many',
  'are',
  'still',
  'in',
  'Mexico',
  'but',
  'can’t',
  'get',
  'through',
  'our',
  'Wall,',
  'but',
  'it',
  'takes',
  'a',
  'lot',
  'of',
  'Border',
  'Agents',
  'if',
  'there',
  'is',
  'no',
  'Wall.',
  'Not',
  'easy!'],
 ['Many',
  'people',
  'are',
  'saying',
  'that',
  'the',
  'Mainstream',
  'Media',
  'will',
  'have',
  'a',
  'very',
  'hard',
  'time',
  'restoring',
  'credibility',
  'because',
  'of',
  'the',
  'way',
  'they',
  'have',
  'treated',
  'me',
  'over',
  'the',
  'past',
  '3',
  'years',
  '(including',
  'the',
  'election',
  'lead-up),',
  'as',
  'highlighted',
  'by',
  'the',
  'disgraceful',
  'Buzzfeed',
  'story',
  '&amp;',
  'the',
  'even',
  'more',
  'disg

In [8]:
nltk_tokens=[]
for tweet in tweets:
    nltk_tokens.append(nltk.word_tokenize(tweet, language="english"))

In [10]:
nltk_tokens[:3] ## each puncutaion is considered a word

[['Mexico',
  'is',
  'doing',
  'NOTHING',
  'to',
  'stop',
  'the',
  'Caravan',
  'which',
  'is',
  'now',
  'fully',
  'formed',
  'and',
  'heading',
  'to',
  'the',
  'United',
  'States',
  '.',
  'We',
  'stopped',
  'the',
  'last',
  'two',
  '-',
  'many',
  'are',
  'still',
  'in',
  'Mexico',
  'but',
  'can',
  '’',
  't',
  'get',
  'through',
  'our',
  'Wall',
  ',',
  'but',
  'it',
  'takes',
  'a',
  'lot',
  'of',
  'Border',
  'Agents',
  'if',
  'there',
  'is',
  'no',
  'Wall',
  '.',
  'Not',
  'easy',
  '!'],
 ['Many',
  'people',
  'are',
  'saying',
  'that',
  'the',
  'Mainstream',
  'Media',
  'will',
  'have',
  'a',
  'very',
  'hard',
  'time',
  'restoring',
  'credibility',
  'because',
  'of',
  'the',
  'way',
  'they',
  'have',
  'treated',
  'me',
  'over',
  'the',
  'past',
  '3',
  'years',
  '(',
  'including',
  'the',
  'election',
  'lead-up',
  ')',
  ',',
  'as',
  'highlighted',
  'by',
  'the',
  'disgraceful',
  'Buzzfeed',
  's

In [9]:
nltk_casual_tokens=[]
for tweet in tweets:
    nltk_casual_tokens.append(nltk.casual_tokenize(tweet))

In [13]:
nltk_casual_tokens[:3] ## TODO: difference ???

[['Mexico',
  'is',
  'doing',
  'NOTHING',
  'to',
  'stop',
  'the',
  'Caravan',
  'which',
  'is',
  'now',
  'fully',
  'formed',
  'and',
  'heading',
  'to',
  'the',
  'United',
  'States',
  '.',
  'We',
  'stopped',
  'the',
  'last',
  'two',
  '-',
  'many',
  'are',
  'still',
  'in',
  'Mexico',
  'but',
  'can',
  '’',
  't',
  'get',
  'through',
  'our',
  'Wall',
  ',',
  'but',
  'it',
  'takes',
  'a',
  'lot',
  'of',
  'Border',
  'Agents',
  'if',
  'there',
  'is',
  'no',
  'Wall',
  '.',
  'Not',
  'easy',
  '!'],
 ['Many',
  'people',
  'are',
  'saying',
  'that',
  'the',
  'Mainstream',
  'Media',
  'will',
  'have',
  'a',
  'very',
  'hard',
  'time',
  'restoring',
  'credibility',
  'because',
  'of',
  'the',
  'way',
  'they',
  'have',
  'treated',
  'me',
  'over',
  'the',
  'past',
  '3',
  'years',
  '(',
  'including',
  'the',
  'election',
  'lead-up',
  ')',
  ',',
  'as',
  'highlighted',
  'by',
  'the',
  'disgraceful',
  'Buzzfeed',
  's

In [10]:
spacy_tokens=[]
for tweet in tweets:
    spacy_tokens.append([token.text for token in nlp(tweet)])

In [15]:
spacy_tokens[:2] #ca n't ???

[['Mexico',
  'is',
  'doing',
  'NOTHING',
  'to',
  'stop',
  'the',
  'Caravan',
  'which',
  'is',
  'now',
  'fully',
  'formed',
  'and',
  'heading',
  'to',
  'the',
  'United',
  'States',
  '.',
  'We',
  'stopped',
  'the',
  'last',
  'two',
  '-',
  'many',
  'are',
  'still',
  'in',
  'Mexico',
  'but',
  'ca',
  'n’t',
  'get',
  'through',
  'our',
  'Wall',
  ',',
  'but',
  'it',
  'takes',
  'a',
  'lot',
  'of',
  'Border',
  'Agents',
  'if',
  'there',
  'is',
  'no',
  'Wall',
  '.',
  'Not',
  'easy',
  '!'],
 ['Many',
  'people',
  'are',
  'saying',
  'that',
  'the',
  'Mainstream',
  'Media',
  'will',
  'have',
  'a',
  'very',
  'hard',
  'time',
  'restoring',
  'credibility',
  'because',
  'of',
  'the',
  'way',
  'they',
  'have',
  'treated',
  'me',
  'over',
  'the',
  'past',
  '3',
  'years',
  '(',
  'including',
  'the',
  'election',
  'lead',
  '-',
  'up',
  ')',
  ',',
  'as',
  'highlighted',
  'by',
  'the',
  'disgraceful',
  'Buzzfeed'

In [11]:
# Shorter version of http://sentiment.christopherpotts.net/code-data/happyfuntokenizing.py

# The order here is important (match from first to last)

# Keep usernames together (any token starting with @, followed by A-Z, a-z, 0-9)
regexes=(r"(?:@[\w_]+)",

# Keep hashtags together (any token starting with #, followed by A-Z, a-z, 0-9, _, or -)
r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)",

# Keep words with apostrophes, hyphens and underscores together
r"(?:[a-z][a-z’'\-_]+[a-z])",

# Keep all other sequences of A-Z, a-z, 0-9, _ together
r"(?:[\w_]+)",

# Everything else that's not whitespace
r"(?:\S)"
)

big_regex="|".join(regexes)

my_extensible_tokenizer = re.compile(big_regex, re.VERBOSE | re.I | re.UNICODE)

def my_extensible_tokenize(text):
    return my_extensible_tokenizer.findall(text)

In [12]:
extensible_tokens=[]
for tweet in tweets:
    extensible_tokens.append(my_extensible_tokenize(tweet))

In [18]:
extensible_tokens[:2]

[['Mexico',
  'is',
  'doing',
  'NOTHING',
  'to',
  'stop',
  'the',
  'Caravan',
  'which',
  'is',
  'now',
  'fully',
  'formed',
  'and',
  'heading',
  'to',
  'the',
  'United',
  'States',
  '.',
  'We',
  'stopped',
  'the',
  'last',
  'two',
  '-',
  'many',
  'are',
  'still',
  'in',
  'Mexico',
  'but',
  'can’t',
  'get',
  'through',
  'our',
  'Wall',
  ',',
  'but',
  'it',
  'takes',
  'a',
  'lot',
  'of',
  'Border',
  'Agents',
  'if',
  'there',
  'is',
  'no',
  'Wall',
  '.',
  'Not',
  'easy',
  '!'],
 ['Many',
  'people',
  'are',
  'saying',
  'that',
  'the',
  'Mainstream',
  'Media',
  'will',
  'have',
  'a',
  'very',
  'hard',
  'time',
  'restoring',
  'credibility',
  'because',
  'of',
  'the',
  'way',
  'they',
  'have',
  'treated',
  'me',
  'over',
  'the',
  'past',
  '3',
  'years',
  '(',
  'including',
  'the',
  'election',
  'lead-up',
  ')',
  ',',
  'as',
  'highlighted',
  'by',
  'the',
  'disgraceful',
  'Buzzfeed',
  'story',
  '&'

Q1: Write a function to print out the first 5 tokenized tweets in each of the five tokenizers above. Examine those tweets; how would you characterize the differences?



In [25]:
def all_tokenized_tweets(corpus, first_k= 5):
    ##
    whitespace_tk = []
    nltk_tk = []
    nltk_casual_tk = []
    spacy_tk = []
    extensible_tk = []
    for tweet in corpus[:first_k]:
        whitespace_tk.append(tweet.split())
        nltk_tk.append(nltk.word_tokenize(tweet, language="english"))
        nltk_casual_tk.append(nltk.casual_tokenize(tweet))
        spacy_tk.append([token.text for token in nlp(tweet)])
        extensible_tk.append(my_extensible_tokenize(tweet))
    print(whitespace_tk)
    print(nltk_tk)
    print(nltk_casual_tk)
    print(spacy_tk)
    print(extensible_tk)
    return

In [27]:
all_tokenized_tweets(tweets)

[['Mexico', 'is', 'doing', 'NOTHING', 'to', 'stop', 'the', 'Caravan', 'which', 'is', 'now', 'fully', 'formed', 'and', 'heading', 'to', 'the', 'United', 'States.', 'We', 'stopped', 'the', 'last', 'two', '-', 'many', 'are', 'still', 'in', 'Mexico', 'but', 'can’t', 'get', 'through', 'our', 'Wall,', 'but', 'it', 'takes', 'a', 'lot', 'of', 'Border', 'Agents', 'if', 'there', 'is', 'no', 'Wall.', 'Not', 'easy!'], ['Many', 'people', 'are', 'saying', 'that', 'the', 'Mainstream', 'Media', 'will', 'have', 'a', 'very', 'hard', 'time', 'restoring', 'credibility', 'because', 'of', 'the', 'way', 'they', 'have', 'treated', 'me', 'over', 'the', 'past', '3', 'years', '(including', 'the', 'election', 'lead-up),', 'as', 'highlighted', 'by', 'the', 'disgraceful', 'Buzzfeed', 'story', '&amp;', 'the', 'even', 'more', 'disgraceful', 'coverage!'], ['The', 'Economy', 'is', 'one', 'of', 'the', 'best', 'in', 'our', 'history,', 'with', 'unemployment', 'at', 'a', '50', 'year', 'low,', 'and', 'the', 'Stock', 'Market

Q2: Write a function `compare(tokenization_one, tokenization_two)` that compares two tokenizations of the same text and finds the 20 most frequent tokens that don't appear in the other.



In [13]:
from collections import Counter 
def compare(tokenization_one, tokenization_two):
    flat_ls_one = [token for sentence in tokenization_one for token in sentence]
    flat_ls_two = [token for sentence in tokenization_two for token in sentence]
    word_freq_one = Counter(flat_ls_one)
    word_freq_two = Counter(flat_ls_two)
    diff_dict = {}
    diff_words = set(word_freq_one.keys()).difference(set(word_freq_two.keys()))
    
    for w in diff_words:
        if w in word_freq_one:
            diff_dict[w] = word_freq_one[w]
        else:
            diff_dict[w] = word_freq_two[w]
    
    ## Citation https://stackoverflow.com/questions/613183/how-do-i-sort-a-dictionary-by-value
    sort_ls = sorted(diff_dict.items(), key=lambda kv: kv[1], reverse = True) 
    return [tp[0] for tp in sort_ls[:20]]

In [15]:
compare(nltk_tokens, extensible_tokens)

['``',
 "''",
 'realDonaldTrump',
 "'s",
 "n't",
 '...',
 '--',
 'Trump2016',
 'U.S.',
 "'m",
 "'re",
 'CelebApprentice',
 'ApprenticeNBC',
 'Mr.',
 'MittRomney',
 "'ll",
 "'ve",
 'IvankaTrump',
 'w/',
 "'d"]

In [77]:
##Just checking the phrase is in which tokenizer
print(True in list(map(lambda x: "you're"  in x, nltk_casual_tokens)))
print(True in list(map(lambda x: "🇸"  in x, nltk_casual_tokens)))
print(True in list(map(lambda x: "#CelebApprentice"  in x, nltk_casual_tokens)))
print(True in list(map(lambda x: "IvankaTrump"  in x, whitespace_tokens)))



True
True
True
False


Q3: Use one of the NLTK tokenizers; write code to determine how many sentences are in this dataset, and what the average number of words per sentence is.



In [27]:
SEN_END = ['.', '!', '?']
NOT_PUNCTUATION = r'[a-zA-Z0-9]+' 
num_sentence = 0
num_words = 0
for tweet in tweets:
    tokens = nltk.word_tokenize(tweet, language="english")
    for word in tokens:
        if word in SEN_END:
            num_sentence += 1
        elif re.search(NOT_PUNCTUATION, word) != None:
            num_words += 1

ave_word_per_sentence = num_words/num_sentence
print("There are ", num_sentence, " sentences in the dataset.")
print("On average, each sentence has ", ave_word_per_sentence, " words.")

There are  57646  sentences in the dataset.
On average, each sentence has  11.844117545016132  words.


In [34]:
re.search(NOT_PUNCTUATION, ";") == None

True

Q4 (check-plus): modify the extensible tokenizer above to keep urls together (e.g., www.google.com or http://www.google.com)

In [35]:
# Keep usernames together (any token starting with @, followed by A-Z, a-z, 0-9)
regexes=(r"(?:@[\w_]+)",

# Keep hashtags together (any token starting with #, followed by A-Z, a-z, 0-9, _, or -)
r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)",

# Keep urls together
# FILL IN HERE
r"(?:http://?^www\.+\.com$)",
         
# Keep words with apostrophes, hyphens and underscores together
r"(?:[a-z][a-z’'\-_]+[a-z])",

# Keep all other sequences of A-Z, a-z, 0-9, _ together
r"(?:[\w_]+)",

# Everything else that's not whitespace
r"(?:\S)"
)

big_regex="|".join(regexes)

my_url_extensible_tokenizer = re.compile(big_regex, re.VERBOSE | re.I | re.UNICODE)

def my_extensible_tokenize_with_urls(text):
    return my_url_extensible_tokenizer.findall(text)

In [41]:
web = r"(?:^http://?www\.+\.com$)"

In [42]:
re.search(web, "The course website is http://www.people.ischool.berkeley.edu/~dbamman/info256.html")

In [36]:
print ('\n'.join(my_extensible_tokenize_with_urls("The course website is http://people.ischool.berkeley.edu/~dbamman/info256.html")))


The
course
website
is
http
:
/
/
people
.
ischool
.
berkeley
.
edu
/
~
dbamman
/
info
256
.
html
