In [3]:
import nltk

In [4]:
# get the book Emma from the Gutenberg collection and keep as raw text
file0 = nltk.corpus.gutenberg.fileids( ) [0]
emmatext = nltk.corpus.gutenberg.raw(file0)
print(type(emmatext))
print(len(emmatext))

<class 'str'>
887071


In [5]:
# print the first 20 characters in the str emmatext as one string
print(emmatext[:50])
emmatext[:50]

[Emma by Jane Austen 1816]

VOLUME I

CHAPTER I





'[Emma by Jane Austen 1816]\n\nVOLUME I\n\nCHAPTER I\n\n\n'

In [6]:
# print the first 20 characters in emmatext by iterating over the characters
for c in emmatext[:20]:
  print(c)

[
E
m
m
a
 
b
y
 
J
a
n
e
 
A
u
s
t
e
n


In [7]:
## Review of strings and string operation +
string1 = 'Monty Python'
string2 = 'Holy Grail'
print(string1 + string2)
print(string1 + ' and the ' + string2)

Monty PythonHoly Grail
Monty Python and the Holy Grail


In [8]:
# replace end-of-line character with a space
# check table 3.2 in NLTK book for other string functions
newemmatext = emmatext.replace('\n', ' ')
newemmatext[:150]

'[Emma by Jane Austen 1816]  VOLUME I  CHAPTER I   Emma Woodhouse, handsome, clever, and rich, with a comfortable home and happy disposition, seemed to'

In [9]:
### Development of regular expressions for tokenizing text
import re

In [10]:
# pattern to match words, i.e. anything with a sequence of word characters, ignores special chars
shorttext = 'That book is interesting.'
pword = re.compile('\w+')
print(re.findall(pword, shorttext))

['That', 'book', 'is', 'interesting']


In [11]:
specialtext = 'That U.S.A. poster-print costs $12.40, but with 10% off.'
print(re.findall(pword, specialtext))

['That', 'U', 'S', 'A', 'poster', 'print', 'costs', '12', '40', 'but', 'with', '10', 'off']


In [12]:
# pattern to match words with internal hyphens
ptoken = re.compile('(\w+(-\w+)*)')
print(re.findall(ptoken, specialtext))
print(re.findall(ptoken, 'end-of-line character'))

[('That', ''), ('U', ''), ('S', ''), ('A', ''), ('poster-print', '-print'), ('costs', ''), ('12', ''), ('40', ''), ('but', ''), ('with', ''), ('10', ''), ('off', '')]
[('end-of-line', '-line'), ('character', '')]


In [13]:
# ignore the group of the inner parentheses 
ptoken = re.compile('(\w+(?:-\w+)*)')
print(re.findall(ptoken, specialtext))
print(re.findall(ptoken, 'end-of-line character'))

['That', 'U', 'S', 'A', 'poster-print', 'costs', '12', '40', 'but', 'with', '10', 'off']
['end-of-line', 'character']


In [14]:
# abbreviations like U.S.A.
pabbrev = re.compile('((?:[A-Z]\.)+)')
print(re.findall(pabbrev, specialtext))

['U.S.A.']


In [15]:
# combine this pattern with the words to make more general tokens
ptoken = re.compile('(\w+(?:-\w+)*|(?:[A-Z]\.)+)')
print(re.findall(ptoken, specialtext))

['That', 'U', 'S', 'A', 'poster-print', 'costs', '12', '40', 'but', 'with', '10', 'off']


In [17]:
# switch the order of the patterns to first match abbreviations and then other words
ptoken = re.compile('((?:[A-Z]\.)+|\w+(?:-\w+)*)')
print(re.findall(ptoken, specialtext))

['That', 'U.S.A.', 'poster-print', 'costs', '12', '40', 'but', 'with', '10', 'off']


In [14]:
# add expression for currency
ptoken = re.compile('((?:[A-Z]\.)+|\w+(?:-\w+)*|\$?\d+(?:\.\d+)?)')
print(re.findall(ptoken, specialtext))

['That', 'U.S.A.', 'poster-print', 'costs', '$12.40', 'but', 'with', '10', 'off']


In [21]:
# this is an equivalent regular expression except that it has extra parentheses
# the python string triple quote allows multi-line strings with end of line comments
ptoken = re.compile(r'''((?:[A-Z]\.)+) # abbreviations, e.g. U.S.A.
   | (\w+(?:-\w+)*) # words with internal hyphens
   | (\$?\d+(?:\.\d+)?) # currency, like $12.40
   ''', re.X) # verbose flag
print(re.findall(ptoken, specialtext))

[('', 'That', ''), ('U.S.A.', '', ''), ('', 'poster-print', ''), ('', 'costs', ''), ('', '', '$12.40'), ('', 'but', ''), ('', 'with', ''), ('', '10', ''), ('', 'off', '')]


In [22]:
### using NLTK's regular expression tokenizer
# first define a multi-line string that is a regular expression
pattern = r''' (?x) 	# set flag to allow verbose regexps
        (?:[A-Z]\.)+    # abbreviations, e.g. U.S.A.
        | \$?\d+(?:\.\d+)?%?    # currency and percentages, $12.40, 50%
        | \w+(?:-\w+)*  # words with internal hyphens
        | \.\.\.        # ellipsis
        | [][.,;”’?():-_%#’]    # separate tokens
        '''

In [24]:
# the nltk regular expression tokenizer compiles the re pattern, applies it to the text
#  and uses the matching groups to return a list of only the matched tokens
print(nltk.regexp_tokenize(shorttext, pattern))
print(nltk.regexp_tokenize(specialtext, pattern))

['That', 'book', 'is', 'interesting', '.']
['That', 'U.S.A.', 'poster-print', 'costs', '$12.40', ',', 'but', 'with', '10%', 'off', '.']


In [89]:
# compare with built-in word tokenizer
print(nltk.word_tokenize(specialtext))

['That', 'U.S.A.', 'poster-print', 'costs', '$', '12.40', ',', 'but', 'with', '10', '%', 'off', '.']


In [27]:
# Tokenizer for Twitter derived tweetmotif from the ARK, developed at CMU
tweetPattern = r''' (?x)	# set flag to allow verbose regexps
      (?:https?://|www)\S+      # simple URLs
      | (?::-\)|;-\))		# small list of emoticons
      | &(?:amp|lt|gt|quot);    # XML or HTML entity
      | \#\w+                 # hashtags
      | @\w+                  # mentions   
      | \d+:\d+               # timelike pattern
      | \d+\.\d+              # number with a decimal
      | (?:\d+,)+?\d{3}(?=(?:[^,]|$))   # number with a comma
      | (?:[A-Z]\.)+                    # simple abbreviations
      | (?:--+)               # multiple dashes
      | \w+(?:-\w+)*          # words with internal hyphens or apostrophes
      | ['\".?!,:;/]+         # special characters
      '''

In [28]:
# example tweets
tweet1 = "@natalieohayre I agree #hc09 needs reform- but not by crooked politicians who r clueless about healthcare! #tcot #fishy NO GOV'T TAKEOVER!"
tweet2 = "To Sen. Roland Burris: Affordable, quality health insurance can't wait http://bit.ly/j63je #hc09 #IL #60660"
tweet3 = "RT @karoli: RT @Seriou: .@whitehouse I will stand w/ Obama on #healthcare,  I trust him. #p2 #tlot"

In [29]:
print(nltk.regexp_tokenize(tweet1,tweetPattern))
print(nltk.regexp_tokenize(tweet2,tweetPattern))
print(nltk.regexp_tokenize(tweet3,tweetPattern))

['@natalieohayre', 'I', 'agree', '#hc09', 'needs', 'reform', 'but', 'not', 'by', 'crooked', 'politicians', 'who', 'r', 'clueless', 'about', 'healthcare', '!', '#tcot', '#fishy', 'NO', 'GOV', "'", 'T', 'TAKEOVER', '!']
['To', 'Sen', '.', 'Roland', 'Burris', ':', 'Affordable', ',', 'quality', 'health', 'insurance', 'can', "'", 't', 'wait', 'http://bit.ly/j63je', '#hc09', '#IL', '#60660']
['RT', '@karoli', ':', 'RT', '@Seriou', ':', '.', '@whitehouse', 'I', 'will', 'stand', 'w', '/', 'Obama', 'on', '#healthcare', ',', 'I', 'trust', 'him', '.', '#p2', '#tlot']


In [31]:
# NLTK built-in tokenizer (more detailed version from TweetMotif)
from nltk.tokenize import TweetTokenizer
ttokenizer = TweetTokenizer()
print(ttokenizer.tokenize(tweet1))

['@natalieohayre', 'I', 'agree', '#hc09', 'needs', 'reform', '-', 'but', 'not', 'by', 'crooked', 'politicians', 'who', 'r', 'clueless', 'about', 'healthcare', '!', '#tcot', '#fishy', 'NO', "GOV'T", 'TAKEOVER', '!']


In [88]:
sent = "Mr. Black and Mrs. Brown attended the lecture by Dr. Gray, but Gov. White wasn't there."
print(nltk.regexp_tokenize(sent, pattern))

['Mr.', 'Black ', 'and ', 'Mrs.', 'Brown ', 'attended ', 'the ', 'lecture ', 'by ', 'Dr.', 'Gray,', 'but ', 'Gov.', 'White ', "wasn'", 't ', 'there.']


## Lab Exercise
Choose one of the following, i.e. work with either the regular pattern or the tweet pattern in the
tokenizer.
1. Run the regexp tokenizer with the regular pattern on the sentence “Mr. Black and Mrs. Brown attended the lecture by Dr. Gray, but Gov. White wasn’t there.”
b. Design and add the pattern of this tokenizer so that words with a single
apostrophe, such as “wasn’t” are taken as a single token.
OR
2. Run the regexp tokenizer with the tweet pattern on the three example tweets.
a. Design and add a line to the pattern of this tokenizer so that titles like “Sen.” and
“Rep.” are tokenized as having the dot inside the token. Test and add some other
titles to your list of titles.
b. Design and add to the pattern of this tokenizer so that words with a single
apostrophe, such as “can’t” are taken as a single token.
c. Design and add to the pattern of this tokenizer so that the abbreviation “w/” is
taken as a single token.
Choose at least one of your tokenizer solutions and post your revised pattern to the Assignment
in Blackboard for Week 4, with a short example text that demonstrates its effect. Mention any
examples that you think of that need additional regular expressions to be tokenized.


In [102]:
#Run the regexp tokenizer with the regular pattern on 
#the sentence “Mr. Black and Mrs. Brown attended the lecture 
#by Dr. Gray, but Gov. White wasn’t there.”

tmp = "Mr. Black and Mrs. Brown attended the lecture by Dr. Gray, but Gov. White wasn’t there."

'''1. Design and add a line to the pattern of this tokenizer so that titles like “Mr.” 
are tokenized as having the dot inside the token. Test and add some other titles to your list of titles.
'''
pattern = r''' (?x) 	# set flag to allow verbose regexps
         \w+. #to allow  .
        | (?:[A-Z]\.)+    # abbreviations, e.g. U.S.A.
        | \$?\d+(?:\.\d+)?%?    # currency and percentages, $12.40, 50%
        | \w+(?:-\w+)*  # words with internal hyphens
        | \.\.\.        # ellipsis
        | [][.,;”’?():-_%#’]    # separate tokens
        '''
#print(nltk.regexp_tokenize(tmp, pattern))

'''b. Design and add the pattern of this tokenizer so that words with a single
apostrophe, such as “wasn't” are taken as a single token.
'''

pattern2 = r''' (?x) 	# set flag to allow verbose regexps
        (?:[A-Z]\.)+    # abbreviations, e.g. U.S.A.
        | \$?\d+(?:\.\d+)?%?    # currency and percentages, $12.40, 50%
        | \w+. #to allow
        | \w+(?:’\w+)*  # words with internal apostrophe
        | \w+(?:-\w+)*  # words with internal hyphens
        | \.\.\.        # ellipsis
        | [][.,;”’?():-_%#’]    # separate tokens

        '''
                              
print(nltk.regexp_tokenize(tmp, pattern2))





['Mr.', 'Black ', 'and ', 'Mrs.', 'Brown ', 'attended ', 'the ', 'lecture ', 'by ', 'Dr.', 'Gray,', 'but ', 'Gov.', 'White ', 'wasn’', 't ', 'there.']
