In [1]:
from nltk.book import *
from nltk import FreqDist

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


In [2]:
text1

<Text: Moby Dick by Herman Melville 1851>

In [3]:
sent1 
# a list of tokens of the first sentence of each text.

['Call', 'me', 'Ishmael', '.']

In [4]:
# Searching Text ------------------------------------

In [5]:
text1.concordance("monstrous")
# search for any word that you give to the function and show you the occurrences and some surrounding context.

Displaying 11 of 11 matches:
ong the former , one was of a most monstrous size . ... This came towards us , 
ON OF THE PSALMS . " Touching that monstrous bulk of the whale or ork we have r
ll over with a heathenish array of monstrous clubs and spears . Some were thick
d as you gazed , and wondered what monstrous cannibal and savage could ever hav
that has survived the flood ; most monstrous and most mountainous ! That Himmal
they might scout at Moby Dick as a monstrous fable , or still worse and more de
th of Radney .'" CHAPTER 55 Of the Monstrous Pictures of Whales . I shall ere l
ing Scenes . In connexion with the monstrous pictures of whales , I am strongly
ere to enter upon those still more monstrous stories of them which are to be fo
ght have been rummaged out of this monstrous cabinet there is no telling . But 
of Whale - Bones ; for Whales of a monstrous size are oftentimes cast up dead u


In [6]:
text1.similar("monstrous")
# finds all the words that are used in the same context as the one given, where the context is the word before and the word after.

true contemptible christian abundant few part mean careful puzzled
mystifying passing curious loving wise doleful gamesome singular
delightfully perilous fearless


In [7]:
# Counting Vocabulary ----------------------------

In [8]:
len(text3)

44764

In [9]:
sorted(set(text3))
# “set” : removes the repetitions
# “sorted” function sort

['!',
 "'",
 '(',
 ')',
 ',',
 ',)',
 '.',
 '.)',
 ':',
 ';',
 ';)',
 '?',
 '?)',
 'A',
 'Abel',
 'Abelmizraim',
 'Abidah',
 'Abide',
 'Abimael',
 'Abimelech',
 'Abr',
 'Abrah',
 'Abraham',
 'Abram',
 'Accad',
 'Achbor',
 'Adah',
 'Adam',
 'Adbeel',
 'Admah',
 'Adullamite',
 'After',
 'Aholibamah',
 'Ahuzzath',
 'Ajah',
 'Akan',
 'All',
 'Allonbachuth',
 'Almighty',
 'Almodad',
 'Also',
 'Alvah',
 'Alvan',
 'Am',
 'Amal',
 'Amalek',
 'Amalekites',
 'Ammon',
 'Amorite',
 'Amorites',
 'Amraphel',
 'An',
 'Anah',
 'Anamim',
 'And',
 'Aner',
 'Angel',
 'Appoint',
 'Aram',
 'Aran',
 'Ararat',
 'Arbah',
 'Ard',
 'Are',
 'Areli',
 'Arioch',
 'Arise',
 'Arkite',
 'Arodi',
 'Arphaxad',
 'Art',
 'Arvadite',
 'As',
 'Asenath',
 'Ashbel',
 'Asher',
 'Ashkenaz',
 'Ashteroth',
 'Ask',
 'Asshur',
 'Asshurim',
 'Assyr',
 'Assyria',
 'At',
 'Atad',
 'Avith',
 'Baalhanan',
 'Babel',
 'Bashemath',
 'Be',
 'Because',
 'Becher',
 'Bedad',
 'Beeri',
 'Beerlahairoi',
 'Beersheba',
 'Behold',
 'Bela',
 'Belah

In [10]:
# percentage of the total number of tokens to the number of unique tokens
len(text3) / len(set(text3))

16.050197203298673

In [11]:
# occurrences of particular words
text3.count("smote")

5

In [12]:
# percentage of the number of occurrences of the word compared with the total number of words
100 * text3.count('smote') / len(text3)

0.01116968992940756

In [13]:
# Processing Text -------------------------------

In [14]:
import nltk
nltk.corpus.gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [15]:
# save the first fileid (number 0 in the list) into a variable
file1 = nltk.corpus.gutenberg.fileids( ) [0]
file1

'austen-emma.txt'

In [16]:
# get the original text
emmatext = nltk.corpus.gutenberg.raw(file1)
len(emmatext)

887071

In [17]:
emmatext[:120] # the first 120 characters

'[Emma by Jane Austen 1816]\n\nVOLUME I\n\nCHAPTER I\n\n\nEmma Woodhouse, handsome, clever, and rich, with a comfortable home\nan'

In [18]:
# break the raw text into tokens
# wordpunct_tokenize separates by white space and by special characters (punctuation)

emmatokens = nltk.wordpunct_tokenize(emmatext)
len(emmatokens)

192427

In [19]:
emmatokens[:10]

['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']', 'VOLUME', 'I', 'CHAPTER']

In [20]:
# convert to lower case
emmawords = [w.lower( ) for w in emmatokens]
emmawords[:10]

['[', 'emma', 'by', 'jane', 'austen', '1816', ']', 'volume', 'i', 'chapter']

In [21]:
print("Length of emma words = " + str(len(emmawords)))

# getting the unique words and sorting them
emmavocab = sorted(set(emmawords))
print("Length of emmavocab = " + str(len(emmavocab)))

Length of emma words = 192427
Length of emmavocab = 7344


In [22]:
# Accessing text corpora

In [23]:
# loads the Brown Corpus into the system memory
from nltk.corpus import brown
print ('Total Categories:', str(len(brown.categories())))

Total Categories: 15


In [24]:
# shows the various available categories
print(brown.categories())

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']


In [25]:
# get the content of the category -> 'mystery'
content = brown.sents(categories='mystery')
content

[['There', 'were', 'thirty-eight', 'patients', 'on', 'the', 'bus', 'the', 'morning', 'I', 'left', 'for', 'Hanover', ',', 'most', 'of', 'them', 'disturbed', 'and', 'hallucinating', '.'], ['An', 'interne', ',', 'a', 'nurse', 'and', 'two', 'attendants', 'were', 'in', 'charge', 'of', 'us', '.'], ...]

In [26]:
# get the POS tags
tagged_sentences = brown.tagged_sents(categories='mystery')
tagged_sentences

[[('There', 'EX'), ('were', 'BED'), ('thirty-eight', 'CD'), ('patients', 'NNS'), ('on', 'IN'), ('the', 'AT'), ('bus', 'NN'), ('the', 'AT'), ('morning', 'NN'), ('I', 'PPSS'), ('left', 'VBD'), ('for', 'IN'), ('Hanover', 'NP'), (',', ','), ('most', 'AP'), ('of', 'IN'), ('them', 'PPO'), ('disturbed', 'VBN'), ('and', 'CC'), ('hallucinating', 'VBG'), ('.', '.')], [('An', 'AT'), ('interne', 'NN'), (',', ','), ('a', 'AT'), ('nurse', 'NN'), ('and', 'CC'), ('two', 'CD'), ('attendants', 'NNS'), ('were', 'BED'), ('in', 'IN'), ('charge', 'NN'), ('of', 'IN'), ('us', 'PPO'), ('.', '.')], ...]

In [27]:
# get sentences in natural form
sentences = [' '.join(sentence_token) for sentence_token in content]
print(sentences[0:5])

['There were thirty-eight patients on the bus the morning I left for Hanover , most of them disturbed and hallucinating .', 'An interne , a nurse and two attendants were in charge of us .', "I felt lonely and depressed as I stared out the bus window at Chicago's grim , dirty West Side .", 'It seemed incredible , as I listened to the monotonous drone of voices and smelled the fetid odors coming from the patients , that technically I was a ward of the state of Illinois , going to a hospital for the mentally ill .', 'I suddenly thought of Mary Jane Brennan , the way her pretty eyes could flash with anger , her quiet competence , the gentleness and sweetness that lay just beneath the surface of her defenses .']


In [28]:
# get the top nouns in the mystery genre
# nouns have either NN or NP in POS tag to indicate the various forms.
tagged_words = brown.tagged_words(categories='mystery')
print('first 5 tagged words: ')
tagged_words[0:5]

first 5 tagged words: 


[('There', 'EX'),
 ('were', 'BED'),
 ('thirty-eight', 'CD'),
 ('patients', 'NNS'),
 ('on', 'IN')]

In [29]:
nouns = [(word, tag) for word, tag in tagged_words if any(noun_tag in tag for noun_tag in ['NP', 'NN'])]
print('first 5 nouns: ')
nouns[0:5]

first 5 nouns: 


[('patients', 'NNS'),
 ('bus', 'NN'),
 ('morning', 'NN'),
 ('Hanover', 'NP'),
 ('interne', 'NN')]

In [30]:
# The Reuters Corpus categories are grouped into train and test sets.
from nltk.corpus import reuters
print(reuters.fileids(categories=['housing', 'income']))

['test/16118', 'test/18534', 'test/18540', 'test/18664', 'test/18665', 'test/18672', 'test/18911', 'test/19875', 'test/20106', 'test/20116', 'training/1035', 'training/1036', 'training/10602', 'training/10604', 'training/11170', 'training/11665', 'training/2618', 'training/29', 'training/3105', 'training/3708', 'training/3720', 'training/3723', 'training/3898', 'training/5883', 'training/5886', 'training/6000', 'training/6067', 'training/6197', 'training/7005', 'training/7006', 'training/7015', 'training/7036', 'training/7098', 'training/7099', 'training/9615']


In [31]:
print(reuters.sents(fileids=[u'test/16118',	u'test/18534']))

[['YUGOSLAV', 'ECONOMY', 'WORSENED', 'IN', '1986', ',', 'BANK', 'DATA', 'SHOWS', 'National', 'Bank', 'economic', 'data', 'for', '1986', 'shows', 'that', 'Yugoslavia', "'", 's', 'trade', 'deficit', 'grew', ',', 'the', 'inflation', 'rate', 'rose', ',', 'wages', 'were', 'sharply', 'higher', ',', 'the', 'money', 'supply', 'expanded', 'and', 'the', 'value', 'of', 'the', 'dinar', 'fell', '.'], ['The', 'trade', 'deficit', 'for', '1986', 'was', '2', '.', '012', 'billion', 'dlrs', ',', '25', '.', '7', 'pct', 'higher', 'than', 'in', '1985', '.'], ...]


In [32]:
# WordNet corpus has words that are semantically linked synsets
from nltk.corpus import wordnet as wn
word = 'hike' # taking hike as our word of interest

# get word synsets
word_synsets = wn.synsets(word)
print(word_synsets)

[Synset('hike.n.01'), Synset('rise.n.09'), Synset('raise.n.01'), Synset('hike.v.01'), Synset('hike.v.02')]


In [33]:
# get details for each synonym in synset
for synset in word_synsets:
    print('Synset Name:', synset.name())
    print('POS Tag:', synset.pos())
    print('Definition:', synset.definition())
    print('Examples:', synset.examples())
    print()

Synset Name: hike.n.01
POS Tag: n
Definition: a long walk usually for exercise or pleasure
Examples: ['she enjoys a hike in her spare time']

Synset Name: rise.n.09
POS Tag: n
Definition: an increase in cost
Examples: ['they asked for a 10% rise in rates']

Synset Name: raise.n.01
POS Tag: n
Definition: the amount a salary is increased
Examples: ['he got a 3% raise', 'he got a wage hike']

Synset Name: hike.v.01
POS Tag: v
Definition: increase
Examples: ['The landlord hiked up the rents']

Synset Name: hike.v.02
POS Tag: v
Definition: walk a long way, as for pleasure or physical exercise
Examples: ['We were hiking in Colorado', 'hike the Rockies']



In [34]:
# Frequency Distributions (Optional)

In [35]:
# the set of keys are all the words, and the set of values are the frequency (count) of each word

fdist = FreqDist(emmawords)
fdist.most_common(10)

[(',', 11454),
 ('.', 6928),
 ('to', 5239),
 ('the', 5201),
 ('and', 4896),
 ('of', 4291),
 ('i', 3178),
 ('a', 3129),
 ('it', 2528),
 ('her', 2469)]

In [36]:
# the frequencies of individual words
fdist['emma']

865

In [37]:
# “makeAlphaFreqDist” 
# takes a list of words as an argument
# returns a Frequency Distribution which only contains words with all alphabetical characters.
import re

def makeAlphaFreqDist(words):
    adist = FreqDist() # Make a new empty frequency distribution called adist
    pattern = re.compile('.*[^a---z].*') # match any word that contains a non-alphabetical character
    
    for word in words:
        if not pattern.match(word): # doesn’t contain any non-alphabetical characters
            adist.update([word]) # add it to the frequency distribution
    return adis

# .update -: adds the word and adds one to its count

In [38]:
len(emmawords)

192427

In [39]:
# adist = makeAlphaFreqDist(emmawords)
# common_words = adist.most_common(10)
# print(str(len(common_words)))

# for word, freq in adist.most_common(10):
#     print(word, freq)

In [40]:
# Counting words and n-grams

In [41]:
from nltk import FreqDist

file0 = nltk.corpus.gutenberg.fileids( ) [0]
emmatext = nltk.corpus.gutenberg.raw(file0) # get the text of the book Emma
emmatokens = nltk.wordpunct_tokenize(emmatext) # separates emmatext into tokens
emmawords = [w.lower( ) for w in emmatokens] # converts all the characters to lower case
shortwords = emmawords[11:111] # a list with only the first 101 words
shortwords[:10]

['emma', 'woodhouse', ',', 'handsome', ',', 'clever', ',', 'and', 'rich', ',']

In [42]:
shortdist = FreqDist(shortwords) # create a frequency distribution of the words
words_in_decreasing_freq = shortdist.keys( ) # produces the list of words in order of decreasing frequency

In [43]:
for word in list(words_in_decreasing_freq)[:10]:
    print (word, shortdist[word])

emma 1
woodhouse 1
, 8
handsome 1
clever 1
and 4
rich 1
with 2
a 3
comfortable 1


In [44]:
# More Specialized Frequency Distributions (What is a word?)

In [45]:
# WordPunct tokenization produces tokens that have special characters
# Following code remove all the tokens that have special characters and leave only tokens that consist of all alphabetic characters

pattern = re.compile('.*[^a-z].*') # matches any token that contains a non-alphabetical character
nonAlphaMatch = pattern.match('-')

print('nonAlphaMatch to -')
nonAlphaMatch

nonAlphaMatch to -


<re.Match object; span=(0, 1), match='-'>

In [46]:
for word in text1[:10]:
    if nonAlphaMatch: 
        print (word, ': matched non-alphabetical')
    else:
        print (word, ': NOT matched')

[ : matched non-alphabetical
Moby : matched non-alphabetical
Dick : matched non-alphabetical
by : matched non-alphabetical
Herman : matched non-alphabetical
Melville : matched non-alphabetical
1851 : matched non-alphabetical
] : matched non-alphabetical
ETYMOLOGY : matched non-alphabetical
. : matched non-alphabetical


In [47]:
# stop words-: list of the common words that appear with great frequenc

stopwords = ['to', 'be', 'of', 'the', 'in', 'it', 'was',
'i', 'am', 'she', 'had', 'been', 'is', 'have','could', 'not',
'her', 'he', 'do', 'and', 'would', 'such', 'a', 'his', 'must']

In [48]:
# make a frequency distribution from a list of tokens that has no tokens 
# containing non-alphabetical characters or words in the stopword list.

def alphaStopFreqDist(words, stoplist):
    asdist = FreqDist()
    pattern = re.compile('.*[^a-z].*')
    for word in words:
        if not pattern.match(word):
            if not word in stoplist:
                asdist[word.lower()] += 1
    return asdist

In [49]:
asdist = alphaStopFreqDist(shortwords, stopwords)
list(asdist.keys())[:10]

['emma',
 'woodhouse',
 'handsome',
 'clever',
 'rich',
 'with',
 'comfortable',
 'home',
 'happy',
 'disposition']

In [50]:
for key in list(asdist.keys())[:10]:
        print (key, asdist[key])

emma 1
woodhouse 1
handsome 1
clever 1
rich 1
with 2
comfortable 1
home 1
happy 1
disposition 1


In [51]:
# Bigram Frequency Distributions

# look at pairs of words that are frequently collocated
# WORDS THAT occur in a sequence called a 'bigram'.

In [52]:
def bigramDist(words, stoplist):
    biDist = FreqDist()
    uniDist = alphaStopFreqDist(words, stoplist) # restrict words to those that occur in a unigram/word frequency distribution
# without non-alphabetical characters and stop words.
    for i in range(1, len(words)):
        if words[i-1] in uniDist and words[i] in uniDist:
            biword = words[i-1] + ' ' + words[i]
            biDist[biword.lower()] += 1
    return biDist

In [53]:
# Try out bigram function on shortwords and emmawords

shortbidist = bigramDist(shortwords, stopwords)
shortbidist.keys()

dict_keys(['emma woodhouse', 'comfortable home', 'happy disposition', 'unite some', 'best blessings', 'lived nearly', 'nearly twenty', 'one years', 'world with', 'with very', 'very little', 'distress or', 'or vex', 'two daughters', 'most affectionate', 'indulgent father', 's marriage', 'house from', 'very early', 'early period', 'died too', 'too long', 'long ago', 'ago for'])

In [54]:
emmabidist = bigramDist(emmawords, stopwords)
for key in list(emmabidist.keys())[:10]:
    print (key, emmabidist[key])

emma by 1
by jane 2
jane austen 1
emma woodhouse 5
comfortable home 2
happy disposition 1
unite some 1
best blessings 2
lived nearly 1
nearly twenty 1


In [55]:
# Stemming and Lemmatization

In [56]:
porter = nltk.PorterStemmer()
ancaster = nltk.LancasterStemmer()

In [57]:
emmaregstem = [porter.stem(t) for t in emmatokens]
emmaregstem[:10]

['[', 'emma', 'by', 'jane', 'austen', '1816', ']', 'volum', 'i', 'chapter']

In [58]:
emmalowerstem = [porter.stem(t) for t in emmawords]
emmalowerstem[:10]

['[', 'emma', 'by', 'jane', 'austen', '1816', ']', 'volum', 'i', 'chapter']

In [59]:
emmaregstem = [ancaster.stem(t) for t in emmatokens]
emmaregstem[:10]

['[', 'emm', 'by', 'jan', 'aust', '1816', ']', 'volum', 'i', 'chapt']

In [60]:
emmalowerstem = [ancaster.stem(t) for t in emmawords]
emmalowerstem[:10]

['[', 'emm', 'by', 'jan', 'aust', '1816', ']', 'volum', 'i', 'chapt']

In [61]:
# our own stemmer by making a list of suffixes to take off.

def stem(word):
    for suffix in ['ing','ly','ed','ious','ies','ive','es','s']:
        if word.endswith(suffix):
            return word[:-len(suffix)]
    return word

stemmedword=stem('friends')
stemmedword

'friend'

In [62]:
wnl = nltk.WordNetLemmatizer()
emmalemma=[wnl.lemmatize(t) for t in emmawords]
emmalemma[:10]

['[', 'emma', 'by', 'jane', 'austen', '1816', ']', 'volume', 'i', 'chapter']

In [63]:
# Regular Expressions and Tokenization

In [64]:
import nltk
from nltk import *
file0 = nltk.corpus.gutenberg.fileids( ) [0]
emmatext = nltk.corpus.gutenberg.raw(file0)

type(emmatext)

str

In [65]:
shorttext = emmatext[:150]
for char in shorttext[:10]:
    print (char)
# Strings can be treated as lists of characters

[
E
m
m
a
 
b
y
 
J


In [66]:
# concatenate strings together
string1 = 'Monty Python'
string2 = 'Holy Grail'
string1 + string2
string1 + ' and the ' + string2

'Monty Python and the Holy Grail'

In [67]:
# replace all the new characters ‘\n’ with a space ' '.
newemmatext = emmatext.replace('\n', ' ')
shorttext = newemmatext[:150]
shorttext

'[Emma by Jane Austen 1816]  VOLUME I  CHAPTER I   Emma Woodhouse, handsome, clever, and rich, with a comfortable home and happy disposition, seemed to'

In [68]:
# Regular Expressions for Tokenizing Text
# re.match -: finds any match at the beginning of a string
# re.search -: finds a match anywhere in the string, and
# re.findall -: find the substrings that matched anywhere in the string.

In [69]:
import re
pword = re.compile('\w+') # find on the alphabetic words of this simple text
re_pwd = re.findall(pword, shorttext)
len(re_pwd)

24

In [70]:
specialtext = 'U.S.A. poster-print costs $12.40, with 10% off.'
re.findall(pword, specialtext)

['U', 'S', 'A', 'poster', 'print', 'costs', '12', '40', 'with', '10', 'off']

In [71]:
# matching words can have an internal hyphen
ptoken = re.compile('(\w+(-\w+)*)')
# put parentheses around the part of the pattern that can be repeated >=0
# But, then findall will only report the part that matched inside those parentheses,
# So put an extra pair of parentheses around the whole match.
re.findall(ptoken, specialtext)

[('U', ''),
 ('S', ''),
 ('A', ''),
 ('poster-print', '-print'),
 ('costs', ''),
 ('12', ''),
 ('40', ''),
 ('with', ''),
 ('10', ''),
 ('off', '')]

In [72]:
# re.findall reports both the whole matched text and the internal matched text. 
# can fix it by using the re.groups function to access only the outer match

In [73]:
# match abbreviations that might have a “.” inside
pabbrev = re.compile('(([A-Z]\.)+)')
re.findall(pabbrev, specialtext)

[('U.S.A.', 'A.')]

In [74]:
# match either words or abbreviations
ptoken = re.compile('(\w+(-\w+)* | ([A-Z]\.)+)')
re.findall(ptoken, specialtext)

[('poster-print ', '-print', ''), ('costs ', '', ''), ('with ', '', '')]

In [75]:
ptoken = re.compile('(([A-Z]\.)+|\w+(-\w+)*)')
re.findall(ptoken, specialtext)

[('U.S.A.', 'A.', ''),
 ('poster-print', '', '-print'),
 ('costs', '', ''),
 ('12', '', ''),
 ('40', '', ''),
 ('with', '', ''),
 ('10', '', ''),
 ('off', '', '')]

In [76]:
# That didn’t work because it first found the alphabetic words which found ‘U’, ‘S’ and ‘A’ as separate words before it could match the abbreviations.
# Therefore, the order of the matching patterns is important

In [77]:
ptoken = re.compile(r'(([A-Z]\.)+|\w+(-\w+)*|\$?\d+(\.\d+)?)')
output = re.findall(ptoken, specialtext)
print(output)

[('U.S.A.', 'A.', '', ''), ('poster-print', '', '-print', ''), ('costs', '', '', ''), ('$12.40', '', '', '.40'), ('with', '', '', ''), ('10', '', '', ''), ('off', '', '', '')]


In [78]:
# ‘r’ in front of the pattern -: Python’s notation for a raw string. This accepts ‘\’ as itself in a string

In [79]:
# an expression to match the currency
ptoken = re.compile(r'''([A-Z]\.)+
    | \w+(-\w+)* 
    | \$?\d+(\.\d+)? 
    ''', re.X)

re.findall(ptoken, specialtext)

[('A.', '', ''),
 ('', '-print', ''),
 ('', '', ''),
 ('', '', '.40'),
 ('', '', ''),
 ('', '', ''),
 ('', '', '')]

In [80]:
# Regular Expression Tokenizer using NLTK Tokenizer

# Regular expressions can also be written down in the “verbose” version, using the (?x) flag 
# allows the alternatives to be on different lines with comments
# alleviates the need to put extra parentheses.

In [81]:
pattern = r''' (?x) # set flag to allow verbose regexps
    ([A-Z]\.)+ # abbreviations, e.g. U.S.A
    | \w+(-\w+)*  # words with internal hyphens
    | \$?\d+(\.\d+)?%?  # currency and percentages, $12.40, 50%
    | \.\.\. # ellipsis
    | [][.,;"'?():-_'] # separate special character tokens
    '''
pattern

' (?x) # set flag to allow verbose regexps\n    ([A-Z]\\.)+ # abbreviations, e.g. U.S.A\n    | \\w+(-\\w+)*  # words with internal hyphens\n    | \\$?\\d+(\\.\\d+)?%?  # currency and percentages, $12.40, 50%\n    | \\.\\.\\. # ellipsis\n    | [][.,;"\'?():-_\'] # separate special character tokens\n    '

In [82]:
# the expression to separate special characters as individual tokens comes last in the list
# because other expressions, such as the words with internal hyphens, can first get longer tokens that involve individual characters

In [83]:
nltk.regexp_tokenize(shorttext, pattern)

[('', '', ''),
 ('', '', ''),
 ('', '', ''),
 ('', '', ''),
 ('', '', ''),
 ('', '', ''),
 ('', '', ''),
 ('', '', ''),
 ('', '', ''),
 ('', '', ''),
 ('', '', ''),
 ('', '', ''),
 ('', '', ''),
 ('', '', ''),
 ('', '', ''),
 ('', '', ''),
 ('', '', ''),
 ('', '', ''),
 ('', '', ''),
 ('', '', ''),
 ('', '', ''),
 ('', '', ''),
 ('', '', ''),
 ('', '', ''),
 ('', '', ''),
 ('', '', ''),
 ('', '', ''),
 ('', '', ''),
 ('', '', ''),
 ('', '', ''),
 ('', '', '')]

In [84]:
nltk.regexp_tokenize(specialtext, pattern)

[('A.', '', ''),
 ('', '-print', ''),
 ('', '', ''),
 ('', '', '.40'),
 ('', '', ''),
 ('', '', ''),
 ('', '', ''),
 ('', '', ''),
 ('', '', '')]

In [85]:
# In baove two code segments if there are any characters not matched by one of the regular expression patterns, then it is omitted as a token in the result.

In [86]:
tweetPattern = r''' (?x) # set flag to allow verbose regexps (https?://|www)\S+ # simple URLs
    | (:-\)|;-\)) # small list of emoticons
    | &(amp|lt|gt|quot); # XML or HTML entity | \#\w+ # hashtags | @\w+ # mentions
    | \d+:\d+ # timelike pattern | \d+\.\d+ # number with a decimal
    | (\d+,)+?\d{3}(?=([^,]|$)) # number with a comma
    | ([A-Z]\.)+ # simple abbreviations
    | (--+) # multiple dashes
    | \w+(-\w+)* # words with internal hyphens or apostrophes | ['\".?!,:;]+ # special characters
    '''

In [87]:
tweet1 = "@natalieohayre I agree #hc09 needs reform- but not by crooked politicians who r clueless about healthcare! #tcot #fishy NO GOV'T TAKEOVER!"

tweet2 = "To Sen. Roland Burris: Affordable, quality health insurance can't wait http://bit.ly/j63je #hc09 #IL #60660"

tweet3 = "RT @karoli: RT @Seriou: .@whitehouse I will stand w/ Obama on #healthcare, I trust him. #p2 #tlot"

In [88]:
nltk.regexp_tokenize(tweet1, tweetPattern)

[('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', ''

In [89]:
nltk.regexp_tokenize(tweet2, tweetPattern)

[('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', ''

In [90]:
nltk.regexp_tokenize(tweet3, tweetPattern)

[('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', '', '', '', '', '', ''),
 ('', ''