<h1>3 Processing Raw Text</h1>

<h3>Imports</h3>

In [2]:
import nltk, re, pprint
from nltk import word_tokenize

<h1>3.1 Accessing Text from the Web and from Disk</h1>

<h1>Electronic Books</h1>

In [3]:
from urllib import request

# Go to url of Crime and Punishment
url = "http://www.gutenberg.org/files/2554/2554-0.txt"
response = request.urlopen(url)
# Save the response in a string
# To avoid getting "ufeff" in raw string, include sig
# to specify encode with BOM
raw = response.read().decode("utf-8-sig")


In [4]:
print("Characters in this text: ", len(raw))
print(raw[:75])


Characters in this text:  1176964
The Project Gutenberg EBook of Crime and Punishment, by Fyodor Dostoevsky



<h3>Tokenize</h3>

In [5]:
tokens = word_tokenize(raw)

In [6]:
print(type(tokens))
print(len(tokens))
print(tokens[:10])

<class 'list'>
257726
['The', 'Project', 'Gutenberg', 'EBook', 'of', 'Crime', 'and', 'Punishment', ',', 'by']


<h3>Create a Text Object for the Raw Text</h3>

In [7]:
text = nltk.Text(tokens)
print(type(text))
print(text[1024:1062])
print(text.collocations())

<class 'nltk.text.Text'>
['an', 'exceptionally', 'hot', 'evening', 'early', 'in', 'July', 'a', 'young', 'man', 'came', 'out', 'of', 'the', 'garret', 'in', 'which', 'he', 'lodged', 'in', 'S.', 'Place', 'and', 'walked', 'slowly', ',', 'as', 'though', 'in', 'hesitation', ',', 'towards', 'K.', 'bridge', '.', 'He', 'had', 'successfully']
Katerina Ivanovna; Pyotr Petrovitch; Pulcheria Alexandrovna; Avdotya
Romanovna; Rodion Romanovitch; Marfa Petrovna; Sofya Semyonovna; old
woman; Project Gutenberg-tm; Porfiry Petrovitch; Amalia Ivanovna;
great deal; young man; Nikodim Fomitch; Ilya Petrovitch; Project
Gutenberg; Andrey Semyonovitch; Hay Market; Dmitri Prokofitch; Good
heavens
None


<h3>Find Indices In String Where Keywords/Phrases Occur</h3>

In [8]:
start = raw.find("PART I")
end = raw.rfind("End of Project Gutenberg’s")
print(start)
print(end)


5335
1157809


In [9]:
n_raw = raw[start:end]
# print(n_raw)

<h1>Dealing with HTML</h1>

In [10]:
url = "http://news.bbc.co.uk/2/hi/health/2284783.stm"
html = request.urlopen(url).read().decode("utf-8")
print(html[:60])

<!doctype html public "-//W3C//DTD HTML 4.0 Transitional//EN


In [11]:
from bs4 import BeautifulSoup
# Include "lxml" in the arguments to explicitly
# specify a parser to be used
raw = BeautifulSoup(html, "lxml").get_text()
tokens = word_tokenize(raw)
print(tokens)

['BBC', 'NEWS', '|', 'Health', '|', 'Blondes', "'to", 'die', 'out', 'in', '200', "years'", 'NEWS', 'SPORT', 'WEATHER', 'WORLD', 'SERVICE', 'A-Z', 'INDEX', 'SEARCH', 'You', 'are', 'in', ':', 'Health', 'News', 'Front', 'Page', 'Africa', 'Americas', 'Asia-Pacific', 'Europe', 'Middle', 'East', 'South', 'Asia', 'UK', 'Business', 'Entertainment', 'Science/Nature', 'Technology', 'Health', 'Medical', 'notes', '--', '--', '--', '--', '--', '--', '-', 'Talking', 'Point', '--', '--', '--', '--', '--', '--', '-', 'Country', 'Profiles', 'In', 'Depth', '--', '--', '--', '--', '--', '--', '-', 'Programmes', '--', '--', '--', '--', '--', '--', '-', 'SERVICES', 'Daily', 'E-mail', 'News', 'Ticker', 'Mobile/PDAs', '--', '--', '--', '--', '--', '--', '-', 'Text', 'Only', 'Feedback', 'Help', 'EDITIONS', 'Change', 'to', 'UK', 'Friday', ',', '27', 'September', ',', '2002', ',', '11:51', 'GMT', '12:51', 'UK', 'Blondes', "'to", 'die', 'out', 'in', '200', "years'", 'Scientists', 'believe', 'the', 'last', 'blond

In [12]:
tokens = tokens[110:390]
text = nltk.Text(tokens)
text.concordance("gene")

Displaying 5 of 5 matches:
hey say too few people now carry the gene for blondes to last beyond the next 
blonde hair is caused by a recessive gene . In order for a child to have blond
 have blonde hair , it must have the gene on both sides of the family in the g
ere is a disadvantage of having that gene or by chance . They do n't disappear
des would disappear is if having the gene was a disadvantage and I do not thin


<h1>Processing RSS Feeds</h1>

In [13]:
import feedparser

llog = feedparser.parse("http://languagelog.ldc.upenn.edu/nll/?feed=atom")


In [14]:
# Title of the feed
print(llog["feed"]["title"])

# How many entries
print(len(llog.entries))

# Grab the first post
post0 = llog.entries[0]
print(post0.title)

# Grab the HTML of the first post
post0_content = post0.content[0].value
print(post0_content[:100])

# Extract the text from the HTML
raw = BeautifulSoup(post0_content, "lxml").get_text()
tokens = word_tokenize(raw)
print(tokens[:20])

Language Log
13
The harmonics of &#039;entitlement&#039;
<p>A lot of the most effective political keywords derive their force from a maneuver akin to what <a
['A', 'lot', 'of', 'the', 'most', 'effective', 'political', 'keywords', 'derive', 'their', 'force', 'from', 'a', 'maneuver', 'akin', 'to', 'what', 'H.', 'W.', 'Fowler']


<h1>Reading Local Files</h1>

In [15]:
# Open the text file
love_song_path = "../My-Texts/the-love-song-of-j-alfred-prufrock.txt"
love_song = open(love_song_path, 'r', encoding="utf")

# Store the text in a string
love_song_raw = love_song.read()

# Tokenize the text
love_song_tokens = word_tokenize(love_song_raw)

# Normalize the words
# Remove punctuation
love_song_tokens = [w.lower() for w in love_song_tokens if w.isalnum()]
# print(love_song_tokens[:100])

# Grab all unique vocab
love_song_vocab = sorted(set(love_song_tokens))

# Find unique vocabulary
print(love_song_vocab[:10])
print("\nUnique Vocab: ", len(love_song_vocab))

['a', 'about', 'across', 'advise', 'afraid', 'after', 'afternoon', 'afternoons', 'against', 'al']

Unique Vocab:  435


<h3>The NLP Pipeline</h3>

<img src="../Images/pipeline1.png">

<h1>3.2 Strings</h1>

In [16]:
from nltk.corpus import gutenberg

In [17]:
raw = gutenberg.raw("melville-moby_dick.txt")
fdist = nltk.FreqDist(ch.lower() for ch in raw if ch.isalpha())
print(fdist.most_common(5))

# Grab the characters in order of most->least frequent
ordered_chars = [c for (c, freq) in fdist.most_common()]
print(ordered_chars)

[('e', 117092), ('t', 87996), ('a', 77916), ('o', 69326), ('n', 65617)]
['e', 't', 'a', 'o', 'n', 'i', 's', 'h', 'r', 'l', 'd', 'u', 'm', 'c', 'w', 'f', 'g', 'p', 'b', 'y', 'v', 'k', 'q', 'j', 'x', 'z']


<h1>3.3 Text Processing with Unicode</h1>

<img src="../Images/unicode.png">

<h3>Extracting Encoded Text from Files</h3>

In [18]:
path = nltk.data.find('corpora/unicode_samples/polish-lat2.txt')
f = open(path, encoding="latin2")
for line in f:
    line = line.strip()
    print(line)


Pruska Biblioteka Państwowa. Jej dawne zbiory znane pod nazwą
"Berlinka" to skarb kultury i sztuki niemieckiej. Przewiezione przez
Niemców pod koniec II wojny światowej na Dolny Śląsk, zostały
odnalezione po 1945 r. na terytorium Polski. Trafiły do Biblioteki
Jagiellońskiej w Krakowie, obejmują ponad 500 tys. zabytkowych
archiwaliów, m.in. manuskrypty Goethego, Mozarta, Beethovena, Bacha.


In [19]:
# Convert all non-ASCII characters to 2-digit
# and 4 digit representations:
# \xXX and \uXXXX
f = open(path, encoding="latin2")
for line in f:
    line = line.strip()
    print(line.encode("unicode_escape"))

b'Pruska Biblioteka Pa\\u0144stwowa. Jej dawne zbiory znane pod nazw\\u0105'
b'"Berlinka" to skarb kultury i sztuki niemieckiej. Przewiezione przez'
b'Niemc\\xf3w pod koniec II wojny \\u015bwiatowej na Dolny \\u015al\\u0105sk, zosta\\u0142y'
b'odnalezione po 1945 r. na terytorium Polski. Trafi\\u0142y do Biblioteki'
b'Jagiello\\u0144skiej w Krakowie, obejmuj\\u0105 ponad 500 tys. zabytkowych'
b'archiwali\\xf3w, m.in. manuskrypty Goethego, Mozarta, Beethovena, Bacha.'


In [20]:
nacute = 'ń'
# Find integer ordinals of characters
print("Ord of char: ", ord(nacute))

# Find char representation of int value
print("Char of int: ", chr(324))

# Find char representation of hex value
print("Char of hex: ", chr(0x144))

Ord of char:  324
Char of int:  ń
Char of hex:  ń


In [21]:
nacute.encode("utf8")


b'\xc5\x84'

In [22]:
# Inspect properties of Unicode characters
import unicodedata

# Load in the text
lines = open(path, encoding="latin2").readlines()

# Grab the 3rd line
line = lines[2]

print(line.encode("unicode_escape"))

for c in line:
    # If c is outside of the normal ASCII range
    if ord(c) > 127:
        # UTF-8 encoding == decoded character | U+hex int | unicode name
#         print(type(c.encode("utf8")))
        print("{} == {} | U+{:04x} {}".format(c.encode("utf8"), c,
                                      ord(c), unicodedata.name(c)))

b'Niemc\\xf3w pod koniec II wojny \\u015bwiatowej na Dolny \\u015al\\u0105sk, zosta\\u0142y\\n'
b'\xc3\xb3' == ó | U+00f3 LATIN SMALL LETTER O WITH ACUTE
b'\xc5\x9b' == ś | U+015b LATIN SMALL LETTER S WITH ACUTE
b'\xc5\x9a' == Ś | U+015a LATIN CAPITAL LETTER S WITH ACUTE
b'\xc4\x85' == ą | U+0105 LATIN SMALL LETTER A WITH OGONEK
b'\xc5\x82' == ł | U+0142 LATIN SMALL LETTER L WITH STROKE


In [23]:
# Grab all words
words = [w for w in word_tokenize(line) if w.isalnum()]
# Grab the last word
final_word = words[-1]
decoded = ""

decoded = "zosta\u0142y"
print(line)
print(decoded)
line.find(decoded)

line.encode("unicode_escape")

# Search line with a regular expression
import re
m = re.search("\u015b\w*", line)
print(m.group())


Niemców pod koniec II wojny światowej na Dolny Śląsk, zostały

zostały
światowej


<h3>NLTK tokenizers allow and yield Unicode</h3>

In [24]:
word_tokenize(line)

['Niemców',
 'pod',
 'koniec',
 'II',
 'wojny',
 'światowej',
 'na',
 'Dolny',
 'Śląsk',
 ',',
 'zostały']

<h1>3.4 Regular Expressions for Detecting Word Patterns</h1>

In [25]:
import re
wordlist = [w for w in nltk.corpus.words.words("en") if w.islower()]
print(wordlist[:10])

['a', 'aa', 'aal', 'aalii', 'aam', 'aardvark', 'aardwolf', 'aba', 'abac', 'abaca']


<h3>Basic Meta-Characters</h3>

In [26]:
# re.search(<pattern>, <string>)
# $ == end of word
pat1 = r"ed$"
match_p1 = [w for w in wordlist if re.search(pat1, w)]
print(match_p1[:10], end='\n\n')

# . == wildcard (i.e. any single character)
# ^ == start of string
pat2 = r"^..j..t..$"
match_p2 = [w for w in wordlist if re.search(pat2, w)]
print(match_p2[:10], end='\n\n')


# ? == previous character optional
email = ["email", "e-mail", "e mail", "e;mail"]
pat_email = r"^e[-/ /;/]?mail$"
match_email = [w for w in email if re.search(pat_email, w)]
print(match_email)

['abaissed', 'abandoned', 'abased', 'abashed', 'abatised', 'abed', 'aborted', 'abridged', 'abscessed', 'absconded']

['abjectly', 'adjuster', 'dejected', 'dejectly', 'injector', 'majestic', 'objectee', 'objector', 'rejecter', 'rejector']

['email', 'e-mail', 'e mail', 'e;mail']


<h3>Ranges and Closures</h3>

In [27]:
# T9 system (used for entering text on mobile phones)
t9_4653 = r"^[ghi][mno][jkl][def]$"
matches = [w for w in wordlist if re.search(t9_4653, w)]
print(matches)

['gold', 'golf', 'hold', 'hole']


In [28]:
chat_words = sorted(set(w for w in nltk.corpus.nps_chat.words()))

In [29]:
# + == 1 or more occurances
mine_closure = r"^m+i+n+e+$"
mine_matches = [w for w in chat_words if re.search(mine_closure, w)]
print(mine_matches)

['miiiiiiiiiiiiinnnnnnnnnnneeeeeeeeee', 'miiiiiinnnnnnnnnneeeeeeee', 'mine', 'mmmmmmmmiiiiiiiiinnnnnnnnneeeeeeee']


In [30]:
# * == 0 or more occurances
laughter = r"[ah]*(aha)+[ah]*"
laughter_matches = [w for w in chat_words if re.search(laughter, w)]
print(laughter_matches)

['Bwhaha', 'Haha', 'Hahaaaa', 'ahah', 'ahahah', 'ahhahahaha', 'bahahahaa', 'bwahahahahahahahahahaha', 'haha', 'hahaaa', 'hahah', 'hahaha', 'hahahaHA', 'hahahaa', 'hahahah', 'hahahaha', 'hahahahaaa', 'hahahahahaha', 'hahahahahahaha', 'hahahahahahahahahahahahahahahaha', 'hahahhahah', 'hahhahahaha', 'muhaha']


In [31]:
# [^_] == match any character except '_'
non_vowel = r"[^aeiouAEIOU]"
non_vowels = [w for w in chat_words if re.search(non_vowel, w)
              and w.isalpha()]
print(non_vowels[:10])

['ABOUT', 'ACTION', 'AFK', 'AGAIN', 'AHAHH', 'AHAHHA', 'AHHAH', 'AKDT', 'AKST', 'ALL']


In [32]:
wsj = sorted(set(nltk.corpus.treebank.words()))

In [33]:
# \ == treat next character as text (i.e. ignore 
# meta-character meaning)
decimal = r"^[0-9]*\.[0-9]+$"
decimals = [w for w in wsj if re.search(decimal, w)]
print(decimals[:10])

currency = r"^([A-Z]+\$)$"
currencies = [w for w in wsj if re.search(currency, w)]
print(currencies)

['0.0085', '0.05', '0.1', '0.16', '0.2', '0.25', '0.28', '0.3', '0.4', '0.5']
['C$', 'US$']


In [34]:
# {} == number of repeats of previous item
four_digit = r"^[0-9]{4}$"
four_dig_nums = [w for w in wsj if re.search(four_digit, w)]
print(four_dig_nums)

['1614', '1637', '1787', '1901', '1903', '1917', '1925', '1929', '1933', '1934', '1948', '1953', '1955', '1956', '1961', '1965', '1966', '1967', '1968', '1969', '1970', '1971', '1972', '1973', '1975', '1976', '1977', '1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986', '1987', '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2005', '2009', '2017', '2019', '2029', '3057', '8300']


In [35]:
# {n, m} == match 3-5 repetitions
num_hyph = r"^[0-9]+-[(a-z)(A-Z)]{3,7}$"
num_hyphs = [w for w in wsj if re.search(num_hyph, w)]
print(num_hyphs)

print('\n\n')
hyph = r"[(a-z)(A-Z)]+-[(a-z)(A-Z)]+-[(a-z)(A-Z)+]"
hyphs = [w for w in wsj if re.search(hyph, w)]
print(hyphs)

['10-day', '10-lap', '10-year', '100-share', '12-member', '12-point', '12-year', '14-hour', '15-day', '150-point', '190-point', '20-point', '20-stock', '21-month', '237-seat', '240-page', '27-year', '30-day', '30-minute', '30-point', '30-share', '30-year', '300-day', '36-day', '36-minute', '36-store', '42-year', '50-state', '500-Stock', '500-stock', '52-week', '520-lawyer', '69-point', '84-month', '87-store', '90-day']



['90-cent-an-hour', 'Hart-Scott-Rodino', 'Rent-A-Car', 'anti-morning-sickness', 'black-and-white', 'bread-and-butter', 'built-from-kit', 'cash-and-stock', 'cents-a-unit', 'computer-system-design', 'day-to-day', 'do-it-yourself', 'easy-to-read', 'father-in-law', 'four-foot-high', 'four-year-old', 'get-out-the-vote', 'larger-than-normal', 'less-than-brilliant', 'life-of-contract', 'machine-gun-toting', 'million-a-year', 'most-likely-successor', 'over-the-counter', 'red-and-white', 'savings-and-loan', 'search-and-seizure', 'seven-million-ton', 'tete-a-tete', 'triple-A-ra

In [36]:
# a | b == match a or b
ed_ing = r"[(a-z)(A-Z)]+(ed|ing|s)$"
ed_ing_matches = [w for w in wsj if re.search(ed_ing, w)]
print(ed_ing_matches[:10])



['62%-owned', 'ADRs', 'Absorbed', 'According', 'Adams', 'Adds', 'Adopting', 'Advanced', 'Advancing', 'Advocates']


<h1>3.5 Useful Applications of Regular Expressions</h1>

<h3>Extracting Word Pieces</h3>

In [37]:
# Find all non-overlapping matches of the given regular expression

# Find all vowells in word and count them
word = "supercalifragilisticexpialidocious"
vowels = r"[aeiou]"
matches = re.findall(vowels, word)
print(matches)
print(len(matches))

['u', 'e', 'a', 'i', 'a', 'i', 'i', 'i', 'e', 'i', 'a', 'i', 'o', 'i', 'o', 'u']
16


In [38]:
# Look for all sequences of 2 or more vowels in some text
# & determine relative frequency
vowel_seq = r"[(aeiou)(AEIOU)]{2,}"
freq_dist = nltk.FreqDist(vs for word in wsj
                             for vs in re.findall(vowel_seq, word))
print(freq_dist.most_common(12))

[('io', 549), ('ea', 476), ('ie', 331), ('ou', 329), ('ai', 261), ('ia', 253), ('ee', 217), ('oo', 174), ('ua', 109), ('au', 106), ('ue', 105), ('ui', 95)]


In [39]:
date = "2009-12-31"
pattern = r"[0-9]{2,4}"
to_list = [int(n) for n in re.findall(pattern, date)]
print(to_list)

[2009, 12, 31]


<h3>Doing More with Word Pieces</h3>

In [40]:
# <start with one or more vowels> | <one or more vowels at end> | <only vowel>
regexp = r"^[AEIOUaeiou]+|[AEIOUaeiou]+$|[^AEIOUaeiou]"
english_udhr = nltk.corpus.udhr.words("English-Latin1")
print(english_udhr[:25])

def compress(word):
    pieces = re.findall(regexp, word)
    return ''.join(pieces)
    
print(nltk.tokenwrap(compress(w) for w in english_udhr[:75]))

['Universal', 'Declaration', 'of', 'Human', 'Rights', 'Preamble', 'Whereas', 'recognition', 'of', 'the', 'inherent', 'dignity', 'and', 'of', 'the', 'equal', 'and', 'inalienable', 'rights', 'of', 'all', 'members', 'of', 'the', 'human']
Unvrsl Dclrtn of Hmn Rghts Prmble Whrs rcgntn of the inhrnt dgnty and
of the eql and inlnble rghts of all mmbrs of the hmn fmly is the fndtn
of frdm , jstce and pce in the wrld , Whrs dsrgrd and cntmpt fr hmn
rghts hve rsltd in brbrs acts whch hve outrgd the cnscnce of mnknd ,
and the advnt of a wrld in whch hmn bngs shll enjy frdm of spch and


In [41]:
rotokas_words = nltk.corpus.toolbox.words("rotokas.dic")

In [42]:
# Extract all consonant-vowel sequences
cvs = [cv for w in rotokas_words for cv in
       re.findall(r"[ptksvr][aeiou]", w)]
cfd = nltk.ConditionalFreqDist(cvs)
cfd.tabulate()

    a   e   i   o   u 
k 418 148  94 420 173 
p  83  31 105  34  51 
r 187  63  84  89  79 
s   0   0 100   2   1 
t  47   8   0 148  37 
v  93  27 105  48  49 


In [43]:
# Inspect all words containing "su" and "po"
cv_pairs = [(cv, w) for w in rotokas_words
                    for cv in re.findall(r"[ptksvr][aeiou]", w)]
cv_index = nltk.Index(cv_pairs)
print(cv_index["su"])
print(cv_index["po"])

['kasuari']
['kaapo', 'kaapopato', 'kaipori', 'kaiporipie', 'kaiporivira', 'kapo', 'kapoa', 'kapokao', 'kapokapo', 'kapokapo', 'kapokapoa', 'kapokapoa', 'kapokapora', 'kapokapora', 'kapokaporo', 'kapokaporo', 'kapokari', 'kapokarito', 'kapokoa', 'kapoo', 'kapooto', 'kapoovira', 'kapopaa', 'kaporo', 'kaporo', 'kaporopa', 'kaporoto', 'kapoto', 'karokaropo', 'karopo', 'kepo', 'kepoi', 'keposi', 'kepoto']


<h3>Finding Word Stems</h3>

In [44]:
suffixes = ['ing', 'ly', 'ed', 'ious', 'ies', 'ive', 'es', 's', 'ment']

def simple_stem(word):
    for suffix in suffixes:
        if word.endswith(suffix):
            return word[:-len(suffix)]
    return word

print("Right: ", simple_stem("dogs"))
print("Wrong: ", simple_stem("dogged"))

Right:  dog
Wrong:  dogg


In [45]:
suffix_re = r"^.*(ing|ly|ed|ious|ies|ive|es|s|ment)$"
print("Suffix: ")
print(re.findall(suffix_re, "processing"))
print("\n\n")

stem_re = r"^(.*)(ing|ly|ed|ious|ies|ive|es|s|ment)$"
print("Pair: ")
print(re.findall(stem_re, "processing"))

print("\n\n", "Flaw: ")
print(re.findall(stem_re, "processes"))

Suffix: 
['ing']



Pair: 
[('process', 'ing')]


 Flaw: 
[('processe', 's')]


In [46]:
# Non-greedy version of * operator and added "ged"
better_stem_re = r"^(.*?)(ing|ly|ged|ed|ious|ies|ive|es|s|ment)?$"
print(re.findall(better_stem_re, "processes"))
print(re.findall(better_stem_re, "dogged"))

solo = re.findall(better_stem_re, "language")
print("empty suffix: ", solo)

[('process', 'es')]
[('dog', 'ged')]
empty suffix:  [('language', '')]


In [47]:
def naive_stem(word):
#     print(word)
    regex = r"^(.*?)(ing|ly|ged|ed|ious|ies|ive|es|s|ment)?$"
    stem, suffix = re.findall(regex, word)[0]
    return stem

raw = """DENNIS: Listen, strange women lying in ponds distributing \
swords is no basis for a system of government.  Supreme executive \
power derives from a mandate from the masses, not from some farcical \
aquatic ceremony."""

tokens = word_tokenize(raw)
print([naive_stem(t) for t in tokens])

['DENNIS', ':', 'Listen', ',', 'strange', 'women', 'ly', 'in', 'pond', 'distribut', 'sword', 'i', 'no', 'basi', 'for', 'a', 'system', 'of', 'govern', '.', 'Supreme', 'execut', 'power', 'deriv', 'from', 'a', 'mandate', 'from', 'the', 'mass', ',', 'not', 'from', 'some', 'farcical', 'aquatic', 'ceremony', '.']


<h3>Searching Tokenized Text</h3>

In [48]:
from nltk.corpus import gutenberg, nps_chat

moby = nltk.Text(gutenberg.words("melville-moby_dick.txt"))
chat = nltk.Text(nps_chat.words())


In [49]:
# 3 word phrases 'a _ man'
moby.findall(r"<a> (<.*>) <man>")
print("\n\n")

# 3 word phrases ending in bro
chat.findall(r"<.*> <.*> <bro>")

# 3 or more words starting with 'l'
print("\n\n")
chat.findall(r"<l.*>{3,}")

monied; nervous; dangerous; white; white; white; pious; queer; good;
mature; white; Cape; great; wise; wise; butterless; white; fiendish;
pale; furious; better; certain; complete; dismasted; younger; brave;
brave; brave; brave



you rule bro; telling you bro; u twizted bro



lol lol lol; lmao lol lol; lol lol lol; la la la la la; la la la; la
la la; lovely lol lol love; lol lol lol.; la la la; la la la


In [50]:
# Find hobbies
from nltk.corpus import brown
hobbies_learned = nltk.Text(brown.words(categories=["hobbies",
                                                    "learned"]))
hobbies_learned.findall(r"<\w*> <and> <other> <\w*s>")

speed and other activities; water and other liquids; tomb and other
landmarks; Statues and other monuments; pearls and other jewels;
charts and other items; roads and other features; figures and other
objects; military and other areas; demands and other factors;
abstracts and other compilations; iron and other metals


<h1>3.6 Normalizing Text</h1>

In [51]:
raw = """DENNIS: Listen, strange women lying in ponds distributing \
swords is no basis for a system of government.  Supreme executive \
power derives from a mandate from the masses, not from some farcical \
aquatic ceremony."""
tokens = word_tokenize(raw)


<h3>Stemmers</h3>

In [52]:
porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()
porter_ls = [porter.stem(t) for t in tokens]
lancaster_ls = [lancaster.stem(t) for t in tokens]
print(porter_ls)
print("\n\n")
print(lancaster_ls)

['denni', ':', 'listen', ',', 'strang', 'women', 'lie', 'in', 'pond', 'distribut', 'sword', 'is', 'no', 'basi', 'for', 'a', 'system', 'of', 'govern', '.', 'suprem', 'execut', 'power', 'deriv', 'from', 'a', 'mandat', 'from', 'the', 'mass', ',', 'not', 'from', 'some', 'farcic', 'aquat', 'ceremoni', '.']



['den', ':', 'list', ',', 'strange', 'wom', 'lying', 'in', 'pond', 'distribut', 'sword', 'is', 'no', 'bas', 'for', 'a', 'system', 'of', 'govern', '.', 'suprem', 'execut', 'pow', 'der', 'from', 'a', 'mand', 'from', 'the', 'mass', ',', 'not', 'from', 'som', 'farc', 'aqu', 'ceremony', '.']


<h3>Lemmatization</h3>

In [53]:
# Only removes/affixes if the resulting word is in its dictionary
wnl = nltk.WordNetLemmatizer()
lemmas = [wnl.lemmatize(t) for t in tokens]
print(lemmas)

['DENNIS', ':', 'Listen', ',', 'strange', 'woman', 'lying', 'in', 'pond', 'distributing', 'sword', 'is', 'no', 'basis', 'for', 'a', 'system', 'of', 'government', '.', 'Supreme', 'executive', 'power', 'derives', 'from', 'a', 'mandate', 'from', 'the', 'mass', ',', 'not', 'from', 'some', 'farcical', 'aquatic', 'ceremony', '.']


<h3>Note:</h3>
<p>Another normalization task involves identifying non-standard words including numbers, abbreviations, and dates, and mapping any such tokens to a special vocabulary. For example, every decimal number could be mapped to a single token 0.0, and every acronym could be mapped to AAA. This keeps the vocabulary small and improves the accuracy of many language modeling tasks.</p>

<h1>3.7 Regular Expressions for Tokenizing Text</h1>

In [54]:
raw = """'When I'M a Duchess,' she said to herself, (not in a very hopeful tone
though), 'I won't have any pepper in my kitchen AT ALL. Soup does very
well without--Maybe it's always pepper that makes people hot-tempered,'..."""
print(raw)

'When I'M a Duchess,' she said to herself, (not in a very hopeful tone
though), 'I won't have any pepper in my kitchen AT ALL. Soup does very
well without--Maybe it's always pepper that makes people hot-tempered,'...


In [55]:
# Doesn't remove newlines or tabs
print(re.split(r" ", raw))

# Remove tabs/newlines
print("\n\n")
print(re.split(r"\s+", raw))

["'When", "I'M", 'a', "Duchess,'", 'she', 'said', 'to', 'herself,', '(not', 'in', 'a', 'very', 'hopeful', 'tone\nthough),', "'I", "won't", 'have', 'any', 'pepper', 'in', 'my', 'kitchen', 'AT', 'ALL.', 'Soup', 'does', 'very\nwell', 'without--Maybe', "it's", 'always', 'pepper', 'that', 'makes', 'people', "hot-tempered,'..."]



["'When", "I'M", 'a', "Duchess,'", 'she', 'said', 'to', 'herself,', '(not', 'in', 'a', 'very', 'hopeful', 'tone', 'though),', "'I", "won't", 'have', 'any', 'pepper', 'in', 'my', 'kitchen', 'AT', 'ALL.', 'Soup', 'does', 'very', 'well', 'without--Maybe', "it's", 'always', 'pepper', 'that', 'makes', 'people', "hot-tempered,'..."]


In [56]:
# Include punctuation
print(re.findall(r"\w+|\S\w*", raw))

print("\n\n")
# Get '(' out of tokens
print(re.findall(r"\w+(?:[-']\w+)*|'|[-.(]+|\S\w*", raw))

["'When", 'I', "'M", 'a', 'Duchess', ',', "'", 'she', 'said', 'to', 'herself', ',', '(not', 'in', 'a', 'very', 'hopeful', 'tone', 'though', ')', ',', "'I", 'won', "'t", 'have', 'any', 'pepper', 'in', 'my', 'kitchen', 'AT', 'ALL', '.', 'Soup', 'does', 'very', 'well', 'without', '-', '-Maybe', 'it', "'s", 'always', 'pepper', 'that', 'makes', 'people', 'hot', '-tempered', ',', "'", '.', '.', '.']



["'", 'When', "I'M", 'a', 'Duchess', ',', "'", 'she', 'said', 'to', 'herself', ',', '(', 'not', 'in', 'a', 'very', 'hopeful', 'tone', 'though', ')', ',', "'", 'I', "won't", 'have', 'any', 'pepper', 'in', 'my', 'kitchen', 'AT', 'ALL', '.', 'Soup', 'does', 'very', 'well', 'without', '--', 'Maybe', "it's", 'always', 'pepper', 'that', 'makes', 'people', 'hot-tempered', ',', "'", '...']


In [57]:

text = 'That U.S.A. poster-print costs $12.40...'
pattern = r'''(?x)            # set flag to allow verbose regexps
          ([A-Z]\.)+          # abbreviations, e.g. U.S.A.
          | \w+(-\w+)*        # words with optional internal hyphens
          | \$?\d+(\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
          | \.+               # ellipsis
          | [][.,;"'?():-_`]  # these are separate tokens; includes ], [
          '''

nltk.regexp_tokenize(text, pattern)
['That', 'U.S.A.', 'poster-print', 'costs', '$12.40', '...']

['That', 'U.S.A.', 'poster-print', 'costs', '$12.40', '...']