## Useful applications of Regular Expressions

In [1]:
import nltk, re, pprint
from nltk import word_tokenize #a word tokenizer
from urllib import request #read in file from url
from bs4 import BeautifulSoup #removing html from text
import feedparser #access content of a blog

In [3]:
# Tally the vowels in one word: every regex hit contributes 1 to the total.
word = 'supercalifragilisticexpialidocious'
sum(1 for _ in re.finditer(r"[aeiou]", word))


16

In [8]:
# Frequency of every run of two or more lowercase vowels across the distinct
# word types of the WSJ treebank sample.
wsj = sorted(set(nltk.corpus.treebank.words()))
fd = nltk.FreqDist(
    vowel_run
    for token in wsj
    for vowel_run in re.findall(r'[aeiou]{2,}', token)
)
fd.most_common(12)

[('io', 549),
 ('ea', 476),
 ('ie', 331),
 ('ou', 329),
 ('ai', 261),
 ('ia', 253),
 ('ee', 217),
 ('oo', 174),
 ('ua', 109),
 ('au', 106),
 ('ue', 105),
 ('ui', 95)]

In [9]:
# Extract the numeric fields (runs of 2-4 digits) of a date string as integers.
list(map(int, re.findall(r'[0-9]{2,4}', '2009-12-21')))

[2009, 12, 21]

In [17]:
# Stemming reduces a word to its stem (root) by stripping the affixes
# attached to it.  The lazy first group captures the stem; the second group
# captures whichever listed suffix ends the word.

re.compile(r'(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)$').findall('processes')

[('process', 'es')]

In [25]:
def stem(word):
    """Strip one common English suffix from ``word`` and return the stem.

    The stem is matched lazily, so the earliest position at which one of the
    listed suffixes reaches the end of the word wins (e.g. "processes" ->
    "process").

    Args:
        word: The string to stem.

    Returns:
        The part of ``word`` before the matched suffix, or ``word`` unchanged
        when it ends in none of the listed suffixes.
    """
    regex = r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)$'
    match = re.match(regex, word)
    if match is None:
        # The suffix group is mandatory, so suffix-less words (e.g. "cat")
        # do not match at all; indexing an empty findall() result used to
        # raise IndexError here.
        return word
    return match.group(1)

stem("processing")

'process'

In [22]:
def stem(word):
    """Return ``word`` with at most one trailing suffix removed.

    The suffix group is optional, so the pattern also matches words that end
    in none of the listed suffixes; those come back unchanged.
    """
    pattern = r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$'
    first_match = re.findall(pattern, word)[0]
    # Each match is a (stem, suffix) pair; only the stem is of interest.
    return first_match[0]

stem("processing")

'process'

In [32]:
# Limitation of the regex stemmer: the final 's' of "process" is itself one of
# the listed suffixes, so the word is over-stripped.
stem("process")

'proces'

In [34]:
def stem(word):
    """Return ``word`` with at most one trailing suffix removed.

    The suffix group is optional, so words ending in none of the listed
    suffixes still match and are returned unchanged.
    """
    pattern = r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$'
    first_match = re.findall(pattern, word)[0]
    # Each match is a (stem, suffix) pair; only the stem is of interest.
    return first_match[0]

# Sample text: tokenize it, then run the regex stemmer over every token.
raw = """DENNIS: Listen, strange women lying in ponds distributing swords
is no basis for a system of government.  Supreme executive power derives from
a mandate from the masses, not from some farcical aquatic ceremony."""
tokens = word_tokenize(raw)
list(map(stem, tokens))

['DENNIS',
 ':',
 'Listen',
 ',',
 'strange',
 'women',
 'ly',
 'in',
 'pond',
 'distribut',
 'sword',
 'i',
 'no',
 'basi',
 'for',
 'a',
 'system',
 'of',
 'govern',
 '.',
 'Supreme',
 'execut',
 'power',
 'deriv',
 'from',
 'a',
 'mandate',
 'from',
 'the',
 'mass',
 ',',
 'not',
 'from',
 'some',
 'farcical',
 'aquatic',
 'ceremony',
 '.']

In [36]:
def stem(word):
    """Return ``word`` with at most one trailing suffix removed.

    The suffix group is optional, so words ending in none of the listed
    suffixes still match and are returned unchanged.
    """
    pattern = r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$'
    first_match = re.findall(pattern, word)[0]
    # Each match is a (stem, suffix) pair; only the stem is of interest.
    return first_match[0]

# A second sample: tokenize the sentence and stem each token with the regex stemmer.
sentences = "There are many women in the backyard, some are cooking others eating. Interesting"

tokens = word_tokenize(sentences)
list(map(stem, tokens))

['There',
 'are',
 'many',
 'women',
 'in',
 'the',
 'backyard',
 ',',
 'some',
 'are',
 'cook',
 'other',
 'eat',
 '.',
 'Interest']

In [37]:
from nltk.corpus import gutenberg, nps_chat
# Wrap the Moby Dick word list in an nltk.Text so it can be searched with
# token-level patterns.
moby = nltk.Text(gutenberg.words('melville-moby_dick.txt'))
# Each <...> stands for one whole token; only the parenthesised token is
# reported, i.e. the word found between "a" and "man".
moby.findall(r"<a> (<.*>) <man>")

monied; nervous; dangerous; white; white; white; pious; queer; good;
mature; white; Cape; great; wise; wise; butterless; white; fiendish;
pale; furious; better; certain; complete; dismasted; younger; brave;
brave; brave; brave


In [38]:
# Search the NPS chat corpus for three-token phrases whose last token is "bro".
chats = nltk.Text(nps_chat.words())
chats.findall(r"<.*> <.*> <bro>")

you rule bro; telling you bro; u twizted bro


In [39]:
# Runs of three or more consecutive tokens that each begin with the letter "l".
chats.findall(r"<l.*>{3,}")

lol lol lol; lmao lol lol; lol lol lol; la la la la la; la la la; la
la la; lovely lol lol love; lol lol lol.; la la la; la la la


## 3.6 Text Normalization

### 3.6.1 Stemming


In [41]:
# Sample text to normalize; tokenize it once so the stemmers can work on the tokens.
raw = """DENNIS: Listen, strange women lying in ponds distributing swords
    is no basis for a system of government.  Supreme executive power derives from
    a mandate from the masses, not from some farcical aquatic ceremony."""

tokens = word_tokenize(raw)

# Apply NLTK's Porter stemmer to every token.
porter = nltk.PorterStemmer()
list(map(porter.stem, tokens))

['denni',
 ':',
 'listen',
 ',',
 'strang',
 'women',
 'lie',
 'in',
 'pond',
 'distribut',
 'sword',
 'is',
 'no',
 'basi',
 'for',
 'a',
 'system',
 'of',
 'govern',
 '.',
 'suprem',
 'execut',
 'power',
 'deriv',
 'from',
 'a',
 'mandat',
 'from',
 'the',
 'mass',
 ',',
 'not',
 'from',
 'some',
 'farcic',
 'aquat',
 'ceremoni',
 '.']

In [42]:
# Same tokens, this time run through NLTK's Lancaster stemmer for comparison.
lancaster = nltk.LancasterStemmer()
list(map(lancaster.stem, tokens))

['den',
 ':',
 'list',
 ',',
 'strange',
 'wom',
 'lying',
 'in',
 'pond',
 'distribut',
 'sword',
 'is',
 'no',
 'bas',
 'for',
 'a',
 'system',
 'of',
 'govern',
 '.',
 'suprem',
 'execut',
 'pow',
 'der',
 'from',
 'a',
 'mand',
 'from',
 'the',
 'mass',
 ',',
 'not',
 'from',
 'som',
 'farc',
 'aqu',
 'ceremony',
 '.']

### 3.6.2 Lemmatization

In [43]:
# Lemmatization is a form of stemming that makes sure the resulting stem is a
# word found in the WordNet dictionary.

wnl = nltk.WordNetLemmatizer()
# A good choice when a list of valid lemmas is wanted.
list(map(wnl.lemmatize, tokens))

['DENNIS',
 ':',
 'Listen',
 ',',
 'strange',
 'woman',
 'lying',
 'in',
 'pond',
 'distributing',
 'sword',
 'is',
 'no',
 'basis',
 'for',
 'a',
 'system',
 'of',
 'government',
 '.',
 'Supreme',
 'executive',
 'power',
 'derives',
 'from',
 'a',
 'mandate',
 'from',
 'the',
 'mass',
 ',',
 'not',
 'from',
 'some',
 'farcical',
 'aquatic',
 'ceremony',
 '.']