## 3.1 Accessing text from the web and from disk

### Dealing with HTML

In [10]:
# Notebook-wide imports: standard library first, then NLTK.
import pprint
import re
from urllib import request

import nltk
# The tokenizer is called by its bare name in later cells, so the imported
# name must be exactly `word_tokenize`.
from nltk import word_tokenize

In [11]:
# Download the page and decode the response bytes as UTF-8 text.
url = "http://news.bbc.co.uk/2/hi/health/2284783.stm"
# The `with` block closes the HTTP response as soon as the body has been read,
# instead of leaving the connection open until garbage collection.
with request.urlopen(url) as response:
    html = response.read().decode('utf8')
# Show the first 60 characters as a quick check of what was fetched.
html[:60]

'<!doctype html public "-//W3C//DTD HTML 4.0 Transitional//EN'

In [12]:
from bs4 import BeautifulSoup
# Strip the markup and keep only the page's text content.
# The parser is named explicitly: without it BeautifulSoup picks whichever
# parser happens to be installed (and warns), so the extracted text -- and
# therefore any fixed token offsets used afterwards -- could differ per machine.
raw = BeautifulSoup(html, 'html.parser').get_text()
# Split the extracted text into a list of tokens.
tokens = word_tokenize(raw)

In [14]:
# Narrow `tokens` to indices 110..389 of the tokenised page.
# (Presumably these hand-picked offsets isolate the article body -- confirm
# them if the page content or the HTML parser changes.)
# NOTE: `tokens` is rebound to a slice of itself, so running this cell a
# second time slices the already-sliced list and changes the result.
tokens = tokens[110:390]
# Wrap the token list in an nltk.Text object.
text = nltk.Text(tokens)
# Print every occurrence of 'gene' together with its surrounding context.
text.concordance('gene')

Displaying 5 of 5 matches:
they say too few people now carry the gene for blondes to last beyond the next t
 blonde hair is caused by a recessive gene . In order for a child to have blonde
o have blonde hair , it must have the gene on both sides of the family in the gr
here is a disadvantage of having that gene or by chance . They do n't disappear 
ndes would disappear is if having the gene was a disadvantage and I do not think


### The NLP pipeline

In [15]:
# Read the whole file into a single string.
# The `with` block guarantees the file handle is closed once the text has
# been read, rather than leaving it open for the life of the kernel.
with open('luckyyou.txt') as source_file:
    raw = source_file.read()
type(raw)

str

In [16]:
# Pipeline step: raw string -> list of tokens, using NLTK's tokenizer.
tokens = nltk.word_tokenize(raw)
type(tokens)

list

In [18]:
# Pipeline step: lowercase every token so later comparisons ignore case.
words = list(map(str.lower, tokens))
type(words)

list

In [19]:
# Pipeline step: collapse the words to their distinct values, then order them.
vocab = list(set(words))
vocab.sort()
type(vocab)

list

## 3.4 Regular expressions for detecting word patterns

In [22]:
# Keep only the entries of the 'en' word list for which str.islower() is true.
wordlist = list(filter(str.islower, nltk.corpus.words.words('en')))

In [24]:
# Eight-character words whose third character is 'j' and sixth is 't';
# each '.' stands for any single character.
list(filter(re.compile('^..j..t..$').search, wordlist))

['abjectly',
 'adjuster',
 'dejected',
 'dejectly',
 'injector',
 'majestic',
 'objectee',
 'objector',
 'rejecter',
 'rejector',
 'unjilted',
 'unjolted',
 'unjustly']

In [25]:
# Four-character words built by taking one letter from each bracketed set,
# in order: [ghi], then [mno], then [jlk], then [def].
list(filter(re.compile('^[ghi][mno][jlk][def]$').search, wordlist))

['gold', 'golf', 'hold', 'hole']

## 3.5 Useful applications of regular expressions

### Extracting word pieces

In [26]:
word = 'supercalifragilisticexpialidocious'
# List every individual vowel in the word, in order of appearance.
re.compile(r'[aeiou]').findall(word)

['u',
 'e',
 'a',
 'i',
 'a',
 'i',
 'i',
 'i',
 'e',
 'i',
 'a',
 'i',
 'o',
 'i',
 'o',
 'u']

In [27]:
# Count the vowel matches in `word` by tallying them one at a time.
sum(1 for match in re.finditer(r'[aeiou]', word))

16

In [28]:
# Distinct treebank tokens, in sorted order.
wsj = sorted(set(nltk.corpus.treebank.words()))
# Tally every run of two or more consecutive vowels found in those tokens.
fd = nltk.FreqDist(
    vowel_run
    for token in wsj
    for vowel_run in re.findall(r'[aeiou]{2,}', token)
)
# Show the twelve most frequent vowel runs with their counts.
fd.most_common(12)

[('io', 549),
 ('ea', 476),
 ('ie', 331),
 ('ou', 329),
 ('ai', 261),
 ('ia', 253),
 ('ee', 217),
 ('oo', 174),
 ('ua', 109),
 ('au', 106),
 ('ue', 105),
 ('ui', 95)]