# NLTK Chapter 3 Processing Raw Text by DT

In [1]:
import nltk, re, pprint
from nltk import word_tokenize

## Accessing Text from the Web and from Disk

### Electronic Books

In [2]:
from urllib import request
url = "http://www.gutenberg.org/files/2554/2554-0.txt"
response = request.urlopen(url)
raw = response.read().decode('utf8')
print(type(raw))
print(len(raw))
print(raw[1:74])

<class 'str'>
1176965
The Project Gutenberg EBook of Crime and Punishment, by Fyodor Dostoevsky


In [3]:
url2 = "http://www.gutenberg.org/files/23864/23864-0.txt"
response2 = request.urlopen(url2)
raw2 = response2.read().decode('utf8')
print(type(raw2))
print(len(raw2))
print(raw2[572:600])

<class 'str'>
27317
孫子曰：兵者，國之大事，死生之地，存亡之道，不可不察也。


In [4]:
tokens = word_tokenize(raw)
print(type(tokens))
print(len(tokens))
print(tokens[1:12])

<class 'list'>
257726
['Project', 'Gutenberg', 'EBook', 'of', 'Crime', 'and', 'Punishment', ',', 'by', 'Fyodor', 'Dostoevsky']


In [5]:
text = nltk.Text(tokens)
print(type(text))
print(text[1021:1059])
print(text.collocations())

<class 'nltk.text.Text'>
['CHAPTER', 'I', 'On', 'an', 'exceptionally', 'hot', 'evening', 'early', 'in', 'July', 'a', 'young', 'man', 'came', 'out', 'of', 'the', 'garret', 'in', 'which', 'he', 'lodged', 'in', 'S.', 'Place', 'and', 'walked', 'slowly', ',', 'as', 'though', 'in', 'hesitation', ',', 'towards', 'K.', 'bridge', '.']
Katerina Ivanovna; Pyotr Petrovitch; Pulcheria Alexandrovna; Avdotya
Romanovna; Rodion Romanovitch; Marfa Petrovna; Sofya Semyonovna; old
woman; Project Gutenberg-tm; Porfiry Petrovitch; Amalia Ivanovna;
great deal; young man; Nikodim Fomitch; Ilya Petrovitch; Project
Gutenberg; Andrey Semyonovitch; Hay Market; Dmitri Prokofitch; Good
heavens
None


In [6]:
text2 = nltk.Text(raw2)
print(type(text2))
print(text2[572:600])
print(text2.collocations())

<class 'nltk.text.Text'>
['孫', '子', '曰', '：', '兵', '者', '，', '國', '之', '大', '事', '，', '死', '生', '之', '地', '，', '存', '亡', '之', '道', '，', '不', '可', '不', '察', '也', '。']

None


In [7]:
print(raw.find("PART I"))
print(raw.rfind("End of Project Gutenberg"))
print(raw.find("PART 0"))

5336
1157810
-1


In [8]:
print(raw2.find("孫子"))
print(raw2.rfind("三軍之所恃而動也"))

572
8174


### Dealing with HTML

In [9]:
url = "http://news.bbc.co.uk/2/hi/health/2284783.stm"
html = request.urlopen(url).read().decode('utf8')
print(html[:60])

<!doctype html public "-//W3C//DTD HTML 4.0 Transitional//EN


In [38]:
print(html[:100])

<!doctype html public "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose


In [39]:
from bs4 import BeautifulSoup
raw = BeautifulSoup(html).get_text()
tokens = word_tokenize(raw)
print(tokens[:100])

['BBC', 'NEWS', '|', 'Health', '|', 'Blondes', "'to", 'die', 'out', 'in', '200', "years'", 'NEWS', 'SPORT', 'WEATHER', 'WORLD', 'SERVICE', 'A-Z', 'INDEX', 'SEARCH', 'You', 'are', 'in', ':', 'Health', 'News', 'Front', 'Page', 'Africa', 'Americas', 'Asia-Pacific', 'Europe', 'Middle', 'East', 'South', 'Asia', 'UK', 'Business', 'Entertainment', 'Science/Nature', 'Technology', 'Health', 'Medical', 'notes', '--', '--', '--', '--', '--', '--', '-', 'Talking', 'Point', '--', '--', '--', '--', '--', '--', '-', 'Country', 'Profiles', 'In', 'Depth', '--', '--', '--', '--', '--', '--', '-', 'Programmes', '--', '--', '--', '--', '--', '--', '-', 'SERVICES', 'Daily', 'E-mail', 'News', 'Ticker', 'Mobile/PDAs', '--', '--', '--', '--', '--', '--', '-', 'Text', 'Only', 'Feedback', 'Help', 'EDITIONS', 'Change', 'to', 'UK']




 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "html.parser")

  markup_type=markup_type))


In [12]:
tokens = tokens[110:390]
text = nltk.Text(tokens)
print(text.concordance('gene'))

Displaying 5 of 5 matches:
hey say too few people now carry the gene for blondes to last beyond the next 
blonde hair is caused by a recessive gene . In order for a child to have blond
 have blonde hair , it must have the gene on both sides of the family in the g
ere is a disadvantage of having that gene or by chance . They do n't disappear
des would disappear is if having the gene was a disadvantage and I do not thin
None


In [13]:
text2 = nltk.Text(raw2)
print(text2.concordance('孫子'))

No matches
None


### Processing Search Engine Results

In [14]:
print(text.collocations())

blonde hair; Jonathan Rees; n't disappear; blondes would; blondes may
None


### Processing RSS Feeds

In [40]:
import feedparser
llog = feedparser.parse("http://languagelog.ldc.upenn.edu/nll/?feed=atom")
print(llog['feed']['title'])
print(len(llog.entries))
post = llog.entries[2]
print(post.title)
content = post.content[0].value
print(content[:70])
raw = BeautifulSoup(content).get_text()
print(word_tokenize(raw[:100]))

Language Log
13
Accentuate the negative
<p>A curious case of a forced-choice sentence-completion question on a
['A', 'curious', 'case', 'of', 'a', 'forced-choice', 'sentence-completion', 'question', 'on', 'a', 'ninth-grade', 'exam', 'at', 'a', 'high', 'schoo']




 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "html.parser")

  markup_type=markup_type))


### Reading Local Files

In [16]:
import os
os.listdir('.')

['.ipynb_checkpoints',
 'langconv.py',
 'nltk-ch01.ipynb',
 'nltk-ch02.ipynb',
 'nltk-ch03.ipynb',
 'text_proc.py',
 'text_proc.pyc',
 'zh_wiki.py',
 '__pycache__']

In [17]:
f = open('text_proc.py')
raw = f.read()
print(raw)

def plural(word):
    if word.endswith('y'):
        return word[:-1] + 'ies'
    elif word[-1] in 'sx' or word[-2:] in ['sh', 'ch']:
        return word + 'es'
    elif word.endswith('an'):
        return word[:-2] + 'en'
    else:
        return word + 's'

def lower_word_set(text):
    return set(w.lower() for w in text if w.isalpha())

def diff_word_set(text, ref):
    t = lower_word_set(text)
    r = lower_word_set(ref)
    return t-r



In [18]:
f = open('text_proc.py', 'r')
for line in f:
    print(line.strip())

def plural(word):
if word.endswith('y'):
return word[:-1] + 'ies'
elif word[-1] in 'sx' or word[-2:] in ['sh', 'ch']:
return word + 'es'
elif word.endswith('an'):
return word[:-2] + 'en'
else:
return word + 's'

def lower_word_set(text):
return set(w.lower() for w in text if w.isalpha())

def diff_word_set(text, ref):
t = lower_word_set(text)
r = lower_word_set(ref)
return t-r


In [42]:
path = nltk.data.find('corpora/gutenberg/melville-moby_dick.txt')
raw = open(path, 'r').read()
print(raw[:200])

[Moby Dick by Herman Melville 1851]


ETYMOLOGY.

(Supplied by a Late Consumptive Usher to a Grammar School)

The pale Usher--threadbare in coat, heart, body, and brain; I see him
now.  He was ever du


### Extracting Text from PDF, MSWord and other Binary Formats

### Capturing User Input

In [20]:
s = input("Enter some text: ")
print("You typed", len(word_tokenize(s)), "words.")

Enter some text: hello world
You typed 2 words.


### The NLP Pipeline

In [21]:
raw = open('text_proc.py').read()
print(type(raw))

<class 'str'>


In [22]:
tokens = word_tokenize(raw)
print(type(tokens))
words = [w.lower() for w in tokens]
print(type(words))
vocab = sorted(set(words))
print(type(vocab))

<class 'list'>
<class 'list'>
<class 'list'>


In [23]:
vocab.append('blog')

## Strings: Text Processing at the Lowest Level

### Basic Operations with Strings

In [24]:
monty = 'Monty Python'
print(monty)
circus = "Monty Python's Flying Circus"
print(circus)
circus = 'Monty Python\'s Flying Circus'
print(circus)

Monty Python
Monty Python's Flying Circus
Monty Python's Flying Circus


In [25]:
couplet = "Shall I compare thee to a Summer's day?"\
    "Thou are more lovely and more temperate:"
print(couplet)
couplet = ("Rough winds do shake the darling buds of May,"
    "And Summer's lease hath all too short a date:")
print(couplet)
couplet = """Shall I compare thee to a Summer's day?
    Thou are more lovely and more temperate:"""
print(couplet)
couplet = '''Rough winds do shake the darling buds of May,
    And Summer's lease hath all too short a date:'''
print(couplet)

Shall I compare thee to a Summer's day?Thou are more lovely and more temperate:
Rough winds do shake the darling buds of May,And Summer's lease hath all too short a date:
Shall I compare thee to a Summer's day?
    Thou are more lovely and more temperate:
Rough winds do shake the darling buds of May,
    And Summer's lease hath all too short a date:


In [26]:
print('very' + 'very' + 'very')
print('very' * 3)

veryveryvery
veryveryvery


### Printing Strings

In [27]:
print(monty)
grail = 'Holy Grail'
print(monty + grail)
print(monty, grail)
print(monty, "and the", grail)

Monty Python
Monty PythonHoly Grail
Monty Python Holy Grail
Monty Python and the Holy Grail


### Accessing Individual Characters

In [28]:
print(monty[0])
print(monty[3])
print(monty[5])
print(monty[-1])
print(monty[-7])

M
t
 
n
 


In [29]:
sent = 'colorless green ideas sleep furiously'
for char in sent:
    print(char, end=' ')

c o l o r l e s s   g r e e n   i d e a s   s l e e p   f u r i o u s l y 

In [30]:
from nltk.corpus import gutenberg
raw = gutenberg.raw('melville-moby_dick.txt')
fdist = nltk.FreqDist(ch.lower() for ch in raw if ch.isalpha())
print(fdist.most_common(5))
print([char for (char, count) in fdist.most_common()])

[('e', 117092), ('t', 87996), ('a', 77916), ('o', 69326), ('n', 65617)]
['e', 't', 'a', 'o', 'n', 'i', 's', 'h', 'r', 'l', 'd', 'u', 'm', 'c', 'w', 'f', 'g', 'p', 'b', 'y', 'v', 'k', 'q', 'j', 'x', 'z']


### Accessing Substrings

In [31]:
print(monty[6:10])
print(monty[-12:-7])
print(monty[:5])
print(monty[6:])

Pyth
Monty
Monty
Python


In [32]:
phrase = 'And now for something completely different'
if 'thing' in phrase:
    print('found "thing"')

found "thing"


In [33]:
 monty.find('Python')

6

### More operations on strings

In [43]:
# help(str)

### The Difference between Lists and Strings

In [35]:
query = 'Who knows?'
beatles = ['John', 'Paul', 'George', 'Ringo']
print(query[2])
print(beatles[2])
print(query[:2])
print(beatles[:2])
print(query + " I don't")
print(beatles + ['Brian'])

o
George
Wh
['John', 'Paul']
Who knows? I don't
['John', 'Paul', 'George', 'Ringo', 'Brian']


In [36]:
beatles[0] = "John Lennon"
del beatles[-1]
print(beatles)

['John Lennon', 'Paul', 'George']
