In [1]:
import nltk 

In [None]:
# Get the corpora and models used by the NLTK book examples.
# The "book" collection is requested by name so the cell runs unattended:
# a bare nltk.download() opens the interactive downloader and waits for input,
# which stalls "Restart Kernel -> Run All".
# NOTE(review): newer NLTK releases may need extra resources (e.g. punkt_tab) - confirm.
nltk.download('book')

In [2]:
from nltk.book import *

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


In [3]:
# text7 is the text whose vocabulary is counted in the following cells;
# evaluating it on its own just shows its Text repr (the corpus title)
text7

<Text: Wall Street Journal>

In [4]:
# sent7 is a sample sentence that is already split into tokens:
# show the tokens, then how many of them there are
print(sent7, len(sent7), sep='\n')
# vocabulary size of text7 = number of distinct tokens
# (punctuation and numbers count as tokens too)
len(set(text7))

['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.']
18


12408

In [6]:
list(set(text7))[:10]  # peek at 10 distinct tokens; a set is unordered, so which 10 show up can differ between runs
# A u'...' prefix in a string repr marks a unicode string literal (shown by Python 2 only);
# it does not mean UTF-8. The output of this cell has no such prefix.

['relieve',
 'mortgage-based',
 '143.08',
 'Mary',
 '*T*-111',
 'breathed',
 'showing',
 'Duluth',
 'Barrels',
 'investor-relations']

In [17]:
# frequency of words
dist = FreqDist(text7)  # frequency distribution: token -> number of occurrences in text7
print(len(dist))  # number of distinct tokens (same value as len(set(text7)))
vocab1 = dist.keys()  # the distinct tokens; the next cell filters over this name
dist['four']  # find out how many times 'four' occurs

12408


20

In [18]:
# Words that are both long (more than 5 characters) and frequent (more than 100 occurrences).
# The length condition is needed to skip short, simple words such as "a", "the" or "then",
# which are always very frequent and would otherwise dominate the list.
freqwords = [
    token
    for token in vocab1
    if dist[token] > 100 and len(token) > 5
]
freqwords

['billion',
 'company',
 'president',
 'because',
 'market',
 'million',
 'shares',
 'trading',
 'program']

In [19]:
# normalization and stemming
# normalization: fold the text to lower case, then cut it at every single space
input1 = 'List listed lists listing listings'
lowercased = input1.lower()
words1 = lowercased.split(' ')
words1

['list', 'listed', 'lists', 'listing', 'listings']

In [20]:
# stemming
porter = nltk.PorterStemmer()  # the Porter algorithm, a popular stemmer
list(map(porter.stem, words1))
# Stemming is not always the right thing to do: "listing" and "list" have
# different meanings, yet both are reduced to "list".
# Merging "list" and "lists", on the other hand, is fine.

['list', 'list', 'list', 'list', 'list']

In [21]:
# lemmatization: start by loading a real text and checking what the Porter
# stemmer does to its first 20 words (the next cell lemmatizes the same words)
udhr = nltk.corpus.udhr.words('English-Latin1')  # Universal Declaration of Human Rights
[porter.stem(t) for t in udhr[:20]]


['univers',
 'declar',
 'of',
 'human',
 'right',
 'preambl',
 'wherea',
 'recognit',
 'of',
 'the',
 'inher',
 'digniti',
 'and',
 'of',
 'the',
 'equal',
 'and',
 'inalien',
 'right',
 'of']

In [22]:
# Lemmatize the first 20 words of the UDHR text with the WordNet lemmatizer
WNlemma = nltk.WordNetLemmatizer()
list(map(WNlemma.lemmatize, udhr[:20]))

['Universal',
 'Declaration',
 'of',
 'Human',
 'Rights',
 'Preamble',
 'Whereas',
 'recognition',
 'of',
 'the',
 'inherent',
 'dignity',
 'and',
 'of',
 'the',
 'equal',
 'and',
 'inalienable',
 'right',
 'of']

In [24]:
# tokenization, naive version: cut the sentence at every single space
# (punctuation stays glued to the neighbouring word)
text11 = "Children shouldn't drink a sugary drink before bed."
text11.split(sep=' ')

['Children', "shouldn't", 'drink', 'a', 'sugary', 'drink', 'before', 'bed.']

In [25]:
nltk.word_tokenize(text11)  # the negation "n't" is important, so it is useful to get it as a token of its own

['Children',
 'should',
 "n't",
 'drink',
 'a',
 'sugary',
 'drink',
 'before',
 'bed',
 '.']

In [26]:
# sentence splitting
# The sample contains periods that do not end a sentence ("U.S.", "$2.99").
text12 = (
    "This is the first sentence. "
    "A gallon of milk in the U.S. costs $2.99. "
    "Is this the third sentence? "
    "Yes, it is!"
)
sentences = nltk.sent_tokenize(text12)
len(sentences)

4

In [27]:
sentences  # show the individual sentences returned by nltk.sent_tokenize

['This is the first sentence.',
 'A gallon of milk in the U.S. costs $2.99.',
 'Is this the third sentence?',
 'Yes, it is!']

In [28]:
# POS tagging
# Look up the meaning of a Penn Treebank tag; 'MD' is the modal auxiliary tag.
# (nltk is already imported in the first cell, so it is not imported again here.)
nltk.help.upenn_tagset('MD')

MD: modal auxiliary
    can cannot could couldn't dare may might must need ought shall should
    shouldn't will would


In [32]:
# Tag every token of the example sentence with its part of speech.
text11 = "Children shouldn't drink a sugary drink before bed."
text13 = nltk.word_tokenize(text11)  # tokenize first; this splits the negation "n't" into its own token
nltk.pos_tag(text13)  # one (token, tag) pair per token, i.e. the part of speech of every word

[('Children', 'NNP'),
 ('should', 'MD'),
 ("n't", 'RB'),
 ('drink', 'VB'),
 ('a', 'DT'),
 ('sugary', 'JJ'),
 ('drink', 'NN'),
 ('before', 'IN'),
 ('bed', 'NN'),
 ('.', '.')]

In [37]:
# parsing sentence structure
# A toy context-free grammar: a sentence is a noun phrase followed by a verb
# phrase, and a verb phrase is a verb followed by a noun phrase.
grammar = nltk.CFG.fromstring("""
    s -> NP VP
    VP -> V NP
    NP -> 'Alice'|'Bob'
    V -> 'loves'
""")
text15 = nltk.word_tokenize('Alice loves Bob')

# Parse the tokens with a chart parser and print every tree it finds.
parser = nltk.ChartParser(grammar)
trees = parser.parse_all(text15)
for parse_tree in trees:
    print(parse_tree)

(s (NP Alice) (VP (V loves) (NP Bob)))


In [None]:
from nltk.corpus import treebank 

In [39]:
# An ambiguous ("garden-path") sentence: in its intended reading "man" is the verb.
# NOTE(review): the recorded output tags "old" as JJ and "man" as NN, i.e. the
# tagger appears to pick the more common adjective + noun reading - confirm.
text18 = nltk.word_tokenize("The old man the boat")
nltk.pos_tag(text18)

[('The', 'DT'), ('old', 'JJ'), ('man', 'NN'), ('the', 'DT'), ('boat', 'NN')]