# Natural Language Toolkit (NLTK)
NLTK -- the Natural Language Toolkit -- is a suite of open source Python modules, data sets and tutorials supporting research and development in Natural Language Processing.

http://www.nltk.org/ <br>
https://github.com/nltk/nltk

In [2]:
import nltk
from nltk.corpus import brown

# Corpus Readers

In [6]:
# Fetch the first sentence of the Brown corpus once; it is a list of tokens.
first_sentence = brown.sents()[0]

# Show the raw token list, then the same tokens joined with single spaces.
# A parenthesised single-argument print behaves identically on Python 2 and 3.
print(first_sentence)
print(" ".join(first_sentence))

[u'The', u'Fulton', u'County', u'Grand', u'Jury', u'said', u'Friday', u'an', u'investigation', u'of', u"Atlanta's", u'recent', u'primary', u'election', u'produced', u'``', u'no', u'evidence', u"''", u'that', u'any', u'irregularities', u'took', u'place', u'.']
The Fulton County Grand Jury said Friday an investigation of Atlanta's recent primary election produced `` no evidence '' that any irregularities took place .


In [8]:
"""
word_tokenize(s)
Tokenize a string
http://www.nltk.org/api/nltk.tokenize.html
"""

# Sample text containing embedded newlines, a price and double spaces.
sentence = '''Good muffins cost $3.88\nin New York.  Please buy me two of them.\n\nThanks.'''
tokens = nltk.word_tokenize(sentence)

# A parenthesised single-argument print behaves identically on Python 2 and 3.
print(tokens)

['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']


# nltk.tag.util

In [2]:
# http://www.nltk.org/_modules/nltk/tag/util.html
from nltk.tag.util import untag  # Untags a tagged sentence

In [5]:
"""
untag()
Given a tagged sentence, return an untagged version of that sentence.  I.e., return a list containing the first element
of each tuple in *tagged_sentence*.
"""

# Drop the tags from the first tagged Brown sentence and print the remaining
# words space-separated. The parenthesised single-argument print behaves
# identically on Python 2 and 3.
print(" ".join(untag(brown.tagged_sents()[0])))

The Fulton County Grand Jury said Friday an investigation of Atlanta's recent primary election produced `` no evidence '' that any irregularities took place .


# nltk.tree

In [3]:
# http://www.nltk.org/_modules/nltk/tree.html
from nltk.tree import Tree

In [5]:
# Parse tree built below for the phrase "Book that flight":
#
#          VP
#        /    \
#      VB      NP
#      |      /  \
#    Book   DT    NN
#           |     |
#         that  flight

# Every print below passes one pre-formatted string in parentheses, so the
# output is the same on Python 2 and Python 3.

# Tree(label, children)
# Constructs a new tree with the specified label and list of children.
tree = Tree("VP", [Tree("VB", ["Book"]), Tree("NP", [Tree("DT", ["that"]), Tree("NN", ["flight"])])])
print("tree: {}".format(tree))

# Iterating over a tree visits its immediate children.
for child in tree:
    print("child: {}".format(child))

# height()
# Return the height of the tree.
print("tree.height(), tree[0].height(), tree[1].height(): {} {} {}".format(
    tree.height(), tree[0].height(), tree[1].height()))

# label()
# Return the node label of the tree.
print("tree.label(), tree[0].label(), tree[1].label(): {} {} {}".format(
    tree.label(), tree[0].label(), tree[1].label()))

# leaves()
# Return the leaves of the tree.
print("tree.leaves(): {}".format(tree.leaves()))

# subtrees(self, filter=None)
# Generate all the subtrees of this tree, optionally restricted to trees
# matching the filter function.
for subtree in tree.subtrees():
    print("subtree: {}".format(subtree))

# productions()
# Generate the productions that correspond to the non-terminal nodes of the
# tree. For each subtree of the form (P: C1 C2 ... Cn) this produces a
# production of the form P -> C1 C2 ... Cn.
for production in tree.productions():
    print("production: {}".format(production))

tree: (VP (VB Book) (NP (DT that) (NN flight)))
child: (VB Book)
child: (NP (DT that) (NN flight))
tree.height(), tree[0].height(), tree[1].height(): 4 2 3
tree.label(), tree[0].label(), tree[1].label(): VP VB NP
tree.leaves(): ['Book', 'that', 'flight']
subtree: (VP (VB Book) (NP (DT that) (NN flight)))
subtree: (VB Book)
subtree: (NP (DT that) (NN flight))
subtree: (DT that)
subtree: (NN flight)
production: VP -> VB NP
production: VB -> 'Book'
production: NP -> DT NN
production: DT -> 'that'
production: NN -> 'flight'
