In [3]:
import nltk
from nltk.corpus import treebank
from nltk.grammar import Nonterminal

# Parsed sentences of the NLTK treebank corpus
tree_sents = treebank.parsed_sents()

# Split point: the leading train_ratio share of the sentences is used for
# training, the remaining tail for testing
train_ratio = 0.9
train_size = int(train_ratio * len(tree_sents))
tree_sents_train = tree_sents[:train_size]
tree_sents_test = tree_sents[train_size:]

def get_productions(tree_sents):
    """Return the productions of every tree in ``tree_sents`` as one flat list.

    Each element of ``tree_sents`` must provide a ``productions()`` method.
    Results are concatenated tree by tree, in order; duplicates are kept.
    """
    collected = []
    for tree in tree_sents:
        collected.extend(tree.productions())
    return collected

# Flatten each split into its own list of productions
productions_train, productions_test = (
    get_productions(split) for split in (tree_sents_train, tree_sents_test)
)

In [33]:
# Display the productions of the first parsed sentence, lexical ones included.
tree_sents[0].productions()

[S -> NP-SBJ VP .,
 NP-SBJ -> NP , ADJP ,,
 NP -> NNP NNP,
 NNP -> 'Pierre',
 NNP -> 'Vinken',
 , -> ',',
 ADJP -> NP JJ,
 NP -> CD NNS,
 CD -> '61',
 NNS -> 'years',
 JJ -> 'old',
 , -> ',',
 VP -> MD VP,
 MD -> 'will',
 VP -> VB NP PP-CLR NP-TMP,
 VB -> 'join',
 NP -> DT NN,
 DT -> 'the',
 NN -> 'board',
 PP-CLR -> IN NP,
 IN -> 'as',
 NP -> DT JJ NN,
 DT -> 'a',
 JJ -> 'nonexecutive',
 NN -> 'director',
 NP-TMP -> NNP CD,
 NNP -> 'Nov.',
 CD -> '29',
 . -> '.']

In [6]:
# Category symbols the predicates below compare against
NP = Nonterminal('NP')
VP = Nonterminal('VP')


# define useful predicates for filtering productions

def is_non_terminal(p):
    """Return ``p.is_nonlexical()`` for the production ``p``."""
    return p.is_nonlexical()


def is_np(p):
    """True when the left-hand side of ``p`` is NP."""
    return p.lhs() == NP


def is_vp(p):
    """True when the left-hand side of ``p`` is VP."""
    return p.lhs() == VP


def is_rec_np(p):
    """True for an NP production that also has NP on its right-hand side."""
    if not is_np(p):
        return False
    return NP in p.rhs()


def our_filter(p):
    """Keep ``p`` only if it is non-lexical, not a recursive NP rule and not a VP rule."""
    if not is_non_terminal(p):
        return False
    return not (is_rec_np(p) or is_vp(p))

# Keep only the productions accepted by our_filter, in both splits
productions_train = [prod for prod in productions_train if our_filter(prod)]
productions_test = [prod for prod in productions_test if our_filter(prod)]

In [22]:
# Keep the first training production in `p` for ad-hoc inspection.
# NOTE(review): no later visible cell reads `p` - confirm it is still needed.
p = productions_train[0]

In [7]:
# compute coverage: the share of all distinct productions (train or test)
# that already occur in the training split
unique_productions_train = set(productions_train)
unique_productions_test = set(productions_test)
unique_productions = unique_productions_train.union(unique_productions_test)

coverage = float(len(unique_productions_train)) / len(unique_productions)
coverage

0.9529152105940776

In [8]:
from nltk.grammar import PCFG, induce_pcfg
# Induce a PCFG with start symbol S from productions_train
treebank_prob_grammar = induce_pcfg(Nonterminal('S'), productions_train)

# Rank the grammar's productions from highest to lowest probability
sorted_prod = list(treebank_prob_grammar.productions())
sorted_prod.sort(key=lambda rule: rule.prob(), reverse=True)

In [19]:
def no_check_coverage_at_all(tokens):
    """Accept any token sequence: a do-nothing replacement for a coverage check.

    tokens: the token sequence that would have been checked; it is ignored.
    Returns None and never raises.
    """
    pass


# Swap the grammar's own check_coverage for the no-op above.
# NOTE(review): presumably this stops a parser built on this grammar from
# rejecting POS-tag sequences the grammar has no lexical rules for - confirm.
# The former smoke-test call on tags_as_sents_train[0] was dropped: that name is
# only defined in a later cell, so the call raised NameError on a fresh kernel,
# and it exercised nothing because the patched method does no work.
treebank_prob_grammar.check_coverage = no_check_coverage_at_all

In [20]:
from nltk.parse.viterbi import ViterbiParser
# Build a Viterbi parser on top of the induced probabilistic grammar.
parser = ViterbiParser(treebank_prob_grammar)

In [21]:
# POS-tagged sentences of the same corpus, cut at the train_size boundary
# that was used for the parsed sentences
tagged_sents = treebank.tagged_sents()
tagged_sents_train = tagged_sents[:train_size]
tagged_sents_test = tagged_sents[train_size:]

In [115]:
def tagged_sent_to_tags(tagged_sent):
    """Return the tags of a tagged sentence as a tuple.

    tagged_sent: iterable of (word, tag) pairs.
    Returns a tuple holding the second item of every pair, in sentence order.
    An empty sentence yields ``()``; the earlier ``tuple(zip(*...))[1]`` form
    raised IndexError in that case.
    """
    return tuple(pair[1] for pair in tagged_sent)
# Reduce every tagged sentence of both splits to its tag sequence
tags_as_sents_train = [tagged_sent_to_tags(tagged_sent) for tagged_sent in tagged_sents_train]
tags_as_sents_test = [tagged_sent_to_tags(tagged_sent) for tagged_sent in tagged_sents_test]

In [114]:
# Parse every test tag sequence, counting the sequences that parse and the
# ones for which the parser raises ValueError
good_cnt = bad_cnt = 0
predictions = []
for sent_idx, tag_seq in enumerate(tags_as_sents_test):
    print(sent_idx)
    try:
        parses = list(parser.parse(tag_seq))
    except ValueError:
        bad_cnt += 1
    else:
        # only sequences that parsed contribute an entry to predictions
        predictions.append(parses)
        good_cnt += 1

SyntaxError: invalid syntax (<ipython-input-114-348d8b944a39>, line 1)

In [110]:
# Chunked sentences, cut at the train_size boundary.
# NOTE(review): train_size was derived from len(treebank.parsed_sents()); confirm
# treebank_chunk holds the same number of sentences, otherwise this split does
# not follow train_ratio.
sents = nltk.corpus.treebank_chunk.chunked_sents()
sents_train = sents[:train_size]
sents_test = sents[train_size:]

In [80]:
class ChunkParser(nltk.ChunkParserI):
    """Chunker that predicts a chunk tag for each POS tag with a trigram tagger."""

    def __init__(self, train_sents):
        """Train the tagger on (POS tag, chunk tag) sequences.

        train_sents: chunk trees; each is converted with
        ``nltk.chunk.tree2conlltags`` and its words are discarded.
        """
        train_data = []
        for chunk_tree in train_sents:
            conll_triples = nltk.chunk.tree2conlltags(chunk_tree)
            train_data.append([(pos, iob) for _word, pos, iob in conll_triples])
        self.tagger = nltk.TrigramTagger(train_data)

    def parse(self, sentence):
        """Chunk a sentence given as a list of (word, POS tag) pairs.

        Returns the tree built by ``nltk.chunk.conlltags2tree`` from the
        words, their POS tags and the chunk tags predicted by the tagger.
        """
        pos_sequence = [pos for _word, pos in sentence]
        iob_sequence = [iob for _pos, iob in self.tagger.tag(pos_sequence)]
        conll_triples = [
            (word, pos, iob)
            for (word, pos), iob in zip(sentence, iob_sequence)
        ]
        return nltk.chunk.conlltags2tree(conll_triples)

In [111]:
# Train the chunker on the training split of the chunked sentences.
chunker = ChunkParser(sents_train)

ValueError: Tree is too deeply nested to be printed in CoNLL format

In [99]:
# Use the (word, POS tag) leaves of the first held-out sentence as a sample input.
seq = sents_test[0].leaves()
seq

[(u'The', u'DT'),
 (u'company', u'NN'),
 (u'forecast', u'VBD'),
 (u'that', u'IN'),
 (u'fourth-quarter', u'NN'),
 (u'income', u'NN'),
 (u'from', u'IN'),
 (u'continuing', u'VBG'),
 (u'operations', u'NNS'),
 (u'would', u'MD'),
 (u'be', u'VB'),
 (u'``', u'``'),
 (u'significantly', u'RB'),
 (u"''", u"''"),
 (u'lower', u'JJR'),
 (u'than', u'IN'),
 (u'a', u'DT'),
 (u'year', u'NN'),
 (u'earlier', u'JJR'),
 (u'.', u'.')]

In [100]:
# Chunk the sample sentence; the resulting tree is kept in `res`.
res = chunker.parse(seq)

In [106]:
# Score the chunker on the held-out chunked sentences. The parenthesised form
# prints the same text under Python 2 and also runs on Python 3.
print(chunker.evaluate(sents_test))

ChunkParse score:
    IOB Accuracy:  97.0%
    Precision:     91.1%
    Recall:        93.7%
    F-Measure:     92.4%


In [109]:
# IPython help query (shows the docstring of conlltags2tree); an interactive
# leftover that is not part of the analysis.
nltk.chunk.conlltags2tree?