In [1]:
import nltk
import numpy as np

In [2]:
from nltk.corpus import conll2000
# Load the training and test splits of the CoNLL-2000 chunking corpus.
train_sents, test_sents = (
    conll2000.chunked_sents(fileid) for fileid in ('train.txt', 'test.txt')
)

In [3]:
class BigramChunker(nltk.ChunkParserI):
    """Chunk parser that predicts chunk tags from part-of-speech tags.

    An n-gram tagger is trained on (POS tag, chunk tag) pairs, so at parse
    time the POS tags play the role of "words" and the chunk tags play the
    role of "tags".
    """

    def __init__(self, train_sents):
        """Train the underlying tagger.

        Args:
            train_sents: iterable of chunked sentences accepted by
                ``nltk.chunk.tree2conlltags``.
        """
        train_data = [[(pos, chunktag)
                       for _word, pos, chunktag in nltk.chunk.tree2conlltags(sent)]
                      for sent in train_sents]
        # A bare BigramTagger returns None for any (previous chunk tag, POS)
        # context it never saw in training, and because that None becomes the
        # "previous tag" of the next token, the rest of the sentence is tagged
        # None as well. Backing off to a unigram model keeps tagging going;
        # contexts that were seen in training are tagged exactly as before.
        unigram_backoff = nltk.UnigramTagger(train_data)
        self.tagger = nltk.BigramTagger(train_data, backoff=unigram_backoff)

    def parse(self, sentence):
        """Chunk a POS-tagged sentence.

        Args:
            sentence: sequence of (word, pos) pairs.

        Returns:
            The tree built by ``nltk.chunk.conlltags2tree`` from the
            (word, pos, chunktag) triples predicted for ``sentence``.
        """
        pos_tags = [pos for (_word, pos) in sentence]
        tagged_pos_tags = self.tagger.tag(pos_tags)
        # Re-attach each word to its POS tag and the chunk tag predicted for it.
        conlltags = [(word, pos, chunktag)
                     for (word, pos), (_pos, chunktag)
                     in zip(sentence, tagged_pos_tags)]
        return nltk.chunk.conlltags2tree(conlltags)

In [4]:
# Train the chunker on the CoNLL-2000 training split loaded in cell [2]
# (`train_sents` is already defined there, so it is not reloaded here).
bigram_chunker = BigramChunker(train_sents)
# Historical name kept as an alias: get_chunks() looks the chunker up as
# `unigram_chunker`, even though the model is a BigramChunker.
unigram_chunker = bigram_chunker

In [5]:
def get_chunks(sentence, chunker=None):
    """Tokenize, POS-tag and chunk-tag a raw sentence.

    Args:
        sentence: raw text to analyse.
        chunker: object exposing a ``tagger`` attribute whose ``tag`` method
            maps a list of POS tags to (pos, chunktag) pairs. When omitted,
            the notebook-level ``unigram_chunker`` is used.

    Returns:
        A list of (word, pos, chunktag) triples, one per token.
    """
    if chunker is None:
        # Resolved at call time so the default follows the notebook variable.
        chunker = unigram_chunker
    words = nltk.word_tokenize(sentence)
    pos_tags = [pos for _word, pos in nltk.pos_tag(words)]
    # The chunker's tagger treats POS tags as its input tokens.
    tagged = chunker.tagger.tag(pos_tags)
    return [(word, pos, chunktag)
            for word, (pos, chunktag) in zip(words, tagged)]

In [6]:
# Demo: chunk-tag a reminder command; yields one (word, POS, chunk tag) triple per token.
get_chunks('remind me to call mohit tommorow')

[('remind', 'VB', 'B-VP'),
 ('me', 'PRP', 'B-NP'),
 ('to', 'TO', 'B-VP'),
 ('call', 'VB', 'I-VP'),
 ('mohit', 'NN', 'B-NP'),
 ('tommorow', 'NN', 'I-NP')]

In [7]:
# Demo: chunk-tag a timer command that contains a numeric quantity.
get_chunks('set a timer for 10 minuutes')

[('set', 'VB', 'B-VP'),
 ('a', 'DT', 'B-NP'),
 ('timer', 'NN', 'I-NP'),
 ('for', 'IN', 'B-PP'),
 ('10', 'CD', 'B-NP'),
 ('minuutes', 'NNS', 'I-NP')]

In [8]:
# Demo: chunk-tag a longer phrasing of the reminder command.
get_chunks('set a reminder to call mohit tommorow')

[('set', 'VB', 'B-VP'),
 ('a', 'DT', 'B-NP'),
 ('reminder', 'NN', 'I-NP'),
 ('to', 'TO', 'B-VP'),
 ('call', 'VB', 'I-VP'),
 ('mohit', 'NN', 'B-NP'),
 ('tommorow', 'NN', 'I-NP')]