## Sequence classification

Consecutive classification (also called greedy sequence classification): find the most likely class label for the first input, then use that answer to help find the best label for the next input. The process is repeated until all inputs have been labelled.

In [3]:
import nltk
from nltk.corpus import brown

In [5]:
def pos_features(sentence, i, history):
    """
    Build the feature dictionary for the word at position ``i``.

    ``sentence`` is an indexable sequence of words and ``history`` holds the
    tags already assigned to the words before position ``i`` (``history[k]``
    is the tag of ``sentence[k]``).

    The returned dict contains:
      * ``suffix1``, ``suffix2``, ``suffix3`` -- the last one, two and three
        characters of the current word;
      * ``prev_word`` and ``prev_tag`` -- the preceding word and its tag, or
        ``'<START>'`` for both when ``i`` is 0.
    """
    word = sentence[i]

    # The first word has no left context, so a sentinel stands in for it.
    if i == 0:
        prev_word = prev_tag = '<START>'
    else:
        prev_word = sentence[i - 1]
        prev_tag = history[i - 1]

    return {
        'suffix1': word[-1:],
        'suffix2': word[-2:],
        'suffix3': word[-3:],
        'prev_word': prev_word,
        'prev_tag': prev_tag,
    }

class ConsecutivePosTagger(nltk.TaggerI):
    """
    Greedy left-to-right part-of-speech tagger.

    Each word is classified using the features produced by ``pos_features``,
    which include the tag chosen for the previous word. A Naive Bayes
    classifier is trained on those features in the constructor.
    """

    def __init__(self, train_sents):
        """
        Train the underlying classifier.

        ``train_sents`` is an iterable of tagged sentences, each a sequence of
        ``(word, tag)`` pairs. During training the gold tags of the preceding
        words are used as the history.
        """
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []
            for i, (_word, tag) in enumerate(tagged_sent):
                feature_set = pos_features(untagged_sent, i, history)
                train_set.append((feature_set, tag))
                history.append(tag)
        self.classifier = nltk.NaiveBayesClassifier.train(train_set)

    def tag(self, sentence):
        """
        Tag ``sentence`` (a sequence of words) from left to right.

        Each predicted tag is appended to the history before the next word is
        classified. Returns a list of ``(word, tag)`` pairs.
        """
        history = []
        for i in range(len(sentence)):
            feature_set = pos_features(sentence, i, history)
            history.append(self.classifier.classify(feature_set))
        # Materialize the pairs: a bare zip object can only be iterated once.
        return list(zip(sentence, history))

In [None]:
# Load the tagged sentences of the 'news' category of the Brown corpus.
tagged_sents = brown.tagged_sents(categories='news')
# Number of sentences in the held-out portion: 10% of the data, truncated.
size = int(len(tagged_sents) * 0.1)
# The first `size` sentences form the test set; the remainder is for training.
train_set, test_set = tagged_sents[size:], tagged_sents[:size]
# Build the tagger from the training sentences.
tagger = ConsecutivePosTagger(train_set)
# Print the score returned by evaluate() on the held-out sentences
# (presumably tagging accuracy -- confirm against the installed NLTK version).
print(tagger.evaluate(test_set))

# SPEECH AND LANGUAGE PROCESSING 3rd Edition