#                      Text Classification for NLP with NLTK

### Sequence Classification

In [2]:
from __future__ import division
import re

In [3]:
import nltk
from nltk.corpus import brown

In [20]:
def pos_features(sentence, i, history):
    """Build the feature dict describing the token at position ``i``.

    Args:
        sentence: Indexable sequence of tokens; ``sentence[i]`` is the token
            being described and ``sentence[i-1]`` its left neighbour.
        i: Position of the token within ``sentence``.
        history: Indexable sequence whose element ``i-1`` is reported as the
            previous tag. It is not consulted when ``i == 0``.

    Returns:
        A dict with the last one, two and three characters of the token
        (``suffix(1)``..``suffix(3)``), plus ``prev_sentence`` (the preceding
        token) and ``prev_tag`` (the preceding entry of ``history``). Both
        "previous" features are the marker ``'<START>'`` for the first token.
    """
    token = sentence[i]

    # The first token has no left context, so a sentinel stands in for both
    # the previous token and the previous tag.
    if i == 0:
        prev_token = prev_label = '<START>'
    else:
        prev_token = sentence[i - 1]
        prev_label = history[i - 1]

    return {
        'suffix(1)': token[-1:],
        'suffix(2)': token[-2:],
        'suffix(3)': token[-3:],
        'prev_sentence': prev_token,
        'prev_tag': prev_label,
    }


class ConsecutivePosTagger(nltk.TaggerI):
    """Left-to-right POS tagger driven by a Naive Bayes classifier.

    Each token is classified from the features produced by ``pos_features``,
    which include the tag chosen for the preceding token. Tags are therefore
    assigned one at a time, in sentence order, and each decision feeds the
    next one.
    """

    def __init__(self, train_sents):
        """Train the underlying classifier.

        Args:
            train_sents: Iterable of tagged sentences, each a sequence of
                ``(word, tag)`` pairs.
        """
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            # During training the history holds the gold tags seen so far in
            # this sentence.
            history = []
            for i, (_word, tag) in enumerate(tagged_sent):
                feature_set = pos_features(untagged_sent, i, history)
                train_set.append((feature_set, tag))
                history.append(tag)
        self.classifier = nltk.NaiveBayesClassifier.train(train_set)

    def tag(self, sentence):
        """Tag a single sentence.

        Args:
            sentence: Sequence of tokens to tag.

        Returns:
            A list of ``(token, tag)`` pairs, one per token, in order.
        """
        # At prediction time the history holds the tags this tagger has
        # already predicted for the earlier tokens of the sentence.
        history = []
        for i, _word in enumerate(sentence):
            feature_set = pos_features(sentence, i, history)
            tag = self.classifier.classify(feature_set)
            history.append(tag)
        # zip() is a one-shot iterator on Python 3; materialise it so callers
        # receive a real list they can index, measure and iterate repeatedly
        # (on Python 2 this is equivalent to the plain zip()).
        return list(zip(sentence, history))

In [22]:
# Load the tagged sentences of the Brown corpus 'news' category.
tagged_sents = brown.tagged_sents(categories='news')
# Size of the held-out portion: 20% of the sentences, truncated to an int.
size = int(len(tagged_sents) * 0.2)
# The first `size` sentences are held out for testing; the rest are used for training.
train_sets, test_sets = tagged_sents[size:], tagged_sents[:size]
# Constructing the tagger trains its classifier on the training split.
tagger = ConsecutivePosTagger(train_sets)
# Score the tagger on the held-out sentences and print the result
# (evaluate() is not defined in this file; presumably inherited from nltk.TaggerI).
print(tagger.evaluate(test_sets))

In [8]:
# Untagged sentences (lists of word strings) from the Brown 'news' category.
ttg = brown.sents(categories='news')

In [15]:
# Inspect the features for the third word of the first sentence.
# NOTE(review): the third argument is the word ttg[0][2] itself (a string), not a
# list of previously assigned tags, so history[i-1] picks a single character of
# that word -- which is why the output shows 'prev_tag': 'o'. Pass a list of tags
# for the preceding tokens to get a meaningful 'prev_tag'.
print(pos_features(ttg[0], 2, ttg[0][2]))

{'prev_sentence': 'Fulton', 'prev_tag': 'o', 'suffix(2)': 'ty', 'suffix(1)': 'y', 'suffix(3)': 'nty'}


In [16]:
# Bare expression: the notebook echoes the value of `ttg` as the cell output.
ttg

[['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.'], ['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.'], ...]