# Import Module

In [None]:
import nltk, random, json, collections, itertools
import nltk.classify.util, nltk.metrics

import operator as op
import numpy as np
import operator as op
import pandas as pd
import matplotlib.pyplot as plt

from __future__ import division
from nltk.corpus import movie_reviews
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures, precision, recall, f_measure
from nltk.probability import FreqDist, ConditionalFreqDist
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.util import mark_negation



# Functions  
A class with several methods, followed by a few utility functions, is defined below.

In [None]:
class SentiClassifier:
    """Sentiment classifier built on a categorized NLTK corpus.

    The default corpus is ``movie_reviews`` from ``nltk.corpus``, which
    contains 1000 negative reviews and 1000 positive reviews.
    """

    def __init__(self, corpus=movie_reviews):
        self._corpus = corpus
        self._lemmatizer = WordNetLemmatizer()

    def preprocess(self, handle_negation=False, lemmatized=False):
        """Load the corpus as lower-cased word lists grouped by category.

        Parameters:
            handle_negation: when true, every document is additionally passed
                through ``mark_negation``.
            lemmatized: when true, every lower-cased word is lemmatized with
                the instance's ``WordNetLemmatizer``.

        Returns:
            A dict mapping each corpus category to a list of documents; every
            document is a list of (optionally lemmatized) words.
        """
        corpus = self._corpus

        if lemmatized:
            lemmatize = self._lemmatizer.lemmatize

            def normalize(word):
                return lemmatize(word.lower())
        else:
            def normalize(word):
                return word.lower()

        # A comprehension is used rather than map(): it was measured to be
        # quicker here by the original author.
        documents = {
            cat: [[normalize(word) for word in corpus.words(fileids=[fileid])]
                  for fileid in corpus.fileids(cat)]
            for cat in corpus.categories()
        }
        if handle_negation:
            documents = {cat: [mark_negation(document) for document in docs]
                         for cat, docs in documents.items()}
        return documents

    def choose_features(self, documents, toprank_n=(False, None),
                        score_fn=BigramAssocMeasures.likelihood_ratio):
        """Select the set of words to be used as features.

        Parameters:
            documents: dict of category -> list of documents (word lists), as
                returned by ``preprocess``.
            toprank_n: a ``(toprank, n)`` pair, passed positionally.
                If ``toprank`` is false, the first ``n`` words of the
                concatenated corpus are used (all words when ``n`` is None).
                If ``toprank`` is true, every word is scored ahead of training
                and the ``n`` highest-scoring words are kept.
            score_fn: statistical association measure used for the scoring;
                see the documentation of ``BigramAssocMeasures`` for details.

        Returns:
            A set of feature words.
        """
        toprank, n = toprank_n
        cats = list(documents)
        # Flatten every category's documents into one word list per category.
        words = {cat: list(itertools.chain.from_iterable(documents[cat]))
                 for cat in cats}

        if not toprank:
            # chain() instead of op.add(*...) so that any number of
            # categories is supported, not just two.
            all_words = list(itertools.chain.from_iterable(words.values()))
            return set(all_words[:n])

        word_freq = FreqDist(itertools.chain.from_iterable(words.values()))
        word_label_freq = ConditionalFreqDist()
        for cat in cats:
            word_label_freq[cat] = FreqDist(words[cat])

        num_words = {cat: word_label_freq[cat].N() for cat in cats}
        total_words = sum(num_words.values())  # loop-invariant, computed once

        # A word's score is the sum of its association scores with each category.
        scores = {}
        for word, freq in word_freq.items():
            scores[word] = sum(
                score_fn(word_label_freq[cat][word], (freq, num_words[cat]), total_words)
                for cat in cats)

        ranked = sorted(scores.items(), key=op.itemgetter(1), reverse=True)
        return set(word for word, _ in ranked[:n])

    def extract_features(self, documents, features, unif_len=False, fuse=True):
        """Turn documents into labelled feature dicts (unigram presence model).

        The feature name is the word itself.

        Parameters:
            documents: dict of category -> list of documents (word lists).
            features: collection of feature words; a set gives fast
                membership tests.
            unif_len: when true, every document gets one entry per feature
                word (True if the word occurs in the document, False
                otherwise), so all feature dicts have the same length.
                When false, only the feature words that occur in the document
                are stored, each with value True. The original author reports
                that performance is usually worse with ``unif_len`` true.
            fuse: when true, return one flat list of ``(featuredict, category)``
                pairs covering all categories; when false, return a dict of
                category -> list of such pairs. Which one is needed depends on
                the evaluation method used afterwards.

        Returns:
            A list or a dict of ``(featuredict, category)`` pairs, see ``fuse``.
        """
        featuresets = {}
        for cat, docs in documents.items():
            labelled = []
            for document in docs:
                # Build the word set once per document, not once per feature word.
                present = set(document)
                if unif_len:
                    featvec = {word: word in present for word in features}
                else:
                    featvec = {word: True for word in present if word in features}
                labelled.append((featvec, cat))
            featuresets[cat] = labelled

        if fuse:
            return [pair for labelled in featuresets.values() for pair in labelled]
        return featuresets

    def train(self, trainer, trainsets):
        """Return the classifier produced by ``trainer.train(trainsets)``."""
        return trainer.train(trainsets)

    def evaluation(self, classifier, testsets):
        """Evaluate a classifier on fused feature sets.

        Unlike ``evaluation_sepdocs``, this evaluates the test documents as a
        whole.

        Parameters:
            classifier: object providing ``classify_many``.
            testsets: list of ``(featuredict, label)`` pairs.

        Returns:
            The confusion matrix computed by ``confusion_matrix``.
        """
        fvects, labels = zip(*testsets)
        predvals = classifier.classify_many(fvects)
        return confusion_matrix(np.array(predvals), np.array(labels))

    def cross_validation(self, trainer, featuresets, k_folds=5):
        """Run k-fold cross validation and return the summed confusion matrix.

        Parameters:
            trainer: classification model exposing ``train(trainsets)``.
            featuresets: list containing all ``(featuredict, label)`` pairs.
            k_folds: number of folds.

        Returns:
            The element-wise sum of the per-fold 2x2 confusion matrices.

        Note:
            Each fold holds ``len(featuresets) // k_folds`` items, so when the
            length is not a multiple of ``k_folds`` the trailing remainder is
            never used for testing.
        """
        conf_mats = np.zeros([k_folds, 2, 2])
        test_size = len(featuresets) // k_folds
        for i in range(k_folds):
            s, e = i * test_size, (i + 1) * test_size
            testsets = featuresets[s:e]
            trainsets = cutlist(featuresets, s, e)
            # Train with the trainer that was passed in, through this instance,
            # instead of relying on notebook globals.
            classifier = self.train(trainer, trainsets)
            conf_mats[i] = self.evaluation(classifier, testsets)
        return conf_mats.sum(axis=0)

    def evaluation_sepdocs(self, classifier, testsets):
        """Evaluate a classifier separately for every category.

        Parameters:
            classifier: object providing ``classify``.
            testsets: sequence of ``(featuredict, label)`` pairs.

        Returns:
            A DataFrame indexed by the corpus categories with the columns
            'Accuracy', 'Precision', 'Recall' and 'F-Measure'. Accuracy is the
            overall accuracy and is therefore identical in every row; the
            other columns are computed per category.
        """
        reference = collections.defaultdict(set)
        test = collections.defaultdict(set)

        for i, (featvec, trueval) in enumerate(testsets):
            reference[trueval].add(i)
            test[classifier.classify(featvec)].add(i)

        # Match predictions to references by label, not by dict position:
        # the two dicts need not iterate in the same order, and a label that
        # is never predicted is missing from `test` altogether.
        correct = sum(len(reference[label] & test[label]) for label in list(reference))
        accuracy = correct / float(len(testsets))

        performance = pd.DataFrame(data=.0, index=self._corpus.categories(),
                                   columns=['Accuracy', 'Precision', 'Recall', 'F-Measure'])
        for idx in performance.index:
            performance.loc[idx, 'Accuracy'] = accuracy
            performance.loc[idx, 'Precision'] = precision(reference[idx], test[idx])
            performance.loc[idx, 'Recall'] = recall(reference[idx], test[idx])
            performance.loc[idx, 'F-Measure'] = f_measure(reference[idx], test[idx])

        return performance

######################
# utility functions  #
######################



def metrics(conf_mat):
    """
    Summarise a 2x2 confusion matrix (rows: true class, columns: predicted
    class, class order pos, neg).

    return a tuple: (accuracy, details)
    accuracy is (tp + tn) / (tp + fn + fp + tn)
    details maps 'pos', 'neg' and 'avg' to an array (precision, recall, f-measure):
        precision = correct predictions of the class / all predictions of the class
        recall    = correct predictions of the class / all true members of the class
        f-measure = 2 * r * p / (r + p), i.e. 1.0 / (alpha / p + (1-alpha) / r)
                    with alpha = 0.5
    'avg' is the element-wise mean of the 'pos' and 'neg' arrays.
    """
    overall = conf_mat.diagonal().sum() / conf_mat.sum()

    per_class = {}
    for idx, name in enumerate(('pos', 'neg')):
        hits = conf_mat[idx, idx]
        prec = hits / conf_mat[:, idx].sum()   # column total: predicted as this class
        rec = hits / conf_mat[idx, :].sum()    # row total: truly of this class
        per_class[name] = np.array((prec, rec, 2 * (rec * prec) / (rec + prec)))

    per_class['avg'] = (per_class['pos'] + per_class['neg']) / 2
    return overall, per_class
    
def confusion_matrix(predvals, labels):
    """
    Build the 2x2 confusion matrix for the classes 'pos' and 'neg'.

    predvals: predicted labels, labels: true labels (same length). Both may
    be numpy arrays or plain sequences; they are converted to arrays so the
    element-wise comparisons below also work for lists and tuples.

                predicted values
             ------------------
            |     pos  |   neg |
      true  |----------------- |
    values  |pos| tp   |  fn   |
            |------------------|
            |neg| fp   |  tn   |
            --------------------
    """
    predvals = np.asarray(predvals)
    labels = np.asarray(labels)

    classes = (u'pos', u'neg')
    conf_mat = np.zeros((2, 2))
    # Row = true class, column = predicted class, giving [[tp, fn], [fp, tn]].
    for row, true_cls in enumerate(classes):
        is_true = labels == true_cls
        for col, pred_cls in enumerate(classes):
            conf_mat[row, col] = np.logical_and(is_true, predvals == pred_cls).sum()
    return conf_mat

def cutlist(seq, s, e):
    """Return a new sequence equal to `seq` with the slice seq[s:e] removed."""
    head = seq[:s]
    tail = seq[e:]
    return head + tail


# Script 
(explore the promising feature sets by changing parameters)  
parameters you may want to change:  
  
*handle_negation*: True if negation handling is desired  
*lemmatized*: True if lemmatization is desired  
*score_fn*: one of the three scoring functions discussed in the report: likelihood_ratio, chi_sq, phi_sq  
*toprank, n*: toprank is True if the most informative features are desired; n is the number of feature words to use  
*unif_len*: True if uniform-length feature sets are desired  
*cutoff*: the number of held-out test instances, i.e. #(all instances) // 5; the remaining instances are used for training

In [None]:
%%time

senti_classifier = SentiClassifier()
documents = senti_classifier.preprocess(handle_negation=True,lemmatized=False)
toprank, n = (True, 5000)
nbestfeatures = senti_classifier.choose_features(documents, (toprank, n), \
                                                       score_fn=BigramAssocMeasures.likelihood_ratio)
featuresets = senti_classifier.extract_features(documents, nbestfeatures, unif_len=False)
random.seed(78)
random.shuffle(featuresets)
cutoff = len(featuresets) // 5
trainsets, testsets = featuresets[cutoff:], featuresets[:cutoff]
trainer = NaiveBayesClassifier
classifier = senti_classifier.train(trainer, trainsets)

print metrics(senti_classifier.evaluation(classifier, testsets))
print metrics(senti_classifier.cross_validation(classifier, featuresets))


# Experiment (on Influence of length of Feature sets)  
You may want to change *s* (first feature-set size), *end* (exclusive upper bound) and *ss* (step size)  


In [None]:
senti_classifier = SentiClassifier()
documents = senti_classifier.preprocess(handle_negation=True, lemmatized=False)
trainer = NaiveBayesClassifier
results = []

# Feature-set sizes to try: s, s + ss, s + 2*ss, ... (end is exclusive).
s = 500
end = 20001
ss = 500

for i in range(s, end, ss):
    nbestfeatures = senti_classifier.choose_features(
        documents, (True, i), score_fn=BigramAssocMeasures.likelihood_ratio)
    featuresets = senti_classifier.extract_features(documents, nbestfeatures, unif_len=False)

    # Re-seed on every iteration so each feature-set size sees the same shuffle.
    random.seed(78)
    random.shuffle(featuresets)

    # Only the cross-validated result is recorded, so no separate hold-out
    # classifier is trained here; cross_validation takes the trainer itself.
    result = metrics(senti_classifier.cross_validation(trainer, featuresets))
    results.append(result)
    print(result)

# Plot 

In [None]:
# plot the results from previous cell. So make sure you run the previous cell before running this
accuracies, details = zip(*results)

avgs = np.array([detail['avg'] for detail in details])
negs = np.array([detail['neg'] for detail in details])
poss = np.array([detail['pos'] for detail in details])

%matplotlib inline

number = np.arange(500,20001,500)
f_avg = plt.figure()
plt.plot(number, avgs[:,0],'r-x', label='precision')
plt.plot(number, avgs[:,1], 'c-^',label='recall')
plt.plot(number, avgs[:,2], 'k-.',label='f-measure')
plt.axis([0,20000,0.8,0.97])
plt.title('Averaged Performance versus Number of Features')
plt.legend()
plt.xlabel('Number of Features')
plt.ylabel('Performance')
plt.savefig('avg.png',format='png', dpi=1000)

f_neg = plt.figure()
plt.plot(number, negs[:,0],'r-x', label='precision')
plt.plot(number, negs[:,1], 'c-^',label='recall')
plt.plot(number, negs[:,2], 'k-.',label='f-measure')
plt.axis([0,20000,0.74,1])
plt.title('Performance on Negative Class versus Number of Features')
plt.legend()
plt.xlabel('Number of Features')
plt.ylabel('Performance')
plt.savefig('neg.png',format='png', dpi=1000)

f_neg = plt.figure()
plt.plot(number, poss[:,0],'r-x', label='precision')
plt.plot(number, poss[:,1], 'c-^',label='recall')
plt.plot(number, poss[:,2], 'k-.',label='f-measure')
plt.axis([0,20000,0.74,1])
plt.title('Performance on Positive Class versus Number of Features')
plt.legend()
plt.xlabel('Number of Features')
plt.ylabel('Performance')
plt.savefig('pos.png',format='png', dpi=1000)

f_neg = plt.figure()
plt.plot(number, accuracies,'g-.')

plt.axis([0,20000,0.8,0.97])
plt.title('Overall Accuracy versus Number of Features')
plt.xlabel('Number of Features')
plt.ylabel('Performance')
plt.savefig('accuracy.png',format='png', dpi=1000)