In [17]:
import sys, os, nltk
from collections import Counter
from scipy import sparse
import numpy as np

""" Binary classification with the perceptron

Data: Cornell movie review polarity dataset (http://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz)

"""

' Binary classification with the perceptron '

In [18]:
# represent text as a set of features
def featurizer(data):
    tokens = nltk.word_tokenize(data)
    counter=Counter()
    
    # add all features here
    counter+=unigramFeatures(tokens)
    counter+=biasTerm()

    return counter

In [19]:
# binary indicators for all words present in text
def unigramFeatures(tokens):
    counter=Counter()
    for t in tokens:
        # binary indicators
        counter["UNIGRAM:%s" % t]=1
    return counter

In [20]:
# bias term
def biasTerm():
    counter=Counter()
    counter["BIAS"]=1
    return counter

In [21]:
# for an input directory with {pos,neg} subdirectories, read through each file and tranform into a set of features;
# split all data into 90% training and 10% development

def getData(directory):
    # observation parameters (minimum count for a word to be a feature, max number of total features)
    maxVocab=10000
    minCount=3

    docs = {}
    labels = {}
    totalCounts=Counter()

    featureHash={}
    featureNames=[]

    # read training data and get feature counts and labels for all documents
    for label in ['pos', 'neg']:
        toppath = os.path.join(directory, label)
        for filename in os.listdir(toppath):
            #print filename
            path = os.path.join(toppath, filename)
            data = open(path).read().lower()
            counter=featurizer(data)
            totalCounts+=counter
            docs[filename] = counter
            labels[filename]=label	

    # set the feature featureHash
    featureCount=0
    for (word, count) in totalCounts.most_common(maxVocab):
        if count >= minCount:
            featureHash[word]=featureCount
            featureNames.append(word)
            featureCount+=1
        else:
            break

    numericDocs={}
    for filename in docs:
        numericFeats={}
        for w in docs[filename]:
            if w in featureHash:
                numericFeats[featureHash[w]]=1
        numericDocs[filename]=numericFeats

    train={}
    dev={}

    # split the data into 90% training, 10% development
    i=0
    for filename in numericDocs:
        if i % 10 == 9:
            dev[filename]=numericDocs[filename]
        else:
            train[filename]=numericDocs[filename]
        i+=1

    return (train, dev, featureNames, labels)

In [22]:
if __name__ == '__main__':

    # input directory containing training data

    #directory="/Users/dbamman/Downloads/review_polarity/bigtest"
    directory="/Users/dbamman/Downloads/review_polarity/txt_sentoken"

In [23]:
    # train and dev are both maps from filename -> dict of feature ids/values
    # featurenNames = array of feature names indexed by feature id
    (train, dev, featureNames, labels) = getData(directory)

In [33]:
    F=len(featureNames)
    betas=np.zeros(F)

    values={}
    values["pos"]=1
    values["neg"]=-1

    eta=1.0
    trainN=len(train)
    devN=len(dev)

In [34]:
    for i in range(100):

        # train perceptron on training splits

        incorrect=0.
        for filename in train:
            val=0

            # calculate the dot product 
            for i in train[filename]:
                val+=betas[i]*train[filename][i]

            # make prediction
            prediction=-1
            if val >= 0:
                prediction=1


            # update weights if incorrect prediction
            trueLabel=values[labels[filename]]
            if prediction != trueLabel:
                incorrect+=1
                for i in train[filename]:
                    betas[i]+=eta * trueLabel * train[filename][i]


        trainingAcc=(trainN-incorrect)/trainN

        # evaluate perceptron on development splits

        incorrect=0.

        for filename in dev:
            val=0.
            for i in dev[filename]:
                val+=betas[i]*dev[filename][i]
            prediction=-1
            
            if val >= 0:
                prediction=1
                
            trueLabel=values[labels[filename]]

            if prediction != trueLabel:
                incorrect+=1

        devAcc=(devN-incorrect)/devN

        print "training accuracy: %.5f (%s), development accuracy: %.5f (%s)" % (trainingAcc, trainN, devAcc, devN)

        # end training if perfect training accuracy
        if trainingAcc == 1:
            break


training accuracy: 0.72056 (1800), development accuracy: 0.80500 (200)
training accuracy: 0.91111 (1800), development accuracy: 0.81000 (200)
training accuracy: 0.96056 (1800), development accuracy: 0.80500 (200)
training accuracy: 0.96944 (1800), development accuracy: 0.83000 (200)
training accuracy: 0.98500 (1800), development accuracy: 0.83000 (200)
training accuracy: 0.99222 (1800), development accuracy: 0.81500 (200)
training accuracy: 0.99333 (1800), development accuracy: 0.83000 (200)
training accuracy: 0.99667 (1800), development accuracy: 0.83500 (200)
training accuracy: 1.00000 (1800), development accuracy: 0.83500 (200)


In [35]:
zipped=zip(betas, featureNames)			# zip two lists together to iterate through them simultaneously
zipped.sort(key = lambda t: t[0], reverse=True)		# sort the two lists by the values in the first (the coefficients)

In [36]:
print "MOST POSITIVE FEATURES:"
for (weight, word) in zipped[:10]:
    print "%.3f\t%s" % (weight, word)

print "\nMOST NEGATIVE FEATURES:"
for (weight, word) in zipped[:-10:-1]:
    print "%.3f\t%s" % (weight, word)



MOST POSITIVE FEATURES:
23.000	UNIGRAM:memorable
23.000	UNIGRAM:terrific
22.000	UNIGRAM:see
22.000	UNIGRAM:hilarious
21.000	UNIGRAM:best
21.000	UNIGRAM:both
21.000	UNIGRAM:change
21.000	UNIGRAM:flaws
20.000	UNIGRAM:many
20.000	UNIGRAM:less

MOST NEGATIVE FEATURES:
-35.000	UNIGRAM:bad
-32.000	UNIGRAM:nothing
-32.000	UNIGRAM:plot
-30.000	UNIGRAM:worst
-28.000	UNIGRAM:only
-27.000	UNIGRAM:boring
-25.000	UNIGRAM:unfortunately
-24.000	UNIGRAM:ridiculous
-24.000	UNIGRAM:script


In [42]:
# write weights

out=open("weights.txt", "w")
for (weight, word) in zipped:
    out.write("%.5f\t%s\n" % (weight, word))
out.close()