In [1]:
import sys, os
from collections import Counter
import numpy as np

""" Binary classification with the perceptron

Data: Cornell movie review polarity dataset (http://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz)

"""

' Binary classification with the perceptron\n\nData: Cornell movie review polarity dataset (http://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz)\n\n'

In [2]:
# represent text as a set of features
def featurizer(data):
    # movie data is already tokenized
    tokens = data.split(" ")
    counter=Counter()
    
    # add all features here
    counter+=unigramFeatures(tokens)
    counter+=biasTerm()

    return counter

In [3]:
# binary indicators for all words present in text
def unigramFeatures(tokens):
    counter=Counter()
    for t in tokens:
        # binary indicators
        counter["UNIGRAM:%s" % t]=1
    return counter

In [4]:
# bias term
def biasTerm():
    counter=Counter()
    counter["BIAS"]=1
    return counter

In [5]:
# for an input directory with {pos,neg} subdirectories, read through each file and tranform into a set of features;
# split all data into 90% training and 10% development

def getData(directory):
    # observation parameters (minimum count for a word to be a feature, max number of total features)
    maxVocab=10000
    minCount=3

    docs = {}
    labels = {}
    totalCounts=Counter()

    featureHash={}
    featureNames=[]

    # read training data and get feature counts and labels for all documents
    for label in ['pos', 'neg']:
        toppath = os.path.join(directory, label)
        for filename in os.listdir(toppath):
            #print filename
            path = os.path.join(toppath, filename)
            data = open(path).read().lower()
            counter=featurizer(data)
            totalCounts+=counter
            docs[filename] = counter
            labels[filename]=label	

    # set the feature featureHash
    featureCount=0
    for (word, count) in totalCounts.most_common(maxVocab):
        if count >= minCount:
            featureHash[word]=featureCount
            featureNames.append(word)
            featureCount+=1
        else:
            break

    numericDocs={}
    for filename in docs:
        numericFeats={}
        for w in docs[filename]:
            if w in featureHash:
                numericFeats[featureHash[w]]=1
        numericDocs[filename]=numericFeats

    train={}
    dev={}

    # split the data into 90% training, 10% development
    i=0
    for filename in numericDocs:
        if i % 10 == 9:
            dev[filename]=numericDocs[filename]
        else:
            train[filename]=numericDocs[filename]
        i+=1

    return (train, dev, featureNames, labels)

In [16]:
def linearKernel(x1, x2):
    val=0.
    for x in x1:
        if x in x2:
            val+=x1[x]*x2[x]

    return val

In [7]:
if __name__ == '__main__':

    # path to input directory containing training data
#    directory="../data/movie_reviews/txt_sentoken"
    directory="../data/movie_reviews/sample100"

In [8]:
    # train and dev are both maps from filename -> dict of feature ids/values
    # featurenNames = array of feature names indexed by feature id
    (train, dev, featureNames, labels) = getData(directory)

In [9]:
    F=len(featureNames)
    alphas={}

    values={}
    values["pos"]=1
    values["neg"]=-1

    eta=1.0
    trainN=len(train)
    devN=len(dev)

In [18]:
    for filename in train:
        alphas[filename]=0

    for i in range(10):

        # train perceptron on training splits

        incorrect=0.
        n=0
        for filename in train:
            n+=1
 
            val=0

            # calculate the dot product 
                        
            for comp in train:
                val+=alphas[comp]*values[labels[comp]]*linearKernel(train[filename], train[comp])

            # make prediction
            prediction=-1
            if val >= 0:
                prediction=1


            # update weights if incorrect prediction
            trueLabel=values[labels[filename]]
            if prediction != trueLabel:
                alphas[filename]+=1
                incorrect+=1


        trainingAcc=(trainN-incorrect)/trainN

        # evaluate perceptron on development splits

        incorrect=0.

        for filename in dev:
            val=0.
            
            for comp in train:
                val+=alphas[comp]*values[labels[comp]]*linearKernel(dev[filename], train[comp])
          
                
            prediction=-1
            
            if val >= 0:
                prediction=1
                
            trueLabel=values[labels[filename]]

            if prediction != trueLabel:
                incorrect+=1

        devAcc=(devN-incorrect)/devN

        print "training accuracy: %.5f (%s), development accuracy: %.5f (%s)" % (trainingAcc, trainN, devAcc, devN)

        # end training if perfect training accuracy
        if trainingAcc == 1:
            break


training accuracy: 0.54444 (180), development accuracy: 0.70000 (20)
training accuracy: 0.84444 (180), development accuracy: 0.90000 (20)
training accuracy: 0.98333 (180), development accuracy: 0.80000 (20)
training accuracy: 0.98889 (180), development accuracy: 0.80000 (20)
training accuracy: 0.98889 (180), development accuracy: 0.80000 (20)
training accuracy: 1.00000 (180), development accuracy: 0.80000 (20)
training accuracy: 1.00000 (180), development accuracy: 0.80000 (20)
training accuracy: 1.00000 (180), development accuracy: 0.80000 (20)
training accuracy: 1.00000 (180), development accuracy: 0.80000 (20)
training accuracy: 1.00000 (180), development accuracy: 0.80000 (20)
