In [1]:
import sys, os
from collections import Counter
import operator
import numpy as np
from math import sqrt

""" Binary classification with the perceptron

Data: Cornell movie review polarity dataset (http://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz)

"""

values={}
values["pos"]=1
values["neg"]=0

In [2]:
# represent text as a set of features
def featurizer(data):
    # movie data is already tokenized
    tokens = data.split(" ")
    counter=Counter()
    
    # add all features here
    counter+=unigramFeatures(tokens)

    return counter

In [3]:
# binary indicators for all words present in text
def unigramFeatures(tokens):
    counter=Counter()
    for t in tokens:
        # binary indicators
        counter["UNIGRAM:%s" % t]=1
    return counter

In [4]:
# for an input directory with {pos,neg} subdirectories, read through each file and tranform into a set of features;
# split all data into 90% training and 10% development

def getData(directory):
    # observation parameters (minimum count for a word to be a feature, max number of total features)
    maxVocab=10000
    minCount=3

    docs = {}
    labels = {}
    totalCounts=Counter()

    featureHash={}
    featureNames=[]

    # read training data and get feature counts and labels for all documents
    for label in ['pos', 'neg']:
        toppath = os.path.join(directory, label)
        for filename in os.listdir(toppath):
            #print filename
            path = os.path.join(toppath, filename)
            data = open(path).read().lower()
            counter=featurizer(data)
            totalCounts+=counter
            docs[filename] = counter
            labels[filename]=label	

    # set the feature featureHash
    featureCount=0
    for (word, count) in totalCounts.most_common(maxVocab):
        if count >= minCount:
            featureHash[word]=featureCount
            featureNames.append(word)
            featureCount+=1
        else:
            break

    numericDocs={}
    for filename in docs:
        numericFeats={}
        for w in docs[filename]:
            if w in featureHash:
                numericFeats[featureHash[w]]=1
        numericDocs[filename]=numericFeats

    train={}
    dev={}

    # split the data into 90% training, 10% development
    i=0
    for filename in numericDocs:
        if i % 10 == 9:
            dev[filename]=numericDocs[filename]
        else:
            train[filename]=numericDocs[filename]
        i+=1

    return (train, dev, featureNames, labels)

In [20]:
def sim(one, two):
    return jaccard(one,two)
#    return cosine(one,two)

In [6]:
def jaccard(one, two):
    onekeys=set(one.keys())
    twokeys=set(two.keys())
    return float(len(set.intersection(onekeys, twokeys)))/len(set.union(onekeys, twokeys))

In [7]:
def cosine(one, two):
    
    sim=0.
    norm1=0.
    norm2=0.
    for key in one:
        norm1+=one[key]*one[key]
    for key in two:
        norm2+=two[key]*two[key]
        
    for key in one:
        if key in two:
            sim+=one[key] * two[key]
    return sim/(sqrt(norm1) * sqrt(norm2))

In [8]:
def KNN(datapoint, train, labels, K):
    scores={}
    for trainingPoint in train:
        score=sim(datapoint, train[trainingPoint])
        scores[trainingPoint]=score
        
    sorted_x = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)
   
    counts=np.zeros(2)
    
    if K >= len(scores):
        K=len(scores)
        
    for i in range(K):
        (k,v)=sorted_x[i]
        label=values[labels[k]]
        counts[label]+=1
    
    return np.argmax(counts)

In [17]:
if __name__ == '__main__':

    # path to input directory containing training data
    directory="../data/movie_reviews/sample100"

In [18]:
    # train and dev are both maps from filename -> dict of feature ids/values
    # featurenNames = array of feature names indexed by feature id
    (train, dev, featureNames, labels) = getData(directory)

In [21]:
correct=0
total=0

# K nearest neighbors
K=3

for filename in dev:
    prediction=KNN(dev[filename], train, labels, K)
                                 
    if prediction == values[labels[filename]]:
        correct+=1
    total+=1

    if total % 5 == 0:
        print ".",

print "\nAccuracy: %.3f, (%s/%s)" % (float(correct)/total, correct, total)



. . . . 
Accuracy: 0.850, (17/20)
