In [2]:
import os
import numpy as np
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint

# Tab-separated movie-review file; the path is relative to the working directory.
filename = os.path.join("Data", "Aula04", "MovieReviews.tsv")
# `sc` is not defined in this cell -- presumably the SparkContext supplied by
# the PySpark kernel. The second argument (2) is passed through unchanged.
rawRDD = sc.textFile(filename, 2)

# The first line of the file is the column header: keep it for display and
# remove every line equal to it from the data set.
header = rawRDD.take(1)[0]
dataRDD = rawRDD.filter(lambda line: line != header)

# Seeded random split into train / validation / test with weights .8 / .1 / .1.
weights = [.8, .1, .1]
seed = 42
rawTrainData, rawValidationData, rawTestData = dataRDD.randomSplit(weights, seed)

# Cache the data
rawTrainData.cache()
rawValidationData.cache()
rawTestData.cache()

# The bare string below is a note about the data set; as a statement in the
# middle of the cell it has no effect at runtime.
'''
https://www.kaggle.com/c/sentiment-analysis-on-movie-reviews/

The sentiment labels are:

0 - negative
1 - somewhat negative
2 - neutral
3 - somewhat positive
4 - positive
'''

# Show the header line and the first data record.
print(header)
print(dataRDD.take(1))

PhraseId	SentenceId	Phrase	Sentiment
[u'1\t1\tA series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .\t1']


In [3]:
from collections import defaultdict
import hashlib

def hashFunction(numBuckets, rawFeats, printMapping=False):
    """Hash the whitespace-separated tokens of a text into a sparse bucket dictionary.

    Every distinct token of `rawFeats` is assigned the bucket
    ``int(md5(token).hexdigest(), 16) % numBuckets``.  The value stored for a
    bucket is the number of *distinct* tokens that were hashed to it: a token
    that occurs several times in the text contributes only once.

    Note:
        Use printMapping=True for debug purposes and to better understand how the hashing works.

    Args:
        numBuckets (int): Number of buckets to use as features.
        rawFeats (str): The text of one observation.  It is split on whitespace and each
            resulting token is treated as a feature.  Both text (unicode) and byte strings
            are accepted.
        printMapping (bool, optional): If true, the mapping of each token to its bucket index
            will be printed.

    Returns:
        dict of int to float:  The keys are the bucket indices that at least one token has been
            hashed to.  The value for a given key is the count of distinct tokens that hashed
            to that key.  An input without tokens yields an empty dict.
    """
    mapping = {}
    for featureString in rawFeats.split():
        # md5 works on bytes.  Handing it a text string fails for non-ASCII
        # tokens (implicit ASCII encoding), so text is encoded as UTF-8 first.
        # For ASCII-only tokens the digest is the same as before.
        if isinstance(featureString, bytes):
            tokenBytes = featureString
        else:
            tokenBytes = featureString.encode('utf-8')
        mapping[featureString] = int(int(hashlib.md5(tokenBytes).hexdigest(), 16) % numBuckets)
    if printMapping:
        print(mapping)
    # `mapping` has one entry per distinct token, so this counts distinct
    # tokens per bucket (collisions between different tokens add up).
    sparseFeatures = defaultdict(float)
    for bucket in mapping.values():
        sparseFeatures[bucket] += 1.0
    return dict(sparseFeatures)

def parseTextPoint(point, numBuckets):
    """Turn one tab-separated record into a LabeledPoint with hashed text features.

    Args:
        point (str): A record with exactly four tab-separated fields.  Only the third
            (the text) and the fourth (an integer rating) are used.
        numBuckets (int): Size of the sparse feature vector and number of hash buckets.

    Returns:
        LabeledPoint: The label is 0 for a rating below 2, 1 for a rating above 2 and 2 for
            a rating of exactly 2.  The features are a SparseVector of size `numBuckets`
            built from ``hashFunction(numBuckets, text)``.
    """
    phraseId, sentenceId, phrase, rating = point.split('\t')
    rating = int(rating)

    # Collapse the rating into three classes around the middle value 2.
    if rating == 2:
        label = 2
    elif rating < 2:
        label = 0
    else:
        label = 1

    hashedTokens = hashFunction(numBuckets, phrase)
    return LabeledPoint(label, SparseVector(numBuckets, hashedTokens))

In [4]:
numBuckets = 5000


def _toLabeledPoint(line):
    """Parse one raw record using the notebook-level `numBuckets`."""
    return parseTextPoint(line, numBuckets)


def _isNotLabelTwo(labeledPoint):
    """Keep only points whose label is different from 2."""
    return labeledPoint.label != 2


# Every raw split becomes a cached RDD of LabeledPoint objects.
parsedTrainData = rawTrainData.map(_toLabeledPoint).cache()
parsedValData = rawValidationData.map(_toLabeledPoint).cache()
parsedTestData = rawTestData.map(_toLabeledPoint).cache()

# Two-class view of the same splits: points labelled 2 are removed.
binTrainData = parsedTrainData.filter(_isNotLabelTwo).cache()
binValData = parsedValData.filter(_isNotLabelTwo).cache()
binTestData = parsedTestData.filter(_isNotLabelTwo).cache()

# Show the first parsed training point.
print(parsedTrainData.take(1))

[LabeledPoint(0.0, (5000,[96,122,143,479,664,1138,1224,1351,1425,1497,1635,1648,1793,2084,2405,2539,2920,3214,3672,3804,3849,3876,4057,4345,4380,4675,4870,4993],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0]))]


In [5]:
def calcAccuracy(predsAndVals):
    """Return the mean of the element-wise equality of the first two items of each record.

    Args:
        predsAndVals: An object offering ``map`` and ``mean`` (used here with RDDs) whose
            elements are indexable; element ``[0]`` is compared with element ``[1]``.

    Returns:
        The result of ``mean()`` over the boolean comparison results, i.e. the fraction of
        records whose first two items are equal.
    """
    def firstTwoAgree(record):
        return record[0] == record[1]

    agreement = predsAndVals.map(firstTwoAgree)
    return agreement.mean()

In [6]:
# Majority-class baseline: count the training points labelled 1 and 0 and
# always predict whichever is more frequent (0 on a tie).
posClassCount = parsedTrainData.filter(lambda lp: lp.label == 1).count()
negClassCount = parsedTrainData.filter(lambda lp: lp.label == 0).count()
baseLine = int(posClassCount > negClassCount)

# Each pair is (constant baseline prediction, true label); calcAccuracy only
# compares the two items, so their order does not matter.
labelsAndPreds = binValData.map(lambda lp: (baseLine, lp.label))
accValBaseline = calcAccuracy(labelsAndPreds)
print('Baseline Validation Accuracy = {0:.3f}\n'.format(accValBaseline))

Baseline Validation Accuracy = 0.542

