In [57]:
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from nltk import PorterStemmer
from nltk.corpus import stopwords
import re

In [58]:
stop_words = stopwords.words('english')

In [59]:
def parse(line):
    line = re.sub('[:][)]', ' happy ', line)
    line = re.sub('[:][-][)]', ' happy ', line)
    line = re.sub('[:][(]', ' sad ', line)
    line = re.sub('[:][-][(]', ' sad ', line)
    line = re.sub('[^A-Za-z]', ' ', line)
    line = line.split()
    line = [word.strip().lower() for word in line if word not in stop_words if len(word) >= 3]
    line = [PorterStemmer().stem(word) for word in line]
    return line

In [60]:
pData = sc.textFile("C:\Users\KARAN\Documents\BDA\data_pos.txt")
ptData = pData.map(lambda text : LabeledPoint(1, HashingTF().transform(parse(text))))
print "No. of Positive Sentences: " + str(ptData.count())
ptData.persist()

No. of Positive Sentences: 24003


PythonRDD[3] at RDD at PythonRDD.scala:43

In [64]:
nData = sc.textFile("C:\Users\KARAN\Documents\BDA\data_neg.txt")
ntData = nData.map(lambda text : LabeledPoint(0, HashingTF().transform(parse(text))))
print "No. of Negative Sentences: " + str(ntData.count())
ntData.persist()

No. of Negative Sentences: 16279


PythonRDD[19] at RDD at PythonRDD.scala:43

In [68]:
ptrain, ptest = ptData.randomSplit([0.6, 0.4])
ntrain, ntest = ntData.randomSplit([0.6, 0.4])
trainh = ptrain.union(ntrain)
testh = ptest.union(ntest)
print "No. of Training Data: " + str(trainh.count())
print "No. of Testing Data: " + str(testh.count())

No. of Training Data: 24162
No. of Testing Data: 16120


In [69]:
model = LogisticRegressionWithLBFGS.train(trainh)

In [73]:
prediction_and_labels = testh.map(lambda point: (model.predict(point.features), point.label))
correct = prediction_and_labels.filter(lambda (predicted, actual): predicted == actual)
accuracy = correct.count() / float(testh.count())
print "Classifier correctly predicted category " + str(accuracy * 100) + " percent of the time"

Classifier correctly predicted category 82.0037220844 percent of the time


In [None]:
allData = sc.textFile("C:\Users\KARAN\Documents\BDA\data_all.txt")
allh = allData.map(lambda text : LabeledPoint(1, HashingTF().transform(parse(text))))
print "Total number of Sentences: " + str(allh.count())
allh.persist()

In [78]:
prediction_and_labels = allh.map(lambda point: (model.predict(point.features), point.label))

def toCSVLine(data):
    return data[0]

lines = prediction_and_labels.map(toCSVLine)
lines.saveAsTextFile('C:\Users\KARAN\Documents\BDA\out_lr.csv')

No. of Testing Data: 16120
