In [30]:
import pandas as pd
import numpy as np
import re
import os
from pyspark.sql.types import *
from pyspark.sql import SQLContext
from pyspark.ml import Pipeline 
from pyspark.sql.functions import rand 
from pyspark.mllib.evaluation import MulticlassMetrics 
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.feature import StringIndexer


sc = SparkContext.getOrCreate()
path="Imdb/"
positiveFiles = [x for x in os.listdir(path + "train/pos/") if x.endswith(".txt")]
negativeFiles = [x for x in os.listdir(path + "train/neg/") if x.endswith(".txt")]

In [31]:
posReviews, negReviews = [], []

for posfile in positiveFiles:
    with open(path + "train/pos/" + posfile, encoding= "latin1") as file:
        posReviews.append(file.read())
for negfile in negativeFiles:
    with open(path + "train/neg/" + negfile, encoding= "latin1") as file:
        negReviews.append(file.read())
print("done")

done


In [32]:
reviews = pd.concat([
    pd.DataFrame({"review":posReviews, "label":1, "file":positiveFiles}),
    pd.DataFrame({"review":negReviews, "label":0, "file":negativeFiles})
], ignore_index=True).sample(frac=1, random_state=5)

reviews.head()

mySchema = StructType([ StructField("file", StringType(), True)\
                       ,StructField("label", IntegerType(), True)\
                       ,StructField("review", StringType(), True)])

#convert the data to spark dataframe so that it can be split up and drop the file column 
reviews2 = spark.createDataFrame(reviews,schema=mySchema)
reviews2 = reviews2.drop("file")
(trainSet, validationSet, testSet) = reviews2.randomSplit([0.90, 0.05, 0.05], seed = 2000)

trainSet2= trainSet
validationSet2 = validationSet
testSet2 = testSet
print("done")

In [33]:


tokenizer = Tokenizer(inputCol="review", outputCol="words")
hashtf = HashingTF(numFeatures=2**16, inputCol="words", outputCol='tf')
idf = IDF(inputCol='tf', outputCol="features", minDocFreq=5) 
label_stringIdx = StringIndexer(inputCol = "label", outputCol = "label 2.o")
pipeline = Pipeline(stages=[tokenizer, hashtf, idf, label_stringIdx])

pipelineFit = pipeline.fit(trainSet)
trainDF = pipelineFit.transform(trainSet)
validationDF = pipelineFit.transform(validationSet)
trainDF.show(5)

+-----+--------------------+--------------------+--------------------+--------------------+---------+
|label|              review|               words|                  tf|            features|label 2.o|
+-----+--------------------+--------------------+--------------------+--------------------+---------+
|    0|!!!!! POSSIBLE SP...|[!!!!!, possible,...|(65536,[732,991,1...|(65536,[732,991,1...|      0.0|
|    0|"A young woman un...|["a, young, woman...|(65536,[750,1217,...|(65536,[750,1217,...|      0.0|
|    0|"Black Angel" is ...|["black, angel", ...|(65536,[696,2888,...|(65536,[696,2888,...|      0.0|
|    0|"Fred Claus" some...|["fred, claus", s...|(65536,[14,61,315...|(65536,[14,61,315...|      0.0|
|    0|"Godzilla vs King...|["godzilla, vs, k...|(65536,[14,1114,1...|(65536,[14,1114,1...|      0.0|
+-----+--------------------+--------------------+--------------------+--------------------+---------+
only showing top 5 rows



In [34]:
lr = LogisticRegression(maxIter=100)
lrModel = lr.fit(trainDF)
predictions = lrModel.transform(validationDF)
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
evaluator.evaluate(predictions)

accuracy = predictions.filter(predictions.label == predictions.prediction).count() / float(validationDF.count())
accuracy

0.8666146645865834

In [35]:
from pyspark.ml.feature import CountVectorizer

tokenizer = Tokenizer(inputCol="review", outputCol="words")
cv = CountVectorizer(vocabSize=2**16, inputCol="words", outputCol='cv')
idf = IDF(inputCol='cv', outputCol="features", minDocFreq=5) 
label_stringIdx = StringIndexer(inputCol = "label", outputCol = "label 2.0")
lr = LogisticRegression(maxIter=100)
pipeline = Pipeline(stages=[tokenizer, cv, idf, label_stringIdx, lr])

pipelineFit = pipeline.fit(trainSet2)
predictions = pipelineFit.transform(validationSet2)
accuracy = predictions.filter(predictions.label == predictions.prediction).count() / float(validationSet2.count())
roc_auc = evaluator.evaluate(predictions)

print ("Accuracy Score: " + str(accuracy))
print ("ROC-AUC: " + str(roc_auc))

Accuracy Score: 0.8783151326053042
ROC-AUC: 0.9342950906005575
