In [1]:
import os
import re

import numpy as np
import pandas as pd
from pyspark.sql.types import *
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.ml import Pipeline
from pyspark.sql.functions import rand
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.feature import StringIndexer


# Reuse the Spark session a PySpark kernel may already provide, or start one,
# so the notebook also runs from a plain Python kernel after a restart.
# `spark` is needed further down to build the Spark DataFrame.
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

# Root of the review data set; must end with "/" because later cells build
# file paths by plain string concatenation.
path="Imdb/"

# os.listdir returns entries in arbitrary order, so sort the names: the seeded
# shuffle and split performed later are only reproducible if the input row
# order is the same on every run and machine.
positiveFiles = sorted(x for x in os.listdir(path + "train/pos/") if x.endswith(".txt"))
negativeFiles = sorted(x for x in os.listdir(path + "train/neg/") if x.endswith(".txt"))
print("done")

done


In [2]:
def _read_reviews(subdir, filenames):
    """Return the text of every file in `filenames` found under `path + subdir`.

    Files are decoded as latin1 and returned in the same order as `filenames`.
    """
    texts = []
    for name in filenames:
        with open(path + subdir + name, encoding= "latin1") as handle:
            texts.append(handle.read())
    return texts


# One list of raw review texts per sentiment class, aligned with the file lists.
posReviews = _read_reviews("train/pos/", positiveFiles)
negReviews = _read_reviews("train/neg/", negativeFiles)
print("done")

done


In [4]:
# Combine both classes into one pandas frame (label 1 = positive, 0 = negative)
# and shuffle the rows with a fixed seed.
reviews = pd.concat([
    pd.DataFrame({"review":posReviews, "label":1, "file":positiveFiles}),
    pd.DataFrame({"review":negReviews, "label":0, "file":negativeFiles})
], ignore_index=True).sample(frac=1, random_state=5)

mySchema = StructType([ StructField("file", StringType(), True)
                       ,StructField("label", IntegerType(), True)
                       ,StructField("review", StringType(), True)])

# An explicit StructType is applied to the pandas columns by position, not by
# name. Reorder the pandas columns to the schema's field order first, otherwise
# the review text would be stored under "file" and the file names under
# "review" whenever pandas keeps the dict insertion order used above.
schemaColumns = [field.name for field in mySchema.fields]

# Convert the data to a Spark dataframe so that it can be split up, and drop
# the file column, which is not a model input.
reviews2 = spark.createDataFrame(reviews[schemaColumns], schema=mySchema)
reviews2 = reviews2.drop("file")
(trainSet, validationSet, testSet) = reviews2.randomSplit([0.90, 0.05, 0.05], seed = 2000)

# Second set of names for the same splits, used by the CountVectorizer variant.
# These are aliases of the same DataFrame objects, not copies.
trainSet2= trainSet
validationSet2 = validationSet
testSet2 = testSet
print("done")

done


In [5]:
# tells our tokenizer which columns to use and output to  
tokenizer = Tokenizer(inputCol="review", outputCol="words")
# Takes a set of terms and turns them into features vectors. The feature vectors being the words 
# in our review sentences. 

hashtf = HashingTF(numFeatures=2**16, inputCol="words", outputCol='tf')
# Takes out all words that do not apear more than 5 times in the data 
idf = IDF(inputCol='tf', outputCol="features", minDocFreq=5) 
# Tells the pipline what column to output the classification label to. 
label_stringIdx = StringIndexer(inputCol = "label", outputCol = "label 2.o")
# builds the pipeline with the parameters we just set up 
pipeline = Pipeline(stages=[tokenizer, hashtf, idf, label_stringIdx])

# Fits the model to data
# sets parameters to the model 
pipelineFit = pipeline.fit(trainSet)
# sets the data to the parameters 
trainDF = pipelineFit.transform(trainSet)
validationDF = pipelineFit.transform(validationSet)
trainDF.show(5)

+-----+--------------------+--------------------+--------------------+--------------------+---------+
|label|              review|               words|                  tf|            features|label 2.o|
+-----+--------------------+--------------------+--------------------+--------------------+---------+
|    0|!!!!! POSSIBLE SP...|[!!!!!, possible,...|(65536,[732,991,1...|(65536,[732,991,1...|      0.0|
|    0|" It had to be Yo...|[", it, had, to, ...|(65536,[14,338,10...|(65536,[14,338,10...|      0.0|
|    0|"54" is a film ba...|["54", is, a, fil...|(65536,[14,1020,1...|(65536,[14,1020,1...|      0.0|
|    0|"A Town Called He...|["a, town, called...|(65536,[356,731,1...|(65536,[356,731,1...|      0.0|
|    0|"A death at a col...|["a, death, at, a...|(65536,[543,1444,...|(65536,[543,1444,...|      0.0|
+-----+--------------------+--------------------+--------------------+--------------------+---------+
only showing top 5 rows



In [6]:
# Applying logistic regression to our model
lr = LogisticRegression(maxIter=100)
lrModel = lr.fit(trainDF)
# Creates our predeictions and measures accuracy 
predictions = lrModel.transform(validationDF)
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
evaluator.evaluate(predictions)

0.92805132167358

In [8]:
from pyspark.ml.feature import CountVectorizer

# Same model as the hashing-based pipeline above, except that term counts come
# from CountVectorizer (vocabulary capped at 2**16 terms) instead of HashingTF.
# NOTE: this cell rebinds idf, pipeline, pipelineFit, trainDF, validationDF,
# lrModel, predictions and evaluator, so after it runs those names refer to the
# CountVectorizer variant.
cv = CountVectorizer(
    vocabSize=2**16,
    inputCol="words",
    outputCol='cv',
)
idf = IDF(
    inputCol='cv',
    outputCol="features",
    minDocFreq=5,
)

# The tokenizer and label indexer stages are reused from the earlier pipeline.
pipeline = Pipeline(stages=[tokenizer, cv, idf, label_stringIdx])

# Fit on the training split, then transform the training and validation splits.
pipelineFit = pipeline.fit(trainSet2)
trainDF = pipelineFit.transform(trainSet2)
validationDF = pipelineFit.transform(validationSet2)

# Refit the same logistic-regression estimator on the new features.
lrModel = lr.fit(trainDF)
predictions = lrModel.transform(validationDF)

# NOTE: despite its name, `accuracy` holds the area under the ROC curve (the
# evaluator's default metric, spelled out here). The name is kept because
# cells outside this view may read it.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC",
)
accuracy = evaluator.evaluate(predictions)
print(accuracy)


0.9405383468739532
