In [18]:
# Import modules
from pyspark.ml import Pipeline
from pyspark.sql.types import IntegerType, FloatType
from pyspark.ml.feature import VectorAssembler, StringIndexer, IndexToString, SQLTransformer
from pyspark.ml.classification import *
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql.functions import udf
from pyspark.sql.types import LongType, BooleanType, DoubleType
from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import HashingTF
from pyspark.ml.feature import IDF

In [19]:
# Define input path and constants
# inputData is the location of the reviews CSV file that is read
# by the data-loading cell of this notebook.
# NOTE(review): this is an absolute, environment-specific path - adjust it
# when running the notebook on a different cluster/file system.
inputData = "/data/students/bigdata-01QYD/Lab9_DBD/Reviews.csv"

In [20]:
# Load the data
# Build the reviews DataFrame from the CSV file referenced by inputData.
# The first line of the file is used as header (column names) and the
# column data types are inferred from the content.
reviews = (
    spark.read
    .format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .load(inputData)
)

In [21]:
# Print the schema (column names and data types) of the loaded reviews
reviews.printSchema()
# Uncomment the next line to also preview the first rows
#reviews.show()

root
 |-- Id: integer (nullable = true)
 |-- ProductId: string (nullable = true)
 |-- UserId: string (nullable = true)
 |-- ProfileName: string (nullable = true)
 |-- HelpfulnessNumerator: integer (nullable = true)
 |-- HelpfulnessDenominator: integer (nullable = true)
 |-- Score: integer (nullable = true)
 |-- Time: integer (nullable = true)
 |-- Summary: string (nullable = true)
 |-- Text: string (nullable = true)



In [22]:
# Keep only the rated reviews, i.e., the records for which at least
# one helpfulness vote has been cast (HelpfulnessDenominator>0)
reviewsWithVotes = reviews.where("HelpfulnessDenominator>0")

In [23]:
# Create and compute the value of Column label for the selected rated reviews
def labelAttribute(HelpfulnessNumerator, HelpfulnessDenominator):
    """Return the class label of a review given its helpfulness votes.

    The helpfulness index is HelpfulnessNumerator/HelpfulnessDenominator.

    Parameters:
        HelpfulnessNumerator: number of "helpful" votes (may be None).
        HelpfulnessDenominator: total number of votes (may be None or 0).

    Returns:
        1.0 if the helpfulness index is strictly greater than 0.9,
        0.0 otherwise. 0.0 is also returned when the index cannot be
        computed (missing value or zero denominator).
    """
    # A missing numerator/denominator or a zero denominator means that no
    # helpfulness index exists: such a review cannot be labeled as useful.
    # Without this guard the division below raises TypeError/ZeroDivisionError.
    if HelpfulnessNumerator is None or not HelpfulnessDenominator:
        return 0.0

    if HelpfulnessNumerator / HelpfulnessDenominator > 0.9:
        return 1.0
    return 0.0
    
    
# Register labelAttribute as a SQL function returning a double, so that it
# can be called by name inside SQL expressions (e.g., in selectExpr)
spark.udf.register(name="labelAttribute", f=labelAttribute, returnType=DoubleType())

<function __main__.labelAttribute(HelpfulnessNumerator, HelpfulnessDenominator)>

In [24]:
# Create the class attribute
# A review belongs to the "useful" class (label 1.0) if its helpfulness
# index is above 90% (0.9); otherwise it is "useless" (label 0.0).
# All the original columns are kept and Column label is appended.
reviewsLabelWithVotes = reviewsWithVotes.selectExpr(
    "*",
    "labelAttribute(HelpfulnessNumerator, HelpfulnessDenominator) as label",
)

In [25]:
#reviewsLabelWithVotes.printSchema()
#reviewsLabelWithVotes.select("HelpfulnessNumerator", "HelpfulnessDenominator", "label").show()

In [26]:
# Randomly split the labeled DataFrame: 75% training set, 25% test set.
# The seed is fixed to make the split reproducible.
reviews_train, reviews_test = reviewsLabelWithVotes.randomSplit(
    weights=[0.75, 0.25],
    seed=10,
)

In [27]:
# Create/Define the preprocessing steps and the classification algorithm you want to use 
# and the content of the pipeline that is used to train the model on reviews_train and apply it on reviews_test

In [28]:
# In this solution we decided to use the text field.
# Configure a preprocessing phase for the text field that consists of the following stages:
# tokenizer -> splits each sentence into a set of words
# remover -> removes the stopwords
# hashingTF -> maps each set of words to a fixed-length feature vector (each
# word becomes a feature and the value of the feature is the frequency of
# the word in the sentence)
# idf -> computes the IDF component of the TF-IDF measure

In [29]:
# Tokenizer stage: splits each sentence into a set of words.
# It reads the content of Column "Text" and stores the result
# in the new Column "words" of the returned DataFrame
tokenizer = Tokenizer(inputCol="Text", outputCol="words")

In [30]:
# Stopword removal stage.
# StopWordsRemover returns a new DataFrame with the additional
# Column "filteredWords", obtained by dropping the stopwords
# from the content of Column "words"
remover = StopWordsRemover(inputCol="words", outputCol="filteredWords")

In [31]:
# Map words to features.
# Each word in "filteredWords" must become a feature of a Vector object.
# The HashingTF Transformer performs this operation by means of a hash
# function, hence two different words can potentially be mapped to the
# same "feature". The number of conflicts is influenced by the value
# of the numFeatures parameter.
# The "feature" version of the words is stored in Column "rawFeatures".
# For each document, each feature contains the number of occurrences
# of that feature in the document (TF component of the TF-IDF measure)
hashingTF = HashingTF(
    numFeatures=1000,
    inputCol="filteredWords",
    outputCol="rawFeatures",
)

In [32]:
# IDF stage.
# The weight of each feature in "rawFeatures" is updated by considering
# also the inverse document frequency component. The result is stored in
# Column "features", that is the standard name of the column containing
# the predictive features used to create a classification model
idf = IDF(inputCol="rawFeatures", outputCol="features")

In [33]:
# Create a classification model based on the decision tree algorithm.
# The classifier is built with its default parameters.
# NOTE(review): it presumably reads the predictive features from Column
# "features" (produced by idf) and the class from Column "label" because
# those are the default column names - confirm against the PySpark docs.
dt = DecisionTreeClassifier()

In [34]:
# Define the pipeline that is used to create the decision tree model
# on the training data: text preprocessing stages first
# (tokenizer -> remover -> hashingTF -> idf), classifier last.
pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf, dt])

In [35]:
# Fit/Train the model
# All the stages of the pipeline are fitted on the training set
# reviews_train; the returned model is then applied to the test set.
model = pipeline.fit(reviews_train)

In [36]:
# Apply the model on the test set
# The result is cached because predictions is reused several times
# by the following cells (evaluators and confusion matrix counts).
predictions = model.transform(reviews_test).cache()

In [37]:
# Compute statistics
# Accuracy, F1, weighted recall, weighted precision.
# Every evaluator compares Column "label" with Column "prediction";
# only the name of the computed metric changes.
evaluatorAcc = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy"
)
evaluatorF1 = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="f1"
)
evaluatorRecall = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="weightedRecall"
)
evaluatorPrecision = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="weightedPrecision"
)

# Print one line per metric, in the order listed above
for caption, metricEvaluator in [
    ("Accuracy:", evaluatorAcc),
    ("F1:", evaluatorF1),
    ("Weighted Recall:", evaluatorRecall),
    ("Weighted Precision:", evaluatorPrecision),
]:
    print(caption, metricEvaluator.evaluate(predictions))

Accuracy: 0.6374738264099966
F1: 0.5002284137449373
Weighted Recall: 0.6374738264099966
Weighted Precision: 0.6563767477929765


In [38]:
#  Compute the confusion matrix
#                     Predicted
#  Actual       Useful   Useless
#  Useful          A        B
#  Useless         C        D
#
# label/prediction equal to 1 -> Useful, equal to 0 -> Useless.
# One count is computed for each cell of the matrix, in the order A, B, C, D.
A, B, C, D = [
    predictions.filter(cellCondition).count()
    for cellCondition in (
        "prediction=1 and label=1",
        "prediction=0 and label=1",
        "prediction=1 and label=0",
        "prediction=0 and label=0",
    )
]

# Print the matrix: one row per actual class, one column per predicted class
print("                       Predicted")
print("  Actual \t Useful\tUseless")
print("  Useful \t {}\t\t{}".format(A, B))
print("  Useless \t {}\t\t{}".format(C, D))

                       Predicted
  Actual 	 Useful	Useless
  Useful 	 46980		94
  Useless 	 26742		209


In [39]:
# Precision and recall for the two classes.
# A, B, C, D are the cells of the confusion matrix computed above:
#   A = useful predicted as useful,  B = useful predicted as useless,
#   C = useless predicted as useful, D = useless predicted as useless.
def _ratioText(numerator, denominator):
    """Return the text to print for the ratio numerator/denominator.

    Returns " undefined" (with the leading space) when the denominator is 0,
    i.e., when the metric cannot be computed; otherwise the string
    representation of the ratio.
    """
    if denominator == 0:
        return " undefined"
    return str(numerator / denominator)


# Useful
# Precision is undefined if nothing is predicted as useful (A+C==0),
# recall is undefined if the test set contains no useful review (A+B==0).
print("Precision(Useful):" + _ratioText(A, A + C))
print("Recall(Useful):" + _ratioText(A, A + B))

# Useless
# Precision is undefined if nothing is predicted as useless (B+D==0),
# recall is undefined if the test set contains no useless review (C+D==0).
print("Precision(Useless):" + _ratioText(D, B + D))
print("Recall(Useless):" + _ratioText(D, C + D))

Precision(Useful):0.6372588915113535
Recall(Useful):0.9980031439860645
Precision(Useless):0.6897689768976898
Recall(Useless):0.007754814292605098
