In [1]:
# Import modules
from pyspark.ml import Pipeline
from pyspark.sql.types import IntegerType, FloatType
from pyspark.ml.feature import VectorAssembler, StringIndexer, IndexToString, SQLTransformer
from pyspark.ml.classification import *
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql.functions import udf
from pyspark.sql.types import LongType, BooleanType, DoubleType

In [2]:
# Define input path and constants
# Path of the reviews CSV file; the load cell below passes it to spark.read.load.
inputData = "/data/students/bigdata-01QYD/Lab9_DBD/Reviews.csv"

In [3]:
# Read Reviews.csv into a DataFrame.
# The first line of the file provides the column names, and the column types
# are inferred from the data rather than declared up front.
csvReader = (
    spark.read
    .format("csv")
    .option("header", True)
    .option("inferSchema", True)
)
reviews = csvReader.load(inputData)

In [5]:
#reviews.printSchema()
#reviews.show()

In [6]:
# Keep only the rated reviews, i.e. the records whose
# HelpfulnessDenominator is greater than zero.
reviewsWithVotes = reviews.where("HelpfulnessDenominator>0")

In [7]:
# Compute the value of column "label" for a rated review
def labelAttribute(HelpfulnessNumerator, HelpfulnessDenominator):
    """Return the class label of a review from its helpfulness votes.

    The helpfulness index is HelpfulnessNumerator / HelpfulnessDenominator.

    Args:
        HelpfulnessNumerator: numerator of the helpfulness index.
        HelpfulnessDenominator: denominator of the helpfulness index.

    Returns:
        1.0 when the helpfulness index is strictly above 0.9, 0.0 otherwise.
        None when the index cannot be computed (missing numerator, or a
        missing or zero denominator), so that a single bad record yields a
        NULL label instead of raising inside the UDF.
    """
    # A Python UDF receives None for SQL NULL; dividing by None or by zero
    # would raise and abort the whole job.
    if HelpfulnessNumerator is None or not HelpfulnessDenominator:
        return None

    helpfulnessIndex = HelpfulnessNumerator / HelpfulnessDenominator
    return 1.0 if helpfulnessIndex > 0.9 else 0.0
    
    
# Make labelAttribute callable from Spark SQL expressions under the same name,
# declaring its return type as double.
spark.udf.register(name="labelAttribute", f=labelAttribute, returnType=DoubleType())

<function __main__.labelAttribute(HelpfulnessNumerator, HelpfulnessDenominator)>

In [8]:
# Add the class attribute as a new column named "label".
# A review belongs to the "useful" class when its helpfulness index is
# above 90% (0.9); labelAttribute computes the label from the two vote columns.
labelExpression = "labelAttribute(HelpfulnessNumerator, HelpfulnessDenominator) as label"
reviewsLabelWithVotes = reviewsWithVotes.selectExpr("*", labelExpression)

In [10]:
#reviewsLabelWithVotes.printSchema()
#reviewsLabelWithVotes.select("HelpfulnessNumerator", "HelpfulnessDenominator", "label").show()

In [11]:
# Randomly split the labelled reviews: 75% for training, 25% for testing.
# The seed is fixed so that the split is the same on every run.
reviews_train, reviews_test = reviewsLabelWithVotes.randomSplit(
    weights=[0.75, 0.25],
    seed=10,
)

In [12]:
# Define the preprocessing steps, the classification algorithm, and the pipeline
# that is trained on reviews_train and then applied to reviews_test.
# First solution: the feature vector contains a single value, the length of the review text.

In [16]:
# Define the UDF that is used to compute the text length of each review.
# A Python UDF receives None for a NULL column value, and len(None) raises a
# TypeError that aborts the job, so a missing text is given length 0.
spark.udf.register(
    "lenText",
    lambda text: len(text) if text is not None else 0,
    IntegerType(),
)

# Define an SQLTransformer that keeps every input column and adds the
# column "len", which holds the length of the review text.
sqlTrans = SQLTransformer(statement="""SELECT *, 
lenText(text) AS len
FROM __THIS__""")

In [18]:
# In this simple solution the feature vector is built from a single
# input column: "len", the length of the review text.
assembler = VectorAssembler().setInputCols(["len"]).setOutputCol("features")

In [19]:
# Logistic regression classifier. The column names are the library defaults,
# written out so that the link with the assembler output ("features") and
# the class column ("label") is visible.
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label",
    predictionCol="prediction",
)

In [20]:
# Chain the three stages into one pipeline: compute the text length,
# assemble the feature vector, then train the logistic regression model.
pipelineStages = [sqlTrans, assembler, lr]
pipeline = Pipeline(stages=pipelineStages)

In [21]:
# Fit/Train the model
# Fits the pipeline defined above (sqlTrans, assembler, lr) on the training split.
model = pipeline.fit(reviews_train)

In [22]:
# Apply the model on the test set
# The result is cached because the cells below run several separate
# evaluations and counts on the same predictions.
predictions = model.transform(reviews_test).cache()

In [23]:
# Evaluate the predictions with four metrics:
# accuracy, F1, weighted recall and weighted precision.
# All evaluators compare the same pair of columns and differ only in the metric.
evaluatorColumns = dict(labelCol="label", predictionCol="prediction")

evaluatorAcc = MulticlassClassificationEvaluator(metricName="accuracy", **evaluatorColumns)
evaluatorF1 = MulticlassClassificationEvaluator(metricName="f1", **evaluatorColumns)
evaluatorRecall = MulticlassClassificationEvaluator(metricName="weightedRecall", **evaluatorColumns)
evaluatorPrecision = MulticlassClassificationEvaluator(metricName="weightedPrecision", **evaluatorColumns)

for metricTitle, metricEvaluator in [
    ("Accuracy:", evaluatorAcc),
    ("F1:", evaluatorF1),
    ("Weighted Recall:", evaluatorRecall),
    ("Weighted Precision:", evaluatorPrecision),
]:
    print(metricTitle, metricEvaluator.evaluate(predictions))

Accuracy: 0.6326024219866764
F1: 0.4902428404049922
Weighted Recall: 0.6326024219866764
Weighted Precision: 0.40018582430340893


In [24]:
#  Compute the confusion matrix
#                     Predicted
#  Actual       Useful   Useless
#  Useful          A        B
#  Useless         C        D
#
# A single grouped aggregation returns the count of every (label, prediction)
# combination, so the predictions are aggregated once instead of running one
# filter + count job per cell of the matrix.
cellCounts = {
    (row["label"], row["prediction"]): row["count"]
    for row in predictions.groupBy("label", "prediction").count().collect()
}

# A combination that never occurs is absent from the result, hence the default 0.
A = cellCounts.get((1.0, 1.0), 0)  # actual useful,  predicted useful
B = cellCounts.get((1.0, 0.0), 0)  # actual useful,  predicted useless
C = cellCounts.get((0.0, 1.0), 0)  # actual useless, predicted useful
D = cellCounts.get((0.0, 0.0), 0)  # actual useless, predicted useless

print("                       Predicted")
print("  Actual \t Useful\tUseless")
print("  Useful \t "+str(A)+ "\t\t"+str(B))
print("  Useless \t "+str(C)+ "\t\t"+str(D))

                       Predicted
  Actual 	 Useful	Useless
  Useful 	 46910		0
  Useless 	 27244		0


In [26]:
# Precision and recall for the two classes, computed from the confusion
# matrix counts A, B, C and D.
def _printRatio(metricName, numerator, denominator):
    """Print metricName followed by numerator/denominator.

    Prints "<metricName>: undefined" instead of dividing when the denominator
    is zero, so that an empty row or column of the confusion matrix does not
    raise ZeroDivisionError.
    """
    if denominator == 0:
        print(metricName + ": undefined")
    else:
        print(metricName + ":" + str(numerator / denominator))


# Useful
_printRatio("Precision(Useful)", A, A + C)
_printRatio("Recall(Useful)", A, A + B)

# Useless
_printRatio("Precision(Useless)", D, B + D)
_printRatio("Recall(Useless)", D, C + D)

Precision(Useful):0.6326024219866764
Recall(Useful):1.0
Precision(Useless): undefined
Recall(Useless):0.0
