In [73]:
# Import modules
from pyspark.ml import Pipeline
from pyspark.sql.types import IntegerType, FloatType
from pyspark.ml.feature import VectorAssembler, StringIndexer, IndexToString, SQLTransformer
from pyspark.ml.classification import *
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql.functions import udf
from pyspark.sql.types import LongType, BooleanType, DoubleType

In [74]:
# Define input path and constants
inputData = "/data/students/bigdata-01QYD/Lab9_DBD/Reviews.csv"

In [75]:
# Load the data
# Create a DataFrame from Reviews.csv
reviews = spark.read.load(inputData,\
                     format="csv",\
                     header=True,\
                     inferSchema=True)

In [76]:
reviews.printSchema()
#reviews.show()

root
 |-- Id: integer (nullable = true)
 |-- ProductId: string (nullable = true)
 |-- UserId: string (nullable = true)
 |-- ProfileName: string (nullable = true)
 |-- HelpfulnessNumerator: integer (nullable = true)
 |-- HelpfulnessDenominator: integer (nullable = true)
 |-- Score: integer (nullable = true)
 |-- Time: integer (nullable = true)
 |-- Summary: string (nullable = true)
 |-- Text: string (nullable = true)



In [77]:
# Select only the records with HelpfulnessDenominator>0 (i.e., rated reviews)
reviewsWithVotes = reviews.filter("HelpfulnessDenominator>0")

In [78]:
# Create and compute the value of Column label for the selected rated reviews
def labelAttribute(HelpfulnessNumerator, HelpfulnessDenominator):
    if HelpfulnessNumerator/HelpfulnessDenominator>0.9:
        return 1.0
    else:
        return 0.0
    
    
spark.udf.register("labelAttribute", labelAttribute, DoubleType())    

<function __main__.labelAttribute(HelpfulnessNumerator, HelpfulnessDenominator)>

In [79]:
# Create the class attribute
# For this task, a review belongs to the “useful” class if its helpfulness index is above 90% (0.9).
reviewsLabelWithVotes = reviewsWithVotes\
.selectExpr("*", "labelAttribute(HelpfulnessNumerator, HelpfulnessDenominator) as label")

In [80]:
#reviewsLabelWithVotes.printSchema()
#reviewsLabelWithVotes.select("HelpfulnessNumerator", "HelpfulnessDenominator", "label").show()

In [81]:
# Split the dataframe with Column label in training and test set
(reviews_train, reviews_test) = reviewsLabelWithVotes.randomSplit([0.75,0.25], seed=10)

In [82]:
# Create/Define the preprocessing steps and the classification algorithm you want to use 
# and the content of the pipeline that is used to train the model on reviews_train and apply it on reviews_test

In [83]:
# In this solution we decided to use
# - The length of text
# - The number of words in text
# - The length of summary
# - The number of words in summary
# - The number of ! appearing in text
# - The number of ! appearing in  summary
# - The score assigned to the reviewed item in this review

In [84]:
# Define the function that is used to compute "The length of text"
spark.udf.register("lenText", lambda text: len(text), IntegerType())

<function __main__.<lambda>(text)>

In [85]:
# Define the function that is used to compute "The number of words in text"
spark.udf.register("numWordsText", lambda text: len(text.split(" ")), IntegerType())

<function __main__.<lambda>(text)>

In [86]:
# Define the function that is used to compute "The length of summary"
spark.udf.register("lenSummary", lambda summary: len(summary), IntegerType())

<function __main__.<lambda>(summary)>

In [87]:
# Define the function that is used to compute "The number of words in summary"
spark.udf.register("numWordsSummary", lambda summary: len(summary.split(" ")), IntegerType())

<function __main__.<lambda>(summary)>

In [88]:
# Define the function that is used to compute "The number of ! appearing in text"
spark.udf.register("numExclMark", lambda text: len(text.split("!")), IntegerType())

<function __main__.<lambda>(text)>

In [89]:
# Define the function that is used to compute "The number of ! appearing in summary"
spark.udf.register("numExclMarkSummary", lambda summary: len(summary.split("!")), IntegerType())

<function __main__.<lambda>(summary)>

In [90]:
# Define an SQLTranformer to create the columns we are interested in
sqlTrans = SQLTransformer(statement="""SELECT *, 
lenText(text) AS len,
numWordsText(text) AS numWords,
lenSummary(summary) AS lenS,
numWordsSummary(summary) AS numWordsS,
numExclMark(text) AS numExclMarks,
numExclMarkSummary(summary) AS numExclMarksS
FROM __THIS__""")

In [91]:
# For this simple solution features contain only len(Text)
assembler = VectorAssembler(inputCols=["len", "numWords", "lenS", "numWordsS",
                                      "numExclMarks", "numExclMarksS",
                                      "Score"], outputCol="features")

In [92]:
# Create a classification model based on the logistic regression algorithm
lr = LogisticRegression()

In [93]:
# Define the pipeline that is used to create the logistic regression
# model on the training data.
pipeline = Pipeline().setStages([sqlTrans, assembler,lr])

In [94]:
# Fit/Train the model
model = pipeline.fit(reviews_train)

In [95]:
# Apply the model on the test set
predictions = model.transform(reviews_test).cache()

In [96]:
# Compute statistics
# Accuracy, F1, weighted recall, weighted precision
evaluatorAcc = MulticlassClassificationEvaluator(labelCol="label" , predictionCol= "prediction", metricName = "accuracy")
evaluatorF1 = MulticlassClassificationEvaluator(labelCol="label" , predictionCol= "prediction", metricName = "f1")
evaluatorRecall = MulticlassClassificationEvaluator(labelCol="label" , predictionCol= "prediction", metricName = "weightedRecall")
evaluatorPrecision = MulticlassClassificationEvaluator(labelCol="label" , predictionCol= "prediction", metricName = "weightedPrecision")

print("Accuracy:", evaluatorAcc.evaluate(predictions))
print("F1:", evaluatorF1.evaluate(predictions))
print("Weighted Recall:", evaluatorRecall.evaluate(predictions))
print("Weighted Precision:", evaluatorPrecision.evaluate(predictions))

Accuracy: 0.7229953879763735
F1: 0.7076064281627216
Weighted Recall: 0.7229953879763735
Weighted Precision: 0.7167174875050183


In [97]:
#  Compute the confusion matrix
#                     Predicted  
#  Actual       Useful   Useless
#  Useful          A        B
#  Useless          C        D

A = predictions.filter("prediction=1 and label=1").count()
B = predictions.filter("prediction=0 and label=1").count()
C = predictions.filter("prediction=1 and label=0").count()
D = predictions.filter("prediction=0 and label=0").count()

print("                       Predicted")
print("  Actual \t Useful\tUseless")
print("  Useful \t "+str(A)+ "\t\t"+str(B))
print("  Useless \t "+str(C)+ "\t\t"+str(D))

                       Predicted
  Actual 	 Useful	Useless
  Useful 	 41162		5748
  Useless 	 14793		12451


In [98]:
# Precision and recall for the two classes
# Useful
if A+C==0:
    print("Precision(Useful): undefined")
else:
    print("Precision(Useful):"+str(A/(A+C)))
    
    
print("Recall(Useful):"+str(A/(A+B)))

# Useless 
if B+D==0:
    print("Precision(Useless): undefined")
else:
    print("Precision(Useless):"+str(D/(B+D)))
    
print("Recall(Useless):"+str(D/(C+D)))

Precision(Useful):0.7356268429988384
Recall(Useful):0.877467490940098
Precision(Useless):0.6841584702456179
Recall(Useless):0.45701805902217
