In [1]:
import sparknlp
import logging
# Keep a handle on the session returned by Spark NLP: later cells call
# `spark.read` and `spark.createDataFrame`, and no other visible cell assigns `spark`.
spark = sparknlp.start()
import numpy as np

from sparknlp import *
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.pretrained import PretrainedPipeline

from pyspark.sql import SparkSession
from pyspark.sql.functions import when, col, udf
from pyspark.sql.types import *

from pyspark.ml import Pipeline
from pyspark.ml.feature import *
from pyspark.ml.regression import *
from pyspark.ml.classification import *
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

from sklearn.ensemble import VotingClassifier



In [2]:
# GCS location of the persisted Naive Bayes model saved under NB_apparel.
model_path = "gs://msca-bdp-student-gcs/Group4_Project_Data/models/NB_apparel"
# Restore the fitted model from that location.
apparel_model = NaiveBayesModel.load(path=model_path)

                                                                                

In [3]:
# GCS location of the persisted Naive Bayes model saved under NB_automotive.
model_path = "gs://msca-bdp-student-gcs/Group4_Project_Data/models/NB_automotive"
# Restore the fitted model from that location.
automotive_model = NaiveBayesModel.load(path=model_path)

                                                                                

In [4]:
# GCS location of the persisted Naive Bayes model saved under NB_beauty.
model_path = "gs://msca-bdp-student-gcs/Group4_Project_Data/models/NB_beauty"
# Restore the fitted model from that location.
beauty_model = NaiveBayesModel.load(path=model_path)

23/03/04 20:29:15 WARN org.apache.hadoop.util.concurrent.ExecutorHelper: Thread (Thread[GetFileInfo #1,5,main]) interrupted: 
java.lang.InterruptedException
	at com.google.common.util.concurrent.AbstractFuture.get(AbstractFuture.java:510)
	at com.google.common.util.concurrent.FluentFuture$TrustedFuture.get(FluentFuture.java:88)
	at org.apache.hadoop.util.concurrent.ExecutorHelper.logThrowableFromAfterExecute(ExecutorHelper.java:48)
	at org.apache.hadoop.util.concurrent.HadoopThreadPoolExecutor.afterExecute(HadoopThreadPoolExecutor.java:90)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1157)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:750)


In [6]:
# Read the furniture reviews file as tab-delimited CSV, taking column names
# from the header row.
test_furniture = (
    spark.read.format("csv")
    .option("header", "true")
    .option("delimiter", "\t")
    .load("gs://msca-bdp-student-gcs/Group4_Project_Data/amazon_reviews_us_Furniture_v1_00.tsv")
)

In [20]:
# Print the column names and types of the furniture reviews DataFrame.
# NOTE(review): the recorded output lists a `sentiment` column and an integer
# `star_rating`, which only exist after the later cast/label cell has run, so
# this cell appears to have been executed out of order. Confirm on a fresh kernel.
test_furniture.printSchema()

root
 |-- marketplace: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- review_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- product_parent: string (nullable = true)
 |-- product_title: string (nullable = true)
 |-- product_category: string (nullable = true)
 |-- star_rating: integer (nullable = true)
 |-- helpful_votes: string (nullable = true)
 |-- total_votes: string (nullable = true)
 |-- vine: string (nullable = true)
 |-- verified_purchase: string (nullable = true)
 |-- review_headline: string (nullable = true)
 |-- review_body: string (nullable = true)
 |-- review_date: string (nullable = true)
 |-- sentiment: string (nullable = false)



In [7]:
# Feature stages for the review text: split into words, drop stop words,
# hash the remaining terms into term-frequency vectors, then apply IDF weighting.
# NOTE(review): both `sparknlp.annotator` and `pyspark.ml.feature` are star-imported
# and each exports a `Tokenizer`; presumably the pyspark one wins because it is
# imported last. Confirm.
tokenizer = Tokenizer(
    inputCol="review_body",
    outputCol="review_body_words",
)
remover = StopWordsRemover(
    inputCol="review_body_words",
    outputCol="review_body_words_filtered",
)
hashingTF = HashingTF(
    inputCol="review_body_words_filtered",
    outputCol="hashingTF_features",
)
idf = IDF(
    inputCol="hashingTF_features",
    outputCol="idf_features",
)

# Turn the string `sentiment` column into a numeric `sentiment_label`.
labelIndexer = StringIndexer(
    inputCol="sentiment",
    outputCol="sentiment_label",
)

# Chain the stages in the order they must run.
pipeline = Pipeline(
    stages=[
        tokenizer,
        remover,
        hashingTF,
        idf,
        labelIndexer,
    ]
)

In [8]:
# Derive the binary `sentiment` label from the star rating: ratings of 3 or
# lower are labelled 'negative', everything else 'positive'.
#
# The rating is cast to int *before* dropping nulls. A rating that cannot be
# parsed becomes null after the cast; with the drop done first, such rows
# survived and fell through to the `otherwise("positive")` branch.
test_furniture = (
    test_furniture
    .withColumn("star_rating", col("star_rating").cast("int"))
    .dropna()
    .withColumn(
        "sentiment",
        when(col("star_rating") <= 3, "negative").otherwise("positive"),
    )
)

In [15]:
# Spot-check one review in the labelled data: votes, text and derived sentiment.
# The IDs contain no LIKE wildcards, so plain equality selects the same row.
(
    test_furniture
    .where((col("review_id") == "R270VVWBBRDTAX") & (col("product_id") == "B003VL31N8"))
    .select(
        "review_id",
        "product_id",
        "helpful_votes",
        "total_votes",
        "verified_purchase",
        "review_body",
        "sentiment",
    )
    .show(truncate=False)
)

                                                                                

+--------------+----------+-------------+-----------+-----------------+-----------------------------------------------------+---------+
|review_id     |product_id|helpful_votes|total_votes|verified_purchase|review_body                                          |sentiment|
+--------------+----------+-------------+-----------+-----------------+-----------------------------------------------------+---------+
|R270VVWBBRDTAX|B003VL31N8|0            |0          |Y                |exactly what I wanted and fairly easy to put together|negative |
+--------------+----------+-------------+-----------+-----------------+-----------------------------------------------------+---------+



In [9]:
# Fit the feature pipeline on the furniture reviews, apply it to the same data,
# and keep only the identifying columns plus the label and IDF feature vector.
# NOTE(review): the IDF weights and the label index are fitted here on the
# furniture data rather than taken from whatever pipeline produced the loaded
# Naive Bayes models. Confirm the features and label encoding are compatible.
result_furniture = (
    pipeline
    .fit(test_furniture)
    .transform(test_furniture)
    .select(
        "review_id",
        "review_body",
        "product_id",
        "product_title",
        "sentiment_label",
        "idf_features",
    )
)

                                                                                

In [16]:
# Look up the same review in the featurised data to see its indexed label.
# The IDs contain no LIKE wildcards, so plain equality selects the same row.
(
    result_furniture
    .where((col("review_id") == "R270VVWBBRDTAX") & (col("product_id") == "B003VL31N8"))
    .show()
)

23/03/04 20:38:10 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.1 MiB
23/03/04 20:38:12 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.1 MiB

+--------------+--------------------+----------+--------------------+---------------+--------------------+
|     review_id|         review_body|product_id|       product_title|sentiment_label|        idf_features|
+--------------+--------------------+----------+--------------------+---------------+--------------------+
|R270VVWBBRDTAX|exactly what I wa...|B003VL31N8|WOYBR TV-SW-TS 12...|            1.0|(262144,[51678,77...|
+--------------+--------------------+----------+--------------------+---------------+--------------------+



                                                                                

In [17]:
# Score the furniture features with the Naive Bayes model loaded from the
# NB_automotive path.
nb_predictions_automotive = automotive_model.transform(result_furniture)

In [19]:
# Show the automotive-model prediction for one specific review.
# The IDs contain no LIKE wildcards, so plain equality selects the same row.
(
    nb_predictions_automotive
    .where((col("review_id") == "R270VVWBBRDTAX") & (col("product_id") == "B003VL31N8"))
    .show()
)

23/03/04 20:48:50 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 8.1 MiB
23/03/04 20:48:53 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 8.1 MiB

+--------------+--------------------+----------+--------------------+---------------+--------------------+--------------------+--------------------+----------+
|     review_id|         review_body|product_id|       product_title|sentiment_label|        idf_features|       rawPrediction|         probability|prediction|
+--------------+--------------------+----------+--------------------+---------------+--------------------+--------------------+--------------------+----------+
|R270VVWBBRDTAX|exactly what I wa...|B003VL31N8|WOYBR TV-SW-TS 12...|            1.0|(262144,[51678,77...|[-126.42424673990...|[0.99996624965134...|       0.0|
+--------------+--------------------+----------+--------------------+---------------+--------------------+--------------------+--------------------+----------+



                                                                                

In [45]:
# List up to 30 rows where the predicted class differs from the indexed label,
# without truncating cell contents.
label_differs = col("sentiment_label") != col("prediction")
nb_predictions_automotive.where(label_differs).show(n=30, truncate=False)

23/03/04 07:04:01 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 8.1 MiB


+--------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [37]:
# Preview the first 30 rows of the automotive-model predictions.
nb_predictions_automotive.show(n=30)

23/03/04 06:40:23 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 8.1 MiB


+--------------+--------------------+----------+--------------------+---------------+--------------------+--------------------+--------------------+----------+
|     review_id|         review_body|product_id|       product_title|sentiment_label|        idf_features|       rawPrediction|         probability|prediction|
+--------------+--------------------+----------+--------------------+---------------+--------------------+--------------------+--------------------+----------+
|R3VR960AHLFKDV|This desk is very...|B004HB5E0E|Shoal Creek Compu...|            0.0|(262144,[34194,63...|[-351.64464552177...|[0.99999932324229...|       0.0|
|R16LGVMFKIUT0G|          Great item|B0042TNMMS|Dorel Home Produc...|            0.0|(262144,[245420,2...|[-33.810677314355...|[0.94501574520671...|       0.0|
|R1AIMEEPYHMOE4|Perfect fit for m...|B0030MPBZ4|Bathroom Vanity T...|            0.0|(262144,[3524,137...|[-289.21247830587...|[0.99999996967493...|       0.0|
|R1892CCSZWZ9SR|We use this on a ...|B00

In [40]:
# Show the automotive-model prediction for review R2JXCWADAWWAUJ.
# The ID contains no LIKE wildcards, so plain equality selects the same row.
(
    nb_predictions_automotive
    .where(col("review_id") == "R2JXCWADAWWAUJ")
    .show()
)

23/03/04 06:42:27 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 8.1 MiB
23/03/04 06:42:30 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 8.1 MiB

+--------------+--------------+----------+--------------------+---------------+--------------------+--------------------+--------------------+----------+
|     review_id|   review_body|product_id|       product_title|sentiment_label|        idf_features|       rawPrediction|         probability|prediction|
+--------------+--------------+----------+--------------------+---------------+--------------------+--------------------+--------------------+----------+
|R2JXCWADAWWAUJ|Saggy mattress|B00R6OX5MS|Sleep Master BiFo...|            1.0|(262144,[135048,2...|[-136.53052109610...|[0.99059898036372...|       0.0|
+--------------+--------------+----------+--------------------+---------------+--------------------+--------------------+--------------------+----------+



                                                                                

In [41]:
# Look up the same review in the labelled data to compare its star rating
# and derived sentiment with the prediction.
# The ID contains no LIKE wildcards, so plain equality selects the same row.
(
    test_furniture
    .where(col("review_id") == "R2JXCWADAWWAUJ")
    .select(
        "review_id",
        "product_id",
        "star_rating",
        "helpful_votes",
        "total_votes",
        "verified_purchase",
        "review_body",
        "sentiment",
    )
    .show()
)



+--------------+----------+-----------+-------------+-----------+-----------------+--------------+---------+
|     review_id|product_id|star_rating|helpful_votes|total_votes|verified_purchase|   review_body|sentiment|
+--------------+----------+-----------+-------------+-----------+-----------------+--------------+---------+
|R2JXCWADAWWAUJ|B00R6OX5MS|          1|            1|          4|                Y|Saggy mattress| negative|
+--------------+----------+-----------+-------------+-----------+-----------------+--------------+---------+



                                                                                

In [38]:
# Show the full, untruncated automotive-model prediction row for one review.
# The IDs contain no LIKE wildcards, so plain equality selects the same row.
(
    nb_predictions_automotive
    .where((col("product_id") == "B003HEPHUE") & (col("review_id") == "R3SMFDAWU2V153"))
    .show(truncate=False)
)

23/03/04 06:41:23 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 8.1 MiB
23/03/04 06:41:26 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 8.1 MiB

+--------------+-----------------------------------------------------------------------+----------+----------------------------+---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------+---------------------------------------+----------+
|review_id     |review_body                                                            |product_id|product_title               |sentiment_label|idf_features                                                                                                                                                                                |rawPrediction                           |probability                            |prediction|
+--------------+-----------------------------------------------------------------------+----------+----------------------------+---------------+----

                                                                                

In [16]:
# Score the furniture features with the Naive Bayes model loaded from the
# NB_apparel path.
nb_predictions_apparel = apparel_model.transform(result_furniture)

In [32]:
# Preview the first 20 rows of the apparel-model predictions.
nb_predictions_apparel.show(n=20)

23/03/04 06:34:18 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 8.1 MiB


+--------------+--------------------+----------+--------------------+---------------+--------------------+--------------------+--------------------+----------+
|     review_id|         review_body|product_id|       product_title|sentiment_label|        idf_features|       rawPrediction|         probability|prediction|
+--------------+--------------------+----------+--------------------+---------------+--------------------+--------------------+--------------------+----------+
|R3VR960AHLFKDV|This desk is very...|B004HB5E0E|Shoal Creek Compu...|            0.0|(262144,[34194,63...|[-344.84830744012...|[0.99999928057139...|       0.0|
|R16LGVMFKIUT0G|          Great item|B0042TNMMS|Dorel Home Produc...|            0.0|(262144,[245420,2...|[-34.246687107928...|[0.86196057237309...|       0.0|
|R1AIMEEPYHMOE4|Perfect fit for m...|B0030MPBZ4|Bathroom Vanity T...|            0.0|(262144,[3524,137...|[-268.18272503287...|[0.99998813289651...|       0.0|
|R1892CCSZWZ9SR|We use this on a ...|B00

In [33]:
# Show the full, untruncated apparel-model prediction row for one review.
# The IDs contain no LIKE wildcards, so plain equality selects the same row.
(
    nb_predictions_apparel
    .where((col("product_id") == "B00LI4RJQ0") & (col("review_id") == "R1GJC1BP028XO9"))
    .show(truncate=False)
)

23/03/04 06:35:12 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 8.1 MiB
23/03/04 06:35:14 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 8.1 MiB

+--------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+----------------------------------+---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                

In [31]:
# Look up one review in the labelled data to see its star rating and sentiment.
# The IDs contain no LIKE wildcards, so plain equality selects the same row.
(
    test_furniture
    .where((col("product_id") == "B005G02ESA") & (col("review_id") == "R1892CCSZWZ9SR"))
    .select(
        "review_id",
        "product_id",
        "star_rating",
        "helpful_votes",
        "total_votes",
        "verified_purchase",
        "review_body",
        "sentiment",
    )
    .show()
)



+--------------+----------+-----------+-------------+-----------+-----------------+--------------------+---------+
|     review_id|product_id|star_rating|helpful_votes|total_votes|verified_purchase|         review_body|sentiment|
+--------------+----------+-----------+-------------+-----------+-----------------+--------------------+---------+
|R1892CCSZWZ9SR|B005G02ESA|          3|            0|          0|                Y|We use this on a ...| negative|
+--------------+----------+-----------+-------------+-----------+-----------------+--------------------+---------+



                                                                                

In [24]:
# Narrow the labelled data to the reviews of a single product.
# The ID contains no LIKE wildcards, so plain equality selects the same rows.
check = test_furniture.where(col("product_id") == "B0042TNMMS")

In [26]:
# From that product's reviews, show one review's rating, votes and sentiment.
# The ID contains no LIKE wildcards, so plain equality selects the same row.
(
    check
    .where(col("review_id") == "R16LGVMFKIUT0G")
    .select(
        "review_id",
        "product_id",
        "star_rating",
        "helpful_votes",
        "total_votes",
        "verified_purchase",
        "review_body",
        "sentiment",
    )
    .show()
)



+--------------+----------+-----------+-------------+-----------+-----------------+-----------+---------+
|     review_id|product_id|star_rating|helpful_votes|total_votes|verified_purchase|review_body|sentiment|
+--------------+----------+-----------+-------------+-----------+-----------------+-----------+---------+
|R16LGVMFKIUT0G|B0042TNMMS|          5|            0|          0|                Y| Great item| positive|
+--------------+----------+-----------+-------------+-----------+-----------------+-----------+---------+



                                                                                

In [None]:
# Echo the apparel-model predictions DataFrame as the cell's result.
nb_predictions_apparel

In [None]:

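# Disabled: would compute and print the accuracy of the apparel-model
# predictions against `sentiment_label`. Uncomment to run.
# evaluator = MulticlassClassificationEvaluator(labelCol='sentiment_label', predictionCol="prediction", metricName="accuracy")
# nb_accuracy_apparel = evaluator.evaluate(nb_predictions_apparel)
# print("Accuracy = %g" % (nb_accuracy_apparel))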

In [None]:
# Create an empty DataFrame with the given schema.
# NOTE(review): `schema` is not defined in any cell visible here; unless it is
# provided elsewhere, this line raises NameError on a fresh kernel. Confirm
# which columns the evaluation table is meant to hold and define `schema` first.
evaluate_df = spark.createDataFrame([], schema)