In [91]:
#http://jmcauley.ucsd.edu/data/amazon/

#spark-submit pythonfile.py
!pip install pyspark
#from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import VectorAssembler, Word2Vec
from pyspark.ml.classification import NaiveBayes, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.types import IntegerType, StringType
from pyspark.sql.functions import length,rand, udf
import re

!pip install Unidecode
import unidecode
spark=SparkSession.builder.appName('trabalho_disciplina').getOrCreate()
sc = spark.sparkContext



In [92]:
def remove_punctuations(texto):
  texto2 = ''.join(re.findall('\d{1}\.\d{1}|\d{1}\,\d{1}|\w|\s', str(texto)))
  texto3 = unidecode.unidecode(texto2)
  return texto3

In [93]:
df=spark.read.json('amazon.json')
df.show(5)

+----------+-----+-------+--------------------+----------+-------------+------------+--------------------+----------+--------------+--------+----+
|      asin|image|overall|          reviewText|reviewTime|   reviewerID|reviewerName|               style|   summary|unixReviewTime|verified|vote|
+----------+-----+-------+--------------------+----------+-------------+------------+--------------------+----------+--------------+--------+----+
|B000K2PJ4K| null|    5.0|Great product and...|09 4, 2015|ALJ66O1Y6SLHA|    Tonya B.|[ Blue/Orange,,  ...|Five Stars|    1441324800|    true|null|
|B000K2PJ4K| null|    5.0|Great product and...|09 4, 2015|ALJ66O1Y6SLHA|    Tonya B.|[ Black (37467610...|Five Stars|    1441324800|    true|null|
|B000K2PJ4K| null|    5.0|Great product and...|09 4, 2015|ALJ66O1Y6SLHA|    Tonya B.|[ Blue/Gray Logo,...|Five Stars|    1441324800|    true|null|
|B000K2PJ4K| null|    5.0|Great product and...|09 4, 2015|ALJ66O1Y6SLHA|    Tonya B.|[ Blue (37867638-...|Five Stars| 

In [94]:
print(df.count())
df.groupBy('overall').count().show()

3176
+-------+-----+
|overall|count|
+-------+-----+
|    1.0|  117|
|    4.0|  471|
|    3.0|  337|
|    2.0|   93|
|    5.0| 2158|
+-------+-----+



In [95]:
punct_udf=udf(remove_punctuations,StringType())
df2 = df.withColumn('punct',punct_udf(df['reviewText']))

In [96]:
text_df=df2.withColumn('length',length(df2['punct']))
text_df.groupBy('overall').agg({'Length':'mean'}).show()

+-------+------------------+
|overall|       avg(Length)|
+-------+------------------+
|    1.0|115.42735042735043|
|    4.0|170.87898089171975|
|    3.0| 172.8278931750742|
|    2.0|156.74193548387098|
|    5.0|109.66218721037998|
+-------+------------------+



In [97]:
tokenization = Tokenizer(inputCol='punct',outputCol='tokens')
tokenized_df = tokenization.transform(df2)
tokenized_df.show(4,False)

+----------+-----+-------+------------------------+----------+-------------+------------+--------------------------------------------+----------+--------------+--------+----+-----------------------+----------------------------+
|asin      |image|overall|reviewText              |reviewTime|reviewerID   |reviewerName|style                                       |summary   |unixReviewTime|verified|vote|punct                  |tokens                      |
+----------+-----+-------+------------------------+----------+-------------+------------+--------------------------------------------+----------+--------------+--------+----+-----------------------+----------------------------+
|B000K2PJ4K|null |5.0    |Great product and price!|09 4, 2015|ALJ66O1Y6SLHA|Tonya B.    |[ Blue/Orange,,  Big Boys,]                 |Five Stars|1441324800    |true    |null|Great product and price|[great, product, and, price]|
|B000K2PJ4K|null |5.0    |Great product and price!|09 4, 2015|ALJ66O1Y6SLHA|Tonya B.    

In [98]:
from pyspark.ml.feature import StopWordsRemover

stopword_removal=StopWordsRemover(inputCol='tokens', outputCol='refined_tokens')
refined_df=stopword_removal.transform(tokenized_df)
refined_df.select('overall','refined_tokens').show(4,False)

+-------+-----------------------+
|overall|refined_tokens         |
+-------+-----------------------+
|5.0    |[great, product, price]|
|5.0    |[great, product, price]|
|5.0    |[great, product, price]|
|5.0    |[great, product, price]|
+-------+-----------------------+
only showing top 4 rows



In [99]:
'''
from pyspark.ml.feature import CountVectorizer

count_vec=CountVectorizer(inputCol='refined_tokens', outputCol='features')
cv_df = count_vec.fit(refined_df).transform(refined_df)
cv_df.select(['refined_tokens','features']).show(4,False)
'''

"\nfrom pyspark.ml.feature import CountVectorizer\n\ncount_vec=CountVectorizer(inputCol='refined_tokens', outputCol='features')\ncv_df = count_vec.fit(refined_df).transform(refined_df)\ncv_df.select(['refined_tokens','features']).show(4,False)\n"

In [100]:
from pyspark.ml.feature import HashingTF,IDF
hashing_vec=HashingTF(inputCol='refined_tokens',outputCol='tf_features')
hashing_df=hashing_vec.transform(refined_df)
hashing_df.select(['refined_tokens','tf_features']).show(4,False)

+-----------------------+--------------------------------------------+
|refined_tokens         |tf_features                                 |
+-----------------------+--------------------------------------------+
|[great, product, price]|(262144,[52879,201386,261870],[1.0,1.0,1.0])|
|[great, product, price]|(262144,[52879,201386,261870],[1.0,1.0,1.0])|
|[great, product, price]|(262144,[52879,201386,261870],[1.0,1.0,1.0])|
|[great, product, price]|(262144,[52879,201386,261870],[1.0,1.0,1.0])|
+-----------------------+--------------------------------------------+
only showing top 4 rows



In [101]:
word2Vec = Word2Vec(vectorSize=30, seed=42, inputCol="refined_tokens", outputCol="tf_features")
model = word2Vec.fit(refined_df)
model.getVectors().show(4)

+----------+--------------------+
|      word|              vector|
+----------+--------------------+
|purchasing|[-0.0202325936406...|
|     looks|[-0.0779481828212...|
|      used|[-0.1602054089307...|
|   aerobic|[-0.0112498449161...|
+----------+--------------------+
only showing top 4 rows



In [102]:
word2vec_df = model.transform(refined_df)
word2vec_df.select(['refined_tokens','tf_features']).show(4,False)

+-----------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|refined_tokens         |tf_features                                                                                                                                                                                                                                                                                                                        

In [103]:
df_assembler = VectorAssembler(inputCols=['tf_features'],outputCol='features')

model_text_df = df_assembler.transform(hashing_df)
model_text_df2 = df_assembler.transform(word2vec_df)

to_model1 = model_text_df.withColumn('overall_int',model_text_df['overall'].cast(IntegerType()))
to_model2 = model_text_df2.withColumn('overall_int',model_text_df2['overall'].cast(IntegerType()))

training_df1, test_df1 = to_model1.randomSplit([0.75,0.25])
training_df2, test_df2 = to_model2.randomSplit([0.75,0.25])

In [104]:
NB_clf = NaiveBayes(labelCol='overall_int').fit(training_df1)
RF_clf = RandomForestClassifier(numTrees=10, labelCol="overall_int").fit(training_df2)

NB_predictions = NB_clf.transform(test_df1)
RF_predictions = RF_clf.transform(test_df2)

In [105]:
NB_accuracy = MulticlassClassificationEvaluator(labelCol='overall',metricName='accuracy').evaluate(NB_predictions)
print('The accuracy of NB on test data is {0:.0%}'.format(NB_accuracy))
NB_accuracy = MulticlassClassificationEvaluator(labelCol='overall',metricName='weightedPrecision').evaluate(NB_predictions)
print('The precision rate of NB on test data is {0:.0%}'.format(NB_accuracy))

The accuracy of NB on test data is 12%
The precision rate of NB on test data is 3%


In [106]:
RF_accuracy = MulticlassClassificationEvaluator(labelCol='overall',metricName='accuracy').evaluate(RF_predictions)
print('The accuracy of RF on test data is {0:.0%}'.format(RF_accuracy))
RF_accuracy = MulticlassClassificationEvaluator(labelCol='overall',metricName='weightedPrecision').evaluate(RF_predictions)
print('The precision rate of RF on test data is {0:.0%}'.format(RF_accuracy))

The accuracy of RF on test data is 77%
The precision rate of RF on test data is 78%
